fix: simplify embedding server process management

- Remove start_new_session=True to fix signal handling issues
- Simplify termination logic to use standard SIGTERM/SIGKILL
- Remove complex process group management that could cause hangs
- Add timeout-based cleanup to prevent CI hangs while ensuring proper resource cleanup
- Give graceful shutdown more time (5s) since we fixed the server shutdown logic
- Remove unused signal import

This addresses the remaining process management issues that could
cause startup failures and hanging during termination.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Andy Lee
2025-08-13 11:44:32 -07:00
parent e26d6d9d14
commit 24676970eb

View File

@@ -1,7 +1,6 @@
import atexit
import logging
import os
import signal
import socket
import subprocess
import sys
@@ -316,13 +315,12 @@ class EmbeddingServerManager:
stdout_target = None # Direct to console for visible logs
stderr_target = None # Direct to console for visible logs
# IMPORTANT: Use a new session so we can manage the whole process group reliably
# Start embedding server subprocess
self.server_process = subprocess.Popen(
command,
cwd=project_root,
stdout=stdout_target,
stderr=stderr_target,
start_new_session=True,
)
self.server_port = port
logger.info(f"Server process started with PID: {self.server_process.pid}")
@@ -364,26 +362,18 @@ class EmbeddingServerManager:
logger.info(
f"Terminating server process (PID: {self.server_process.pid}) for backend {self.backend_module_name}..."
)
# Try terminating the whole process group first (POSIX)
try:
pgid = os.getpgid(self.server_process.pid)
os.killpg(pgid, signal.SIGTERM)
except Exception:
# Fallback to terminating just the process
self.server_process.terminate()
# Use simple termination - our improved server shutdown should handle this properly
self.server_process.terminate()
try:
self.server_process.wait(timeout=3)
logger.info(f"Server process {self.server_process.pid} terminated.")
self.server_process.wait(timeout=5) # Give more time for graceful shutdown
logger.info(f"Server process {self.server_process.pid} terminated gracefully.")
except subprocess.TimeoutExpired:
logger.warning(
f"Server process {self.server_process.pid} did not terminate gracefully within 3 seconds, killing it."
f"Server process {self.server_process.pid} did not terminate within 5 seconds, force killing..."
)
try:
pgid = os.getpgid(self.server_process.pid)
os.killpg(pgid, signal.SIGKILL)
except Exception:
self.server_process.kill()
self.server_process.kill()
try:
self.server_process.wait(timeout=2)
logger.info(f"Server process {self.server_process.pid} killed successfully.")
@@ -391,12 +381,20 @@ class EmbeddingServerManager:
logger.error(
f"Failed to kill server process {self.server_process.pid} - it may be hung"
)
# Don't hang indefinitely
# Clean up process resources without waiting
# The process should already be terminated/killed above
# Don't wait here as it can hang CI indefinitely
self.server_process = None
# Clean up process resources with timeout to avoid CI hang
try:
# Use shorter timeout in CI environments
is_ci = os.environ.get("CI") == "true"
timeout = 3 if is_ci else 10
self.server_process.wait(timeout=timeout)
logger.info(f"Server process {self.server_process.pid} cleanup completed")
except subprocess.TimeoutExpired:
logger.warning(f"Process cleanup timeout after {timeout}s, proceeding anyway")
except Exception as e:
logger.warning(f"Error during process cleanup: {e}")
finally:
self.server_process = None
def _launch_server_process_colab(self, command: list, port: int) -> None:
"""Launch the server process with Colab-specific settings."""