From 317d9e9ed79d8014b4b3ca56c372b9bc4d8cc484 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Wed, 13 Aug 2025 16:59:30 -0700 Subject: [PATCH] chore(ci): remove unused pytest wrapper and debug runner --- scripts/ci_debug_pytest.py | 160 ---------------------------- scripts/ci_pytest_wrapper.py | 199 ----------------------------------- 2 files changed, 359 deletions(-) delete mode 100644 scripts/ci_debug_pytest.py delete mode 100755 scripts/ci_pytest_wrapper.py diff --git a/scripts/ci_debug_pytest.py b/scripts/ci_debug_pytest.py deleted file mode 100644 index eb0d8f2..0000000 --- a/scripts/ci_debug_pytest.py +++ /dev/null @@ -1,160 +0,0 @@ -import faulthandler -import os -import signal -import subprocess -import sys -import threading -import time -import traceback - - -def setup_hang_detection() -> None: - """Setup signal handlers and periodic dumps to help debug hangs in CI. - - - Enables faulthandler to dump Python stack traces on fatal signals - - Installs handlers for SIGUSR1/2 to dump all thread stacks on demand - - Starts a background thread that periodically dumps stacks - """ - # Enable faulthandler for automatic stack dumps - faulthandler.enable() - - def dump_all_stacks(signum, frame): # type: ignore[no-redef] - print(f"\n๐Ÿ”ฅ [HANG DEBUG] SIGNAL {signum} - DUMPING ALL THREAD STACKS:") - faulthandler.dump_traceback() - # Also dump current frames manually for completeness - for thread_id, thread_frame in sys._current_frames().items(): - print(f"\n๐Ÿ“ Thread {thread_id}:") - traceback.print_stack(thread_frame) - - def periodic_stack_dump() -> None: - """Periodically dump stacks to catch where the process is stuck.""" - start_time = time.time() - - while True: - time.sleep(120) # Check every 2 minutes - elapsed = time.time() - start_time - - print(f"\nโฐ [HANG DEBUG] Periodic check at {elapsed:.1f}s elapsed:") - - # Check for hanging processes and dump stacks - try: - import subprocess - - # Check for embedding servers that might be hanging - result = subprocess.run( - ["pgrep", "-f", "embedding_server"], capture_output=True, text=True, timeout=5 - ) - if result.stdout.strip(): - print( - f"๐Ÿ“ [HANG DEBUG] Found embedding server processes: {result.stdout.strip()}" - ) - - # Check for zmq processes - result = subprocess.run( - ["pgrep", "-f", "zmq"], capture_output=True, text=True, timeout=5 - ) - if result.stdout.strip(): - print(f"๐Ÿ“ [HANG DEBUG] Found zmq processes: {result.stdout.strip()}") - - except Exception as e: - print(f"๐Ÿ“ [HANG DEBUG] Process check failed: {e}") - - # Dump thread stacks every 4 minutes - if elapsed > 240 and int(elapsed) % 240 < 120: - print(f"\nโš ๏ธ [HANG DEBUG] Stack dump at {elapsed:.1f}s:") - for thread_id, thread_frame in sys._current_frames().items(): - print(f"\n๐Ÿ“ Thread {thread_id}:") - traceback.print_stack(thread_frame) - - # Emergency exit after 8 minutes (should be handled by wrapper timeout) - if elapsed > 480: - print( - f"\n๐Ÿ’ฅ [HANG DEBUG] Emergency exit after {elapsed:.1f}s - pytest taking too long!" - ) - faulthandler.dump_traceback() - # Try to cleanup before exit - try: - import subprocess - - subprocess.run(["pkill", "-9", "-f", "embedding_server"], timeout=2) - subprocess.run(["pkill", "-9", "-f", "zmq"], timeout=2) - except Exception: - pass - import os - - os._exit(124) # Force exit with timeout code - - # Register signal handlers for external debugging - signal.signal(signal.SIGUSR1, dump_all_stacks) - signal.signal(signal.SIGUSR2, dump_all_stacks) - - # Start periodic dumping thread - dump_thread = threading.Thread(target=periodic_stack_dump, daemon=True) - dump_thread.start() - - -def main(argv: list[str]) -> int: - setup_hang_detection() - # Re-exec pytest with debugging enabled - # Use Popen for better control over the subprocess - print(f"๐Ÿš€ [DEBUG] Starting pytest with args: {argv}") - - try: - # Use Popen for non-blocking execution - process = subprocess.Popen( - [sys.executable, "-m", "pytest", *argv], - stdout=sys.stdout, - stderr=sys.stderr, - # Use separate process group to avoid signal inheritance issues - preexec_fn=os.setsid if hasattr(os, "setsid") else None, - ) - - # Monitor the process with a reasonable timeout - start_time = time.time() - timeout = 600 # 10 minutes - poll_interval = 5 # seconds - - while True: - # Check if process has completed - return_code = process.poll() - if return_code is not None: - print(f"โœ… [DEBUG] Pytest completed with return code: {return_code}") - return return_code - - # Check for timeout - elapsed = time.time() - start_time - if elapsed > timeout: - print(f"๐Ÿ’ฅ [DEBUG] Pytest timed out after {elapsed:.1f}s, terminating...") - try: - # Try graceful termination first - process.terminate() - try: - process.wait(timeout=10) - except subprocess.TimeoutExpired: - # Force kill if still running - process.kill() - process.wait() - - # Cleanup any remaining processes - subprocess.run(["pkill", "-9", "-f", "pytest"], timeout=5) - subprocess.run(["pkill", "-9", "-f", "embedding_server"], timeout=5) - except Exception: - pass - return 124 # timeout exit code - - # Wait before next check - time.sleep(poll_interval) - - except Exception as e: - print(f"๐Ÿ’ฅ [DEBUG] Error running pytest: {e}") - # Cleanup on error - try: - subprocess.run(["pkill", "-9", "-f", "pytest"], timeout=5) - subprocess.run(["pkill", "-9", "-f", "embedding_server"], timeout=5) - except Exception: - pass - return 1 - - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) diff --git a/scripts/ci_pytest_wrapper.py b/scripts/ci_pytest_wrapper.py deleted file mode 100755 index 53514f5..0000000 --- a/scripts/ci_pytest_wrapper.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python3 -""" -CI pytest wrapper with comprehensive hang detection and cleanup. -Designed to prevent CI hangs due to subprocess or cleanup issues. -""" - -import os -import signal -import subprocess -import sys -import time - - -def cleanup_all_processes(): - """Aggressively cleanup all related processes.""" - print("๐Ÿงน [CLEANUP] Performing aggressive cleanup...") - - # Kill by pattern - use separate calls to avoid shell injection - # Avoid killing ourselves by being more specific - patterns = [ - "embedding_server", - "hnsw_embedding", - "zmq", - ] - - for pattern in patterns: - try: - subprocess.run(["pkill", "-9", "-f", pattern], timeout=5, capture_output=True) - except Exception: - pass - - # Clean up specific pytest processes but NOT the wrapper itself - try: - result = subprocess.run(["ps", "aux"], capture_output=True, text=True, timeout=5) - lines = result.stdout.split("\n") - current_pid = str(os.getpid()) - - for line in lines: - # Skip our own process - if current_pid in line: - continue - # Only kill actual pytest processes, not wrapper processes - if ( - "python" in line - and "pytest" in line - and "ci_pytest_wrapper.py" not in line - and "ci_debug_pytest.py" not in line - ): - try: - pid = line.split()[1] - subprocess.run(["kill", "-9", pid], timeout=2) - except Exception: - pass - except Exception: - pass - - print("๐Ÿงน [CLEANUP] Cleanup completed") - - -def run_pytest_with_monitoring(pytest_args): - """Run pytest with comprehensive monitoring and timeout handling.""" - - # Pre-test cleanup - print("๐Ÿงน [WRAPPER] Pre-test cleanup...") - cleanup_all_processes() - time.sleep(2) - - # Show pre-test state - print("๐Ÿ“Š [WRAPPER] Pre-test process state:") - try: - result = subprocess.run(["ps", "aux"], capture_output=True, text=True, timeout=5) - relevant_lines = [ - line - for line in result.stdout.split("\n") - if "python" in line or "embedding" in line or "zmq" in line - ] - if relevant_lines: - for line in relevant_lines[:5]: # Show first 5 matches - print(f" {line}") - else: - print(" No relevant processes found") - except Exception: - print(" Process check failed") - - # Setup signal handlers for cleanup - def signal_handler(signum, frame): - print(f"\n๐Ÿ’ฅ [WRAPPER] Received signal {signum}, cleaning up...") - cleanup_all_processes() - sys.exit(128 + signum) - - signal.signal(signal.SIGTERM, signal_handler) - signal.signal(signal.SIGINT, signal_handler) - - # Run pytest with monitoring - print(f"๐Ÿš€ [WRAPPER] Starting pytest with args: {pytest_args}") - - try: - # Use Popen for better control - cmd = [sys.executable, "scripts/ci_debug_pytest.py", *pytest_args] - process = subprocess.Popen( - cmd, - stdout=sys.stdout, - stderr=sys.stderr, - preexec_fn=os.setsid if hasattr(os, "setsid") else None, - ) - - # Monitor with timeout - start_time = time.time() - timeout = 600 # 10 minutes - monitor_interval = 10 # Check every 10 seconds - - while True: - # Check if process completed - return_code = process.poll() - if return_code is not None: - print(f"โœ… [WRAPPER] Pytest completed with return code: {return_code}") - break - - # Check for timeout - elapsed = time.time() - start_time - if elapsed > timeout: - print(f"๐Ÿ’ฅ [WRAPPER] Pytest timed out after {elapsed:.1f}s") - - # Try graceful termination - try: - print("๐Ÿ”„ [WRAPPER] Attempting graceful termination...") - process.terminate() - try: - process.wait(timeout=10) - except subprocess.TimeoutExpired: - print("๐Ÿ’€ [WRAPPER] Graceful termination failed, force killing...") - process.kill() - process.wait() - except Exception as e: - print(f"โš ๏ธ [WRAPPER] Error during termination: {e}") - - return_code = 124 # timeout exit code - break - - # Monitor progress - if int(elapsed) % 30 == 0: # Every 30 seconds - print(f"๐Ÿ“Š [WRAPPER] Monitor check: {elapsed:.0f}s elapsed, pytest still running") - - time.sleep(monitor_interval) - - # Post-test cleanup verification - print("๐Ÿ” [WRAPPER] Post-test cleanup verification...") - time.sleep(2) - - try: - result = subprocess.run(["ps", "aux"], capture_output=True, text=True, timeout=5) - lines = result.stdout.split("\n") - current_pid = str(os.getpid()) - - remaining = [] - for line in lines: - # Skip our own wrapper process - if current_pid in line or "ci_pytest_wrapper.py" in line: - continue - # Only check for actual problematic processes - if "python" in line and ("pytest" in line or "embedding" in line): - # Skip debug script too - if "ci_debug_pytest.py" not in line: - remaining.append(line) - - if remaining: - print(f"โš ๏ธ [WRAPPER] Found {len(remaining)} remaining processes:") - for line in remaining[:3]: # Show first 3 - print(f" {line}") - print("๐Ÿ’€ [WRAPPER] Performing final cleanup...") - cleanup_all_processes() - else: - print("โœ… [WRAPPER] No remaining processes found") - except Exception: - print("โš ๏ธ [WRAPPER] Post-test verification failed, performing cleanup anyway") - cleanup_all_processes() - - return return_code - - except Exception as e: - print(f"๐Ÿ’ฅ [WRAPPER] Error running pytest: {e}") - cleanup_all_processes() - return 1 - - -def main(): - """Main entry point.""" - if len(sys.argv) < 2: - print("Usage: ci_pytest_wrapper.py ") - return 1 - - pytest_args = sys.argv[1:] - print(f"๐ŸŽฏ [WRAPPER] CI pytest wrapper starting with args: {pytest_args}") - - return run_pytest_with_monitoring(pytest_args) - - -if __name__ == "__main__": - sys.exit(main())