Files
LEANN/tests/conftest.py
Andy Lee a35bfb0354 fix: comprehensive ZMQ timeout and cleanup fixes based on detailed analysis
Based on excellent diagnostic suggestions, implemented multiple fixes:

1. Diagnostics:
   - Added faulthandler to dump stack traces 10s before CI timeout
   - Enhanced CI script with trap handler to show processes/network on timeout
   - Added diag() function to capture pstree, processes, network listeners

2. ZMQ Socket Timeouts (critical fix):
   - Added RCVTIMEO=1000ms and SNDTIMEO=1000ms to all client sockets
   - Added IMMEDIATE=1 to avoid connection blocking
   - Reduced searcher timeout from 30s to 5s
   - This prevents infinite blocking on recv/send operations

3. Context.instance() Fix (major issue):
   - NEVER call term() or destroy() on Context.instance()
   - This was causing blocking as it waits for ALL sockets to close
   - Now only set linger=0 without terminating

4. Enhanced Process Cleanup:
   - Added _reap_children fixture for aggressive session-end cleanup
   - Better recursive child process termination
   - Added final wait to ensure cleanup completes

The 180s timeout was happening because:
- ZMQ recv() was blocking indefinitely without timeout
- Context.instance().term() was waiting for all sockets
- Child processes weren't being fully cleaned up

These changes should prevent the hanging completely.
2025-08-08 18:29:09 -07:00

187 lines
5.1 KiB
Python

"""Global test configuration and cleanup fixtures."""
import faulthandler
import os
import signal
import time
from collections.abc import Generator
import pytest
# Enable faulthandler to dump stack traces
faulthandler.enable()
@pytest.fixture(scope="session", autouse=True)
def _ci_backtraces():
"""Dump stack traces before CI timeout to diagnose hanging."""
if os.getenv("CI") == "true":
# Dump stack traces 10s before the 180s timeout
faulthandler.dump_traceback_later(170, repeat=True)
yield
faulthandler.cancel_dump_traceback_later()
@pytest.fixture(scope="session", autouse=True)
def global_test_cleanup() -> Generator:
"""Global cleanup fixture that runs after all tests.
This ensures all ZMQ connections and child processes are properly cleaned up,
preventing the test runner from hanging on exit.
"""
yield
# Cleanup after all tests
print("\n🧹 Running global test cleanup...")
# 1. Force cleanup of any LeannSearcher instances
try:
import gc
# Force garbage collection to trigger __del__ methods
gc.collect()
time.sleep(0.2)
except Exception:
pass
# 2. Set ZMQ linger but DON'T term Context.instance()
# Terminating the global instance can block if other code still has sockets
try:
import zmq
# Just set linger on the global instance, don't terminate it
ctx = zmq.Context.instance()
ctx.linger = 0
# Do NOT call ctx.term() or ctx.destroy() on the global instance!
# That would block waiting for all sockets to close
except Exception:
pass
# Kill any leftover child processes (including grandchildren)
try:
import psutil
current_process = psutil.Process()
# Get ALL descendants recursively
children = current_process.children(recursive=True)
if children:
print(f"\n⚠️ Cleaning up {len(children)} leftover child processes...")
# First try to terminate gracefully
for child in children:
try:
print(f" Terminating {child.pid} ({child.name()})")
child.terminate()
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
# Wait a bit for processes to terminate
gone, alive = psutil.wait_procs(children, timeout=2)
# Force kill any remaining processes
for child in alive:
try:
print(f" Force killing process {child.pid} ({child.name()})")
child.kill()
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
# Final wait to ensure cleanup
psutil.wait_procs(alive, timeout=1)
except ImportError:
# psutil not installed, try basic process cleanup
try:
# Send SIGTERM to all child processes
os.killpg(os.getpgid(os.getpid()), signal.SIGTERM)
except Exception:
pass
except Exception as e:
print(f"Warning: Error during process cleanup: {e}")
# List any remaining threads (for debugging)
try:
import threading
threads = [t for t in threading.enumerate() if t is not threading.main_thread()]
if threads:
print(f"\n⚠️ {len(threads)} non-main threads still running:")
for t in threads:
print(f" - {t.name} (daemon={t.daemon})")
except Exception:
pass
@pytest.fixture
def auto_cleanup_searcher():
"""Fixture that automatically cleans up LeannSearcher instances."""
searchers = []
def register(searcher):
"""Register a searcher for cleanup."""
searchers.append(searcher)
return searcher
yield register
# Cleanup all registered searchers
for searcher in searchers:
try:
searcher.cleanup()
except Exception:
pass
# Force garbage collection
import gc
gc.collect()
time.sleep(0.1)
@pytest.fixture(scope="session", autouse=True)
def _reap_children():
"""Reap all child processes at session end as a safety net."""
yield
# Final aggressive cleanup
try:
import psutil
me = psutil.Process()
kids = me.children(recursive=True)
for p in kids:
try:
p.terminate()
except Exception:
pass
_, alive = psutil.wait_procs(kids, timeout=2)
for p in alive:
try:
p.kill()
except Exception:
pass
except Exception:
pass
@pytest.fixture(autouse=True)
def cleanup_after_each_test():
"""Cleanup after each test to prevent resource leaks."""
yield
# Force garbage collection to trigger any __del__ methods
import gc
gc.collect()
# Give a moment for async cleanup
time.sleep(0.1)
def pytest_configure(config):
"""Configure pytest with better timeout handling."""
# Set default timeout method to thread if not specified
if not config.getoption("--timeout-method", None):
config.option.timeout_method = "thread"