fix: prevent test runner hanging on Python 3.9/3.13 due to ZMQ and process cleanup issues

Based on excellent analysis from user, implemented comprehensive fixes:

1. ZMQ Socket Cleanup:
   - Set LINGER=0 on all ZMQ sockets (client and server)
   - Use try-finally blocks to ensure socket.close() and context.term()
   - Prevents blocking on exit when ZMQ contexts have pending operations

2. Global Test Cleanup:
   - Added tests/conftest.py with session-scoped cleanup fixture
   - Cleans up leftover ZMQ contexts and child processes after all tests
   - Lists remaining threads for debugging

3. CI Improvements:
   - Apply timeout to ALL Python versions on Linux (not just 3.13)
   - Increased timeout to 180s for better reliability
   - Added process cleanup (pkill) on timeout

4. Dependencies:
   - Added psutil>=5.9.0 to test dependencies for process management

Root cause: Python 3.9/3.13 are more sensitive to cleanup timing during
interpreter shutdown. ZMQ's default LINGER=-1 was blocking exit, and
atexit handlers were unreliable for cleanup.

This should resolve the 'all tests pass but CI hangs' issue.
This commit is contained in:
Andy Lee
2025-08-08 15:57:22 -07:00
parent 05e1efa00a
commit e3762458fc
7 changed files with 138 additions and 29 deletions

View File

@@ -100,6 +100,7 @@ def create_diskann_embedding_server(
socket = context.socket(
zmq.REP
) # REP socket for both BaseSearcher and DiskANN C++ REQ clients
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
socket.bind(f"tcp://*:{zmq_port}")
logger.info(f"DiskANN ZMQ REP server listening on port {zmq_port}")

View File

@@ -92,6 +92,7 @@ def create_hnsw_embedding_server(
"""ZMQ server thread"""
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
socket.bind(f"tcp://*:{zmq_port}")
logger.info(f"HNSW ZMQ server listening on port {zmq_port}")

View File

@@ -87,21 +87,23 @@ def compute_embeddings_via_server(chunks: list[str], model_name: str, port: int)
# Connect to embedding server
context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
socket.connect(f"tcp://localhost:{port}")
# Send chunks to server for embedding computation
request = chunks
socket.send(msgpack.packb(request))
try:
# Send chunks to server for embedding computation
request = chunks
socket.send(msgpack.packb(request))
# Receive embeddings from server
response = socket.recv()
embeddings_list = msgpack.unpackb(response)
# Receive embeddings from server
response = socket.recv()
embeddings_list = msgpack.unpackb(response)
# Convert back to numpy array
embeddings = np.array(embeddings_list, dtype=np.float32)
socket.close()
context.term()
# Convert back to numpy array
embeddings = np.array(embeddings_list, dtype=np.float32)
finally:
socket.close(linger=0)
context.term()
return embeddings

View File

@@ -132,10 +132,13 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
import msgpack
import zmq
context = None
socket = None
try:
context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.setsockopt(zmq.RCVTIMEO, 30000) # 30 second timeout
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
socket.connect(f"tcp://localhost:{zmq_port}")
# Send embedding request
@@ -147,9 +150,6 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
response_bytes = socket.recv()
response = msgpack.unpackb(response_bytes)
socket.close()
context.term()
# Convert response to numpy array
if isinstance(response, list) and len(response) > 0:
return np.array(response, dtype=np.float32)
@@ -158,6 +158,11 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
except Exception as e:
raise RuntimeError(f"Failed to compute embeddings via server: {e}")
finally:
if socket:
socket.close(linger=0)
if context:
context.term()
@abstractmethod
def search(