From 8714472cd8b3e3270b8cba1339a35e2f0cd2c844 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Thu, 7 Aug 2025 21:53:58 -0700 Subject: [PATCH] fix: prevent hang in CI by flushing print statements and redirecting embedding server output - Add flush=True to all print statements in convert_to_csr.py to prevent buffer deadlock - Redirect embedding server stdout/stderr to DEVNULL in CI environment (CI=true) - Fix timeout in embedding_server_manager.stop_server() final wait call --- .../leann_backend_hnsw/convert_to_csr.py | 13 ++++++++++++ .../src/leann/embedding_server_manager.py | 20 +++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/convert_to_csr.py b/packages/leann-backend-hnsw/leann_backend_hnsw/convert_to_csr.py index 1f9d4f1..7fb30ec 100644 --- a/packages/leann-backend-hnsw/leann_backend_hnsw/convert_to_csr.py +++ b/packages/leann-backend-hnsw/leann_backend_hnsw/convert_to_csr.py @@ -1,5 +1,6 @@ import argparse import gc # Import garbage collector interface +import logging import os import struct import sys @@ -7,6 +8,12 @@ import time import numpy as np +# Set up logging to avoid print buffer issues +logger = logging.getLogger(__name__) +LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper() +log_level = getattr(logging, LOG_LEVEL, logging.WARNING) +logger.setLevel(log_level) + # --- FourCCs (add more if needed) --- INDEX_HNSW_FLAT_FOURCC = int.from_bytes(b"IHNf", "little") # Add other HNSW fourccs if you expect different storage types inside HNSW @@ -243,6 +250,12 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings= output_filename: Output CSR index file prune_embeddings: Whether to prune embedding storage (write NULL storage marker) """ + # Disable buffering for print statements to avoid deadlock in CI/pytest + import functools + + global print + print = functools.partial(print, flush=True) + print(f"Starting conversion: {input_filename} -> {output_filename}") start_time = time.time() original_hnsw_data = {} diff --git a/packages/leann-core/src/leann/embedding_server_manager.py b/packages/leann-core/src/leann/embedding_server_manager.py index a576feb..445cbdd 100644 --- a/packages/leann-core/src/leann/embedding_server_manager.py +++ b/packages/leann-core/src/leann/embedding_server_manager.py @@ -305,15 +305,23 @@ class EmbeddingServerManager: project_root = Path(__file__).parent.parent.parent.parent.parent logger.info(f"Command: {' '.join(command)}") - # Let server output go directly to console - # The server will respect LEANN_LOG_LEVEL environment variable - # IMPORTANT: Use a new session so we can manage the whole process group reliably, - # and detach stdio to avoid lingering output keeping CI steps noisy/alive. + # In CI environment, redirect output to avoid buffer deadlock + # Embedding servers use many print statements that can fill buffers + is_ci = os.environ.get("CI") == "true" + if is_ci: + stdout_target = subprocess.DEVNULL + stderr_target = subprocess.DEVNULL + logger.info("CI environment detected, redirecting embedding server output to DEVNULL") + else: + stdout_target = None # Direct to console for visible logs + stderr_target = None # Direct to console for visible logs + + # IMPORTANT: Use a new session so we can manage the whole process group reliably self.server_process = subprocess.Popen( command, cwd=project_root, - stdout=None, # Direct to console for visible logs - stderr=None, # Direct to console for visible logs + stdout=stdout_target, + stderr=stderr_target, start_new_session=True, ) self.server_port = port