yichuan520030910320
2025-07-22 20:44:02 -07:00
11 changed files with 159 additions and 404 deletions

.vscode/extensions.json vendored

@@ -1,9 +0,0 @@
{
"recommendations": [
"llvm-vs-code-extensions.vscode-clangd",
"ms-python.python",
"ms-vscode.cmake-tools",
"vadimcn.vscode-lldb",
"eamodio.gitlens",
]
}

.vscode/launch.json vendored

@@ -1,283 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
// new embedder
{
"name": "New Embedder",
"type": "debugpy",
"request": "launch",
"program": "demo/main.py",
"console": "integratedTerminal",
"args": [
"--search",
"--use-original",
"--domain",
"dpr",
"--nprobe",
"5000",
"--load",
"flat",
"--embedder",
"intfloat/multilingual-e5-small"
]
},
//python /home/ubuntu/Power-RAG/faiss/demo/simple_build.py
{
"name": "main.py",
"type": "debugpy",
"request": "launch",
"program": "demo/main.py",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"--query",
"1000",
"--load",
"bm25"
]
},
{
"name": "Simple Build",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"faiss/demo/simple_build.py"
],
"env": {
"LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
}
},
//# Fix for Intel MKL error
//export LD_PRELOAD=/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so
//python faiss/demo/build_demo.py
{
"name": "Build Demo",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"faiss/demo/build_demo.py"
],
"env": {
"LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
}
},
{
"name": "DiskANN Serve",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"demo/main.py",
"--mode",
"serve",
"--engine",
"sglang",
"--load-indices",
"diskann",
"--domain",
"rpj_wiki",
"--lazy-load",
"--recompute-beighbor-embeddings",
"--port",
"8082",
"--diskann-search-memory-maximum",
"2",
"--diskann-graph",
"240",
"--search-only"
],
"env": {
"PYTHONPATH": "${workspaceFolder}/faiss_repo/build/faiss/python:$PYTHONPATH"
},
"preLaunchTask": "CMake: build",
},
{
"name": "DiskANN Serve MAC",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"demo/main.py",
"--mode",
"serve",
"--engine",
"ollama",
"--load-indices",
"diskann",
"--domain",
"rpj_wiki",
"--lazy-load",
"--recompute-beighbor-embeddings"
],
"preLaunchTask": "CMake: build",
"env": {
"KMP_DUPLICATE_LIB_OK": "TRUE",
"OMP_NUM_THREADS": "1",
"MKL_NUM_THREADS": "1",
"DYLD_INSERT_LIBRARIES": "/Users/ec2-user/Power-RAG/.venv/lib/python3.10/site-packages/torch/lib/libomp.dylib",
"KMP_BLOCKTIME": "0"
}
},
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "ric/main_ric.py",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"--config-name",
"${input:configSelection}"
],
"justMyCode": false
},
//python ./demo/validate_equivalence.py sglang
{
"name": "Validate Equivalence",
"type": "debugpy",
"request": "launch",
"program": "demo/validate_equivalence.py",
"console": "integratedTerminal",
"args": [
"sglang"
],
},
//python demo/retrieval_demo.py --engine sglang --skip-embeddings --domain dpr --load-indices flat ivf_flat
{
"name": "Retrieval Demo",
"type": "debugpy",
"request": "launch",
"program": "demo/retrieval_demo.py",
"console": "integratedTerminal",
"args": [
"--engine",
"vllm",
"--skip-embeddings",
"--domain",
"dpr",
"--load-indices",
// "flat",
"ivf_flat"
],
},
//python demo/retrieval_demo.py --engine sglang --skip-embeddings --domain dpr --load-indices diskann --hnsw-M 64 --hnsw-efConstruction 150 --hnsw-efSearch 128 --hnsw-sq-bits 8
{
"name": "Retrieval Demo DiskANN",
"type": "debugpy",
"request": "launch",
"program": "demo/retrieval_demo.py",
"console": "integratedTerminal",
"args": [
"--engine",
"sglang",
"--skip-embeddings",
"--domain",
"dpr",
"--load-indices",
"diskann",
"--hnsw-M",
"64",
"--hnsw-efConstruction",
"150",
"--hnsw-efSearch",
"128",
"--hnsw-sq-bits",
"8"
],
},
{
"name": "Find Probe",
"type": "debugpy",
"request": "launch",
"program": "find_probe.py",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
},
{
"name": "Python: Attach",
"type": "debugpy",
"request": "attach",
"processId": "${command:pickProcess}",
"justMyCode": true
},
{
"name": "Edge RAG",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"edgerag_demo.py"
],
"env": {
"LD_PRELOAD": "/lib/x86_64-linux-gnu/libiomp5.so /lib/x86_64-linux-gnu/libmkl_core.so /lib/x86_64-linux-gnu/libmkl_intel_lp64.so /lib/x86_64-linux-gnu/libmkl_intel_thread.so",
"MKL_NUM_THREADS": "1",
"OMP_NUM_THREADS": "1",
}
},
{
"name": "Launch Embedding Server",
"type": "debugpy",
"request": "launch",
"program": "demo/embedding_server.py",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"--domain",
"rpj_wiki",
"--zmq-port",
"5556",
]
},
{
"name": "HNSW Serve",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"demo/main.py",
"--domain",
"rpj_wiki",
"--load",
"hnsw",
"--mode",
"serve",
"--search",
"--skip-pa",
"--recompute",
"--hnsw-old"
],
"env": {
"LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
}
},
],
"inputs": [
{
"id": "configSelection",
"type": "pickString",
"description": "Select a configuration",
"options": [
"example_config",
"vllm_gritlm"
],
"default": "example_config"
}
],
}
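
Several of the configs above work around an Intel MKL loading error by preloading MKL and OpenMP shared libraries. For running the same scripts outside VS Code, a minimal Python sketch of that workaround (library and script paths are taken from the configs above; adjust for your system):

import os
import subprocess

# Preload the MKL/OpenMP libraries into the child interpreter, as the launch configs do.
env = dict(os.environ)
env["LD_PRELOAD"] = (
    "/lib/x86_64-linux-gnu/libmkl_core.so:"
    "/lib/x86_64-linux-gnu/libmkl_intel_thread.so:"
    "/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:"
    "/lib/x86_64-linux-gnu/libiomp5.so"
)
subprocess.run(
    [".venv/bin/python", "faiss/demo/simple_build.py"],
    env=env,
    check=True,
)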

.vscode/settings.json vendored

@@ -1,43 +0,0 @@
{
"python.analysis.extraPaths": [
"./sglang_repo/python"
],
"cmake.sourceDirectory": "${workspaceFolder}/DiskANN",
"cmake.configureArgs": [
"-DPYBIND=True",
"-DUPDATE_EDITABLE_INSTALL=ON",
],
"cmake.environment": {
"PATH": "/Users/ec2-user/Power-RAG/.venv/bin:${env:PATH}"
},
"cmake.buildDirectory": "${workspaceFolder}/build",
"files.associations": {
"*.tcc": "cpp",
"deque": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"map": "cpp",
"unordered_set": "cpp",
"atomic": "cpp",
"inplace_vector": "cpp",
"*.ipp": "cpp",
"forward_list": "cpp",
"list": "cpp",
"any": "cpp",
"system_error": "cpp",
"__hash_table": "cpp",
"__split_buffer": "cpp",
"__tree": "cpp",
"ios": "cpp",
"set": "cpp",
"__string": "cpp",
"string_view": "cpp",
"ranges": "cpp",
"iosfwd": "cpp"
},
"lldb.displayFormat": "auto",
"lldb.showDisassembly": "auto",
"lldb.dereferencePointers": true,
"lldb.consoleMode": "commands",
}

.vscode/tasks.json vendored

@@ -1,16 +0,0 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "cmake",
"label": "CMake: build",
"command": "build",
"targets": [
"all"
],
"group": "build",
"problemMatcher": [],
"detail": "CMake template build task"
}
]
}


@@ -1,40 +1,40 @@
 import argparse
-from llama_index.core import SimpleDirectoryReader, Settings
+from llama_index.core import SimpleDirectoryReader
 from llama_index.core.node_parser import SentenceSplitter
 import asyncio
 import dotenv
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
-import shutil
+from leann.api import LeannBuilder, LeannChat
 from pathlib import Path

 dotenv.load_dotenv()

-node_parser = SentenceSplitter(
-    chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
-)
-print("Loading documents...")
-documents = SimpleDirectoryReader(
-    "examples/data",
-    recursive=True,
-    encoding="utf-8",
-    required_exts=[".pdf", ".txt", ".md"],
-).load_data(show_progress=True)
-print("Documents loaded.")
-all_texts = []
-for doc in documents:
-    nodes = node_parser.get_nodes_from_documents([doc])
-    for node in nodes:
-        all_texts.append(node.get_content())

 async def main(args):
     INDEX_DIR = Path(args.index_dir)
     INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")

     if not INDEX_DIR.exists():
-        print(f"--- Index directory not found, building new index ---")
-        print(f"\n[PHASE 1] Building Leann index...")
+        node_parser = SentenceSplitter(
+            chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
+        )
+        print("Loading documents...")
+        documents = SimpleDirectoryReader(
+            "examples/data",
+            recursive=True,
+            encoding="utf-8",
+            required_exts=[".pdf", ".txt", ".md"],
+        ).load_data(show_progress=True)
+        print("Documents loaded.")
+        all_texts = []
+        for doc in documents:
+            nodes = node_parser.get_nodes_from_documents([doc])
+            for node in nodes:
+                all_texts.append(node.get_content())
+
+        print("--- Index directory not found, building new index ---")
+        print("\n[PHASE 1] Building Leann index...")
         # Use HNSW backend for better macOS compatibility
         builder = LeannBuilder(
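
The hunk truncates at the builder construction. Assuming the add_text/build_index API from LEANN's examples (an assumption; the call site itself is cut off here), the build phase presumably continues along these lines:

from leann.api import LeannBuilder

# Hypothetical sketch: the backend name is illustrative, not taken from this diff.
builder = LeannBuilder(backend_name="hnsw")
for text in all_texts:
    builder.add_text(text)
builder.build_index(INDEX_PATH)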


@@ -94,7 +94,9 @@ def create_diskann_embedding_server(
     def zmq_server_thread():
         """ZMQ server thread using REP socket for universal compatibility"""
         context = zmq.Context()
-        socket = context.socket(zmq.REP)  # REP socket for both BaseSearcher and DiskANN C++ REQ clients
+        socket = context.socket(
+            zmq.REP
+        )  # REP socket for both BaseSearcher and DiskANN C++ REQ clients
         socket.bind(f"tcp://*:{zmq_port}")
         logger.info(f"DiskANN ZMQ REP server listening on port {zmq_port}")
@@ -128,7 +130,9 @@ def create_diskann_embedding_server(
                 node_ids = list(req_proto.node_ids)
                 if not node_ids:
-                    raise RuntimeError(f"PROTOBUF: Received empty node_ids! Message size: {len(message)}")
+                    raise RuntimeError(
+                        f"PROTOBUF: Received empty node_ids! Message size: {len(message)}"
+                    )
                 logger.info(
                     f"✅ PROTOBUF: Node ID request for {len(node_ids)} node embeddings: {node_ids[:10]}"
@@ -175,9 +179,7 @@ def create_diskann_embedding_server(
                     raise
                 # Debug logging
-                logger.debug(
-                    f"Processing {len(texts)} texts"
-                )
+                logger.debug(f"Processing {len(texts)} texts")
                 logger.debug(
                     f"Text lengths: {[len(t) for t in texts[:5]]}"
                 )  # Show first 5
@@ -220,6 +222,7 @@ def create_diskann_embedding_server(
             except Exception as e:
                 logger.error(f"Error in ZMQ server loop: {e}")
                 import traceback
+
                 traceback.print_exc()
                 raise
@@ -237,6 +240,17 @@ def create_diskann_embedding_server(
 if __name__ == "__main__":
+    import signal
+    import sys
+
+    def signal_handler(sig, frame):
+        logger.info(f"Received signal {sig}, shutting down gracefully...")
+        sys.exit(0)
+
+    # Register signal handlers for graceful shutdown
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
     parser = argparse.ArgumentParser(description="DiskANN Embedding service")
     parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
     parser.add_argument(
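
The pattern these hunks add up to, a single REP socket answering both Python and C++ REQ clients plus signal handlers for clean shutdown, can be distilled into a self-contained sketch (port and reply payload are illustrative):

import signal
import sys

import zmq  # pyzmq

def serve(zmq_port: int = 5555) -> None:
    context = zmq.Context()
    socket = context.socket(zmq.REP)  # REP pairs with any REQ client, Python or C++
    socket.bind(f"tcp://*:{zmq_port}")

    def signal_handler(sig, frame):
        # Mirrors the graceful-shutdown handler registered in the diff above
        socket.close(0)
        context.term()
        sys.exit(0)

    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    while True:
        message = socket.recv()          # blocks until a REQ client sends
        socket.send(b"ack: " + message)  # REP must reply before the next recv

if __name__ == "__main__":
    serve()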


@@ -268,6 +268,17 @@ def create_hnsw_embedding_server(
 if __name__ == "__main__":
+    import signal
+    import sys
+
+    def signal_handler(sig, frame):
+        logger.info(f"Received signal {sig}, shutting down gracefully...")
+        sys.exit(0)
+
+    # Register signal handlers for graceful shutdown
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
     parser = argparse.ArgumentParser(description="HNSW Embedding service")
     parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
    parser.add_argument(


@@ -421,9 +421,9 @@ class LeannSearcher:
         logger.info(f"  Top_k: {top_k}")
         logger.info(f"  Additional kwargs: {kwargs}")

-        start_time = time.time()
         zmq_port = None
+        start_time = time.time()
         if recompute_embeddings:
             zmq_port = self.backend_impl._ensure_server_running(
                 self.meta_path_str,
@@ -431,6 +431,10 @@ class LeannSearcher:
             **kwargs,
         )
         del expected_zmq_port
+        zmq_time = time.time() - start_time
+        logger.info(f"  Launching server time: {zmq_time} seconds")
+
+        start_time = time.time()
         query_embedding = self.backend_impl.compute_query_embedding(
             query,
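
With the reordering above, the first timer covers only server startup, and the added reset makes the next measurement cover only query embedding. The same measure-then-reset pattern as a small helper (a sketch; not code from the repo):

import time
from contextlib import contextmanager

@contextmanager
def timed(label: str):
    start = time.time()
    try:
        yield
    finally:
        print(f"{label}: {time.time() - start:.3f} seconds")

# Usage mirroring the hunk: time server launch and embedding separately.
with timed("Launching server time"):
    ...  # _ensure_server_running(...)
with timed("Embedding time"):
    ...  # compute_query_embedding(...)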


@@ -25,6 +25,8 @@ def compute_embeddings(
     model_name: str,
     mode: str = "sentence-transformers",
     is_build: bool = False,
+    batch_size: int = 32,
+    adaptive_optimization: bool = True,
 ) -> np.ndarray:
     """
     Unified embedding computation entry point
@@ -33,13 +35,20 @@ def compute_embeddings(
         texts: List of texts to compute embeddings for
         model_name: Model name
         mode: Computation mode ('sentence-transformers', 'openai', 'mlx')
+        is_build: Whether this is a build operation (shows progress bar)
+        batch_size: Batch size for processing
+        adaptive_optimization: Whether to use adaptive optimization based on batch size

     Returns:
         Normalized embeddings array, shape: (len(texts), embedding_dim)
     """
     if mode == "sentence-transformers":
         return compute_embeddings_sentence_transformers(
-            texts, model_name, is_build=is_build
+            texts,
+            model_name,
+            is_build=is_build,
+            batch_size=batch_size,
+            adaptive_optimization=adaptive_optimization,
         )
     elif mode == "openai":
         return compute_embeddings_openai(texts, model_name)
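
With the two new keyword arguments, a call through the unified entry point might look like the following (the model name and texts are illustrative, not from this diff):

embeddings = compute_embeddings(
    ["hello world", "vector search"],
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # example model
    mode="sentence-transformers",
    is_build=True,               # show a progress bar, as during index builds
    batch_size=64,               # starting point; overridden on MPS/CUDA when adaptive
    adaptive_optimization=True,  # let device-specific constants pick the batch size
)
print(embeddings.shape)  # (len(texts), embedding_dim)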
@@ -56,9 +65,19 @@ def compute_embeddings_sentence_transformers(
     device: str = "auto",
     batch_size: int = 32,
     is_build: bool = False,
+    adaptive_optimization: bool = True,
 ) -> np.ndarray:
     """
-    Compute embeddings using SentenceTransformer with model caching
+    Compute embeddings using SentenceTransformer with model caching and adaptive optimization
+
+    Args:
+        texts: List of texts to compute embeddings for
+        model_name: Model name
+        use_fp16: Whether to use FP16 precision
+        device: Device to use ('auto', 'cuda', 'mps', 'cpu')
+        batch_size: Batch size for processing
+        is_build: Whether this is a build operation (shows progress bar)
+        adaptive_optimization: Whether to use adaptive optimization based on batch size
     """
     # Handle empty input
     if not texts:
@@ -76,28 +95,68 @@ def compute_embeddings_sentence_transformers(
     else:
         device = "cpu"

+    # Apply optimizations based on benchmark results
+    if adaptive_optimization:
+        # Use optimal batch_size constants for different devices based on benchmark results
+        if device == "mps":
+            batch_size = 128  # MPS optimal batch size from benchmark
+            if model_name == "Qwen/Qwen3-Embedding-0.6B":
+                batch_size = 64
+        elif device == "cuda":
+            batch_size = 256  # CUDA optimal batch size
+        # Keep original batch_size for CPU
+
     # Create cache key
-    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}"
+    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}_optimized"

     # Check if model is already cached
     if cache_key in _model_cache:
-        logger.info(f"Using cached model: {model_name}")
+        logger.info(f"Using cached optimized model: {model_name}")
         model = _model_cache[cache_key]
     else:
-        logger.info(f"Loading and caching SentenceTransformer model: {model_name}")
+        logger.info(
+            f"Loading and caching optimized SentenceTransformer model: {model_name}"
+        )
         from sentence_transformers import SentenceTransformer

         logger.info(f"Using device: {device}")

-        # Prepare model and tokenizer optimization parameters
+        # Apply hardware optimizations
+        if device == "cuda":
+            # TODO: Haven't tested this yet
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+            torch.backends.cudnn.benchmark = True
+            torch.backends.cudnn.deterministic = False
+            torch.cuda.set_per_process_memory_fraction(0.9)
+        elif device == "mps":
+            try:
+                if hasattr(torch.mps, "set_per_process_memory_fraction"):
+                    torch.mps.set_per_process_memory_fraction(0.9)
+            except AttributeError:
+                logger.warning(
+                    "Some MPS optimizations not available in this PyTorch version"
+                )
+        elif device == "cpu":
+            # TODO: Haven't tested this yet
+            torch.set_num_threads(min(8, os.cpu_count() or 4))
+            try:
+                torch.backends.mkldnn.enabled = True
+            except AttributeError:
+                pass
+
+        # Prepare optimized model and tokenizer parameters
         model_kwargs = {
             "torch_dtype": torch.float16 if use_fp16 else torch.float32,
             "low_cpu_mem_usage": True,
             "_fast_init": True,
+            "attn_implementation": "eager",  # Use eager attention for speed
         }

         tokenizer_kwargs = {
             "use_fast": True,
+            "padding": True,
+            "truncation": True,
         }

         try:
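
The adaptive branch hard-codes benchmark-derived constants. Factored out as a standalone helper (a sketch, not code from the repo), the policy reads:

def pick_batch_size(device: str, model_name: str, requested: int = 32) -> int:
    """Batch-size policy from the hunk above; CPU keeps the caller's value."""
    if device == "mps":
        return 64 if model_name == "Qwen/Qwen3-Embedding-0.6B" else 128
    if device == "cuda":
        return 256
    return requested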
@@ -128,32 +187,44 @@ def compute_embeddings_sentence_transformers(
             )
             logger.info("Model loaded successfully! (network + optimized)")

-        # Apply additional optimizations (if supported)
+        # Apply additional optimizations based on mode
         if use_fp16 and device in ["cuda", "mps"]:
             try:
                 model = model.half()
-                model = torch.compile(model)
-                logger.info(
-                    f"Using FP16 precision and compile optimization: {model_name}"
-                )
+                logger.info(f"Applied FP16 precision: {model_name}")
             except Exception as e:
-                logger.warning(f"FP16 or compile optimization failed: {e}")
+                logger.warning(f"FP16 optimization failed: {e}")
+
+        # Apply torch.compile optimization
+        if device in ["cuda", "mps"]:
+            try:
+                model = torch.compile(model, mode="reduce-overhead", dynamic=True)
+                logger.info(f"Applied torch.compile optimization: {model_name}")
+            except Exception as e:
+                logger.warning(f"torch.compile optimization failed: {e}")
+
+        # Set model to eval mode and disable gradients for inference
+        model.eval()
+        for param in model.parameters():
+            param.requires_grad_(False)

         # Cache the model
         _model_cache[cache_key] = model
         logger.info(f"Model cached: {cache_key}")

-    # Compute embeddings
-    logger.info("Starting embedding computation...")
-    embeddings = model.encode(
-        texts,
-        batch_size=batch_size,
-        show_progress_bar=is_build,  # Don't show progress bar in server environment
-        convert_to_numpy=True,
-        normalize_embeddings=False,
-        device=device,
-    )
+    # Compute embeddings with optimized inference mode
+    logger.info(f"Starting embedding computation... (batch_size: {batch_size})")
+    # Use torch.inference_mode for optimal performance
+    with torch.inference_mode():
+        embeddings = model.encode(
+            texts,
+            batch_size=batch_size,
+            show_progress_bar=is_build,  # Don't show progress bar in server environment
+            convert_to_numpy=True,
+            normalize_embeddings=False,
+            device=device,
+        )

     logger.info(
         f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
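
Stripped of caching and logging, the final encode path reduces to the following (model name is illustrative; SentenceTransformer.encode already runs without gradients, so inference_mode mainly trims the remaining autograd bookkeeping):

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # example model
model.eval()

with torch.inference_mode():
    embeddings = model.encode(
        ["an example sentence"],
        batch_size=32,
        convert_to_numpy=True,
        normalize_embeddings=False,
    )
print(embeddings.shape)  # (1, embedding_dim)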


@@ -13,7 +13,7 @@ import psutil
 LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
 logging.basicConfig(
     level=getattr(logging, LOG_LEVEL, logging.INFO),
-    format="%(asctime)s - %(levelname)s - %(message)s",
+    format="%(levelname)s - %(name)s - %(message)s",
 )
 logger = logging.getLogger(__name__)
@@ -339,4 +339,10 @@ class EmbeddingServerManager:
                 )
                 self.server_process.kill()

+            # Clean up process resources to prevent resource tracker warnings
+            try:
+                self.server_process.wait()  # Ensure process is fully cleaned up
+            except Exception:
+                pass
+
             self.server_process = None
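
The added wait() matters because kill() alone leaves a zombie until the parent reaps the exit status. A minimal standalone version of the terminate, kill, reap sequence (names are illustrative; assumes a subprocess.Popen server handle):

import subprocess

def stop_server(proc: subprocess.Popen, timeout: float = 5.0) -> None:
    proc.terminate()  # polite SIGTERM first; pairs with the signal handlers added above
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()   # escalate to SIGKILL
        proc.wait()   # reap, so no zombie or resource-tracker warning remains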