Merge branch 'main' of https://github.com/yichuan-w/LEANN
.vscode/extensions.json (vendored, 9 lines, deleted)
@@ -1,9 +0,0 @@
-{
-    "recommendations": [
-        "llvm-vs-code-extensions.vscode-clangd",
-        "ms-python.python",
-        "ms-vscode.cmake-tools",
-        "vadimcn.vscode-lldb",
-        "eamodio.gitlens",
-    ]
-}
.vscode/launch.json (vendored, 283 lines, deleted)
@@ -1,283 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        // new emdedder
-        {
-            "name": "New Embedder",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "demo/main.py",
-            "console": "integratedTerminal",
-            "args": [
-                "--search",
-                "--use-original",
-                "--domain",
-                "dpr",
-                "--nprobe",
-                "5000",
-                "--load",
-                "flat",
-                "--embedder",
-                "intfloat/multilingual-e5-small"
-            ]
-        }
-        //python /home/ubuntu/Power-RAG/faiss/demo/simple_build.py
-        {
-            "name": "main.py",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "demo/main.py",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "--query",
-                "1000",
-                "--load",
-                "bm25"
-            ]
-        },
-        {
-            "name": "Simple Build",
-            "type": "lldb",
-            "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/python",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "faiss/demo/simple_build.py"
-            ],
-            "env": {
-                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
-            }
-        },
-        //# Fix for Intel MKL error
-        //export LD_PRELOAD=/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so
-        //python faiss/demo/build_demo.py
-        {
-            "name": "Build Demo",
-            "type": "lldb",
-            "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/python",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "faiss/demo/build_demo.py"
-            ],
-            "env": {
-                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
-            }
-        },
-        {
-            "name": "DiskANN Serve",
-            "type": "lldb",
-            "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/python",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "demo/main.py",
-                "--mode",
-                "serve",
-                "--engine",
-                "sglang",
-                "--load-indices",
-                "diskann",
-                "--domain",
-                "rpj_wiki",
-                "--lazy-load",
-                "--recompute-beighbor-embeddings",
-                "--port",
-                "8082",
-                "--diskann-search-memory-maximum",
-                "2",
-                "--diskann-graph",
-                "240",
-                "--search-only"
-            ],
-            "env": {
-                "PYTHONPATH": "${workspaceFolder}/faiss_repo/build/faiss/python:$PYTHONPATH"
-            },
-            "preLaunchTask": "CMake: build",
-        },
-        {
-            "name": "DiskANN Serve MAC",
-            "type": "lldb",
-            "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/python",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "demo/main.py",
-                "--mode",
-                "serve",
-                "--engine",
-                "ollama",
-                "--load-indices",
-                "diskann",
-                "--domain",
-                "rpj_wiki",
-                "--lazy-load",
-                "--recompute-beighbor-embeddings"
-            ],
-            "preLaunchTask": "CMake: build",
-            "env": {
-                "KMP_DUPLICATE_LIB_OK": "TRUE",
-                "OMP_NUM_THREADS": "1",
-                "MKL_NUM_THREADS": "1",
-                "DYLD_INSERT_LIBRARIES": "/Users/ec2-user/Power-RAG/.venv/lib/python3.10/site-packages/torch/lib/libomp.dylib",
-                "KMP_BLOCKTIME": "0"
-            }
-        },
-        {
-            "name": "Python Debugger: Current File with Arguments",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "ric/main_ric.py",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "--config-name",
-                "${input:configSelection}"
-            ],
-            "justMyCode": false
-        },
-        //python ./demo/validate_equivalence.py sglang
-        {
-            "name": "Validate Equivalence",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "demo/validate_equivalence.py",
-            "console": "integratedTerminal",
-            "args": [
-                "sglang"
-            ],
-        },
-        //python demo/retrieval_demo.py --engine sglang --skip-embeddings --domain dpr --load-indices flat ivf_flat
-        {
-            "name": "Retrieval Demo",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "demo/retrieval_demo.py",
-            "console": "integratedTerminal",
-            "args": [
-                "--engine",
-                "vllm",
-                "--skip-embeddings",
-                "--domain",
-                "dpr",
-                "--load-indices",
-                // "flat",
-                "ivf_flat"
-            ],
-        },
-        //python demo/retrieval_demo.py --engine sglang --skip-embeddings --domain dpr --load-indices diskann --hnsw-M 64 --hnsw-efConstruction 150 --hnsw-efSearch 128 --hnsw-sq-bits 8
-        {
-            "name": "Retrieval Demo DiskANN",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "demo/retrieval_demo.py",
-            "console": "integratedTerminal",
-            "args": [
-                "--engine",
-                "sglang",
-                "--skip-embeddings",
-                "--domain",
-                "dpr",
-                "--load-indices",
-                "diskann",
-                "--hnsw-M",
-                "64",
-                "--hnsw-efConstruction",
-                "150",
-                "--hnsw-efSearch",
-                "128",
-                "--hnsw-sq-bits",
-                "8"
-            ],
-        },
-        {
-            "name": "Find Probe",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "find_probe.py",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-        },
-        {
-            "name": "Python: Attach",
-            "type": "debugpy",
-            "request": "attach",
-            "processId": "${command:pickProcess}",
-            "justMyCode": true
-        },
-        {
-            "name": "Edge RAG",
-            "type": "lldb",
-            "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/python",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "edgerag_demo.py"
-            ],
-            "env": {
-                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libiomp5.so /lib/x86_64-linux-gnu/libmkl_core.so /lib/x86_64-linux-gnu/libmkl_intel_lp64.so /lib/x86_64-linux-gnu/libmkl_intel_thread.so",
-                "MKL_NUM_THREADS": "1",
-                "OMP_NUM_THREADS": "1",
-            }
-        },
-        {
-            "name": "Launch Embedding Server",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "demo/embedding_server.py",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "--domain",
-                "rpj_wiki",
-                "--zmq-port",
-                "5556",
-            ]
-        },
-        {
-            "name": "HNSW Serve",
-            "type": "lldb",
-            "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/python",
-            "console": "integratedTerminal",
-            "cwd": "${workspaceFolder}",
-            "args": [
-                "demo/main.py",
-                "--domain",
-                "rpj_wiki",
-                "--load",
-                "hnsw",
-                "--mode",
-                "serve",
-                "--search",
-                "--skip-pa",
-                "--recompute",
-                "--hnsw-old"
-            ],
-            "env": {
-                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
-            }
-        },
-    ],
-    "inputs": [
-        {
-            "id": "configSelection",
-            "type": "pickString",
-            "description": "Select a configuration",
-            "options": [
-                "example_config",
-                "vllm_gritlm"
-            ],
-            "default": "example_config"
-        }
-    ],
-}
.vscode/settings.json (vendored, 43 lines, deleted)
@@ -1,43 +0,0 @@
-{
-    "python.analysis.extraPaths": [
-        "./sglang_repo/python"
-    ],
-    "cmake.sourceDirectory": "${workspaceFolder}/DiskANN",
-    "cmake.configureArgs": [
-        "-DPYBIND=True",
-        "-DUPDATE_EDITABLE_INSTALL=ON",
-    ],
-    "cmake.environment": {
-        "PATH": "/Users/ec2-user/Power-RAG/.venv/bin:${env:PATH}"
-    },
-    "cmake.buildDirectory": "${workspaceFolder}/build",
-    "files.associations": {
-        "*.tcc": "cpp",
-        "deque": "cpp",
-        "string": "cpp",
-        "unordered_map": "cpp",
-        "vector": "cpp",
-        "map": "cpp",
-        "unordered_set": "cpp",
-        "atomic": "cpp",
-        "inplace_vector": "cpp",
-        "*.ipp": "cpp",
-        "forward_list": "cpp",
-        "list": "cpp",
-        "any": "cpp",
-        "system_error": "cpp",
-        "__hash_table": "cpp",
-        "__split_buffer": "cpp",
-        "__tree": "cpp",
-        "ios": "cpp",
-        "set": "cpp",
-        "__string": "cpp",
-        "string_view": "cpp",
-        "ranges": "cpp",
-        "iosfwd": "cpp"
-    },
-    "lldb.displayFormat": "auto",
-    "lldb.showDisassembly": "auto",
-    "lldb.dereferencePointers": true,
-    "lldb.consoleMode": "commands",
-}
.vscode/tasks.json (vendored, 16 lines, deleted)
@@ -1,16 +0,0 @@
-{
-    "version": "2.0.0",
-    "tasks": [
-        {
-            "type": "cmake",
-            "label": "CMake: build",
-            "command": "build",
-            "targets": [
-                "all"
-            ],
-            "group": "build",
-            "problemMatcher": [],
-            "detail": "CMake template build task"
-        }
-    ]
-}
@@ -1,40 +1,40 @@
 import argparse
-from llama_index.core import SimpleDirectoryReader, Settings
+from llama_index.core import SimpleDirectoryReader
 from llama_index.core.node_parser import SentenceSplitter
 import asyncio
 import dotenv
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
-import shutil
+from leann.api import LeannBuilder, LeannChat
 from pathlib import Path

 dotenv.load_dotenv()

-node_parser = SentenceSplitter(
-    chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
-)
-print("Loading documents...")
-documents = SimpleDirectoryReader(
-    "examples/data",
-    recursive=True,
-    encoding="utf-8",
-    required_exts=[".pdf", ".txt", ".md"],
-).load_data(show_progress=True)
-print("Documents loaded.")
-all_texts = []
-for doc in documents:
-    nodes = node_parser.get_nodes_from_documents([doc])
-    for node in nodes:
-        all_texts.append(node.get_content())


 async def main(args):
     INDEX_DIR = Path(args.index_dir)
     INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")

     if not INDEX_DIR.exists():
-        print(f"--- Index directory not found, building new index ---")
-
-        print(f"\n[PHASE 1] Building Leann index...")
+        node_parser = SentenceSplitter(
+            chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
+        )
+
+        print("Loading documents...")
+        documents = SimpleDirectoryReader(
+            "examples/data",
+            recursive=True,
+            encoding="utf-8",
+            required_exts=[".pdf", ".txt", ".md"],
+        ).load_data(show_progress=True)
+        print("Documents loaded.")
+        all_texts = []
+        for doc in documents:
+            nodes = node_parser.get_nodes_from_documents([doc])
+            for node in nodes:
+                all_texts.append(node.get_content())
+
+        print("--- Index directory not found, building new index ---")
+
+        print("\n[PHASE 1] Building Leann index...")

         # Use HNSW backend for better macOS compatibility
         builder = LeannBuilder(
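
The net effect of this hunk is to move document loading and chunking from module import time into the `if not INDEX_DIR.exists():` branch, so the expensive PDF parsing only runs when the index actually has to be built. A minimal sketch of the pattern, with a stand-in loader instead of the repo's code:

```python
from pathlib import Path

INDEX_DIR = Path("./index")  # placeholder path

def load_and_chunk() -> list[str]:
    # Stand-in for SimpleDirectoryReader + SentenceSplitter: this is the
    # expensive step the refactor defers until a build is actually needed.
    print("Loading documents...")
    return ["chunk one", "chunk two"]

if not INDEX_DIR.exists():
    all_texts = load_and_chunk()  # only pay the cost on first build
    # ... build the index from all_texts ...
else:
    print("Index exists, skipping document loading.")
```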
@@ -94,7 +94,9 @@ def create_diskann_embedding_server(
     def zmq_server_thread():
         """ZMQ server thread using REP socket for universal compatibility"""
         context = zmq.Context()
-        socket = context.socket(zmq.REP)  # REP socket for both BaseSearcher and DiskANN C++ REQ clients
+        socket = context.socket(
+            zmq.REP
+        )  # REP socket for both BaseSearcher and DiskANN C++ REQ clients
         socket.bind(f"tcp://*:{zmq_port}")
         logger.info(f"DiskANN ZMQ REP server listening on port {zmq_port}")

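
The rewrap above is pure formatting, but the comment it preserves is the useful part: a REP socket answers any REQ client, whether that is the Python BaseSearcher or the DiskANN C++ code. A minimal round-trip against such a server might look like this (hypothetical payload; the real clients speak the protobuf schema used elsewhere in this file):

```python
import zmq

# Connect a REQ client to the embedding server's REP socket.
# REQ/REP enforces a strict send -> recv lockstep on both sides.
context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect("tcp://localhost:5555")  # default --zmq-port

socket.send(b"ping")   # hypothetical payload, not the real protobuf request
reply = socket.recv()  # blocks until the server replies
print(f"server answered {len(reply)} bytes")
```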
@@ -128,7 +130,9 @@ def create_diskann_embedding_server(
             node_ids = list(req_proto.node_ids)

             if not node_ids:
-                raise RuntimeError(f"PROTOBUF: Received empty node_ids! Message size: {len(message)}")
+                raise RuntimeError(
+                    f"PROTOBUF: Received empty node_ids! Message size: {len(message)}"
+                )

             logger.info(
                 f"✅ PROTOBUF: Node ID request for {len(node_ids)} node embeddings: {node_ids[:10]}"
@@ -175,9 +179,7 @@ def create_diskann_embedding_server(
                 raise

             # Debug logging
-            logger.debug(
-                f"Processing {len(texts)} texts"
-            )
+            logger.debug(f"Processing {len(texts)} texts")
             logger.debug(
                 f"Text lengths: {[len(t) for t in texts[:5]]}"
             )  # Show first 5
@@ -220,6 +222,7 @@ def create_diskann_embedding_server(
         except Exception as e:
             logger.error(f"Error in ZMQ server loop: {e}")
             import traceback
+
             traceback.print_exc()
             raise

@@ -237,6 +240,17 @@ def create_diskann_embedding_server(


 if __name__ == "__main__":
+    import signal
+    import sys
+
+    def signal_handler(sig, frame):
+        logger.info(f"Received signal {sig}, shutting down gracefully...")
+        sys.exit(0)
+
+    # Register signal handlers for graceful shutdown
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
     parser = argparse.ArgumentParser(description="DiskANN Embedding service")
     parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
     parser.add_argument(
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: af2a26481e...25339b0341
@@ -268,6 +268,17 @@ def create_hnsw_embedding_server(


 if __name__ == "__main__":
+    import signal
+    import sys
+
+    def signal_handler(sig, frame):
+        logger.info(f"Received signal {sig}, shutting down gracefully...")
+        sys.exit(0)
+
+    # Register signal handlers for graceful shutdown
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
     parser = argparse.ArgumentParser(description="HNSW Embedding service")
     parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
     parser.add_argument(
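
Both embedding services gain the same graceful-shutdown hook. A self-contained sketch of the pattern, runnable outside the repo:

```python
import os
import signal
import sys

def signal_handler(sig, frame):
    # Log-and-exit keeps atexit handlers and socket cleanup running,
    # unlike an unhandled SIGTERM, which would skip them.
    print(f"Received signal {sig}, shutting down gracefully...")
    sys.exit(0)

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

# Simulate what `kill <pid>` or Ctrl-C would deliver:
os.kill(os.getpid(), signal.SIGTERM)
```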
@@ -421,9 +421,9 @@ class LeannSearcher:
         logger.info(f" Top_k: {top_k}")
         logger.info(f" Additional kwargs: {kwargs}")

-        start_time = time.time()

         zmq_port = None

+        start_time = time.time()
         if recompute_embeddings:
             zmq_port = self.backend_impl._ensure_server_running(
                 self.meta_path_str,
@@ -431,6 +431,10 @@ class LeannSearcher:
                 **kwargs,
             )
             del expected_zmq_port
+            zmq_time = time.time() - start_time
+            logger.info(f" Launching server time: {zmq_time} seconds")
+
+        start_time = time.time()

         query_embedding = self.backend_impl.compute_query_embedding(
             query,
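
These two hunks split one timer into per-phase timers: server launch and query embedding are now measured separately. The pattern, reduced to a sketch with stand-in phases:

```python
import time

def timed_phase(label, fn, *args, **kwargs):
    # Wall-clock timing of one pipeline phase, as search() now does
    # separately for server launch and for query embedding.
    start_time = time.time()
    result = fn(*args, **kwargs)
    print(f" {label} time: {time.time() - start_time} seconds")
    return result

port = timed_phase("Launching server", lambda: 5557)
emb = timed_phase("Query embedding", lambda q: [0.1] * 384, "what is leann?")
```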
@@ -25,6 +25,8 @@ def compute_embeddings(
     model_name: str,
     mode: str = "sentence-transformers",
     is_build: bool = False,
+    batch_size: int = 32,
+    adaptive_optimization: bool = True,
 ) -> np.ndarray:
     """
     Unified embedding computation entry point
@@ -33,13 +35,20 @@ def compute_embeddings(
         texts: List of texts to compute embeddings for
         model_name: Model name
         mode: Computation mode ('sentence-transformers', 'openai', 'mlx')
+        is_build: Whether this is a build operation (shows progress bar)
+        batch_size: Batch size for processing
+        adaptive_optimization: Whether to use adaptive optimization based on batch size

     Returns:
         Normalized embeddings array, shape: (len(texts), embedding_dim)
     """
     if mode == "sentence-transformers":
         return compute_embeddings_sentence_transformers(
-            texts, model_name, is_build=is_build
+            texts,
+            model_name,
+            is_build=is_build,
+            batch_size=batch_size,
+            adaptive_optimization=adaptive_optimization,
         )
     elif mode == "openai":
         return compute_embeddings_openai(texts, model_name)
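
With the new parameters threaded through, a call site would look roughly like this (a sketch; the texts and model name are placeholders, not values from this commit):

```python
# Hypothetical call site for the extended signature shown above.
embeddings = compute_embeddings(
    ["first chunk", "second chunk"],
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
    mode="sentence-transformers",
    is_build=True,                 # show the progress bar during index builds
    batch_size=32,                 # overridden on MPS/CUDA when adaptive_optimization=True
    adaptive_optimization=True,
)
assert embeddings.shape[0] == 2   # one row per input text
```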
@@ -56,9 +65,19 @@ def compute_embeddings_sentence_transformers(
     device: str = "auto",
     batch_size: int = 32,
     is_build: bool = False,
+    adaptive_optimization: bool = True,
 ) -> np.ndarray:
     """
-    Compute embeddings using SentenceTransformer with model caching
+    Compute embeddings using SentenceTransformer with model caching and adaptive optimization
+
+    Args:
+        texts: List of texts to compute embeddings for
+        model_name: Model name
+        use_fp16: Whether to use FP16 precision
+        device: Device to use ('auto', 'cuda', 'mps', 'cpu')
+        batch_size: Batch size for processing
+        is_build: Whether this is a build operation (shows progress bar)
+        adaptive_optimization: Whether to use adaptive optimization based on batch size
     """
     # Handle empty input
     if not texts:
@@ -76,28 +95,68 @@ def compute_embeddings_sentence_transformers(
     else:
         device = "cpu"

+    # Apply optimizations based on benchmark results
+    if adaptive_optimization:
+        # Use optimal batch_size constants for different devices based on benchmark results
+        if device == "mps":
+            batch_size = 128  # MPS optimal batch size from benchmark
+            if model_name == "Qwen/Qwen3-Embedding-0.6B":
+                batch_size = 64
+        elif device == "cuda":
+            batch_size = 256  # CUDA optimal batch size
+        # Keep original batch_size for CPU
+
     # Create cache key
-    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}"
+    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}_optimized"

     # Check if model is already cached
     if cache_key in _model_cache:
-        logger.info(f"Using cached model: {model_name}")
+        logger.info(f"Using cached optimized model: {model_name}")
         model = _model_cache[cache_key]
     else:
-        logger.info(f"Loading and caching SentenceTransformer model: {model_name}")
+        logger.info(
+            f"Loading and caching optimized SentenceTransformer model: {model_name}"
+        )
         from sentence_transformers import SentenceTransformer

         logger.info(f"Using device: {device}")

-        # Prepare model and tokenizer optimization parameters
+        # Apply hardware optimizations
+        if device == "cuda":
+            # TODO: Haven't tested this yet
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+            torch.backends.cudnn.benchmark = True
+            torch.backends.cudnn.deterministic = False
+            torch.cuda.set_per_process_memory_fraction(0.9)
+        elif device == "mps":
+            try:
+                if hasattr(torch.mps, "set_per_process_memory_fraction"):
+                    torch.mps.set_per_process_memory_fraction(0.9)
+            except AttributeError:
+                logger.warning(
+                    "Some MPS optimizations not available in this PyTorch version"
+                )
+        elif device == "cpu":
+            # TODO: Haven't tested this yet
+            torch.set_num_threads(min(8, os.cpu_count() or 4))
+            try:
+                torch.backends.mkldnn.enabled = True
+            except AttributeError:
+                pass
+
+        # Prepare optimized model and tokenizer parameters
         model_kwargs = {
             "torch_dtype": torch.float16 if use_fp16 else torch.float32,
             "low_cpu_mem_usage": True,
             "_fast_init": True,
+            "attn_implementation": "eager",  # Use eager attention for speed
         }

         tokenizer_kwargs = {
             "use_fast": True,
+            "padding": True,
+            "truncation": True,
         }

         try:
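
The device string that gates these branches comes from the "auto" resolution earlier in the function, which this hunk does not show. A standard detection sketch consistent with the cuda/mps/cpu branches above (the repo's exact logic may differ):

```python
import torch

def resolve_device(device: str = "auto") -> str:
    # Mirror of the usual auto-detection feeding the branches above.
    if device != "auto":
        return device
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

print(resolve_device())  # e.g. "mps" on Apple Silicon
```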
@@ -128,32 +187,44 @@ def compute_embeddings_sentence_transformers(
             )
             logger.info("Model loaded successfully! (network + optimized)")

-        # Apply additional optimizations (if supported)
+        # Apply additional optimizations based on mode
         if use_fp16 and device in ["cuda", "mps"]:
             try:
                 model = model.half()
-                model = torch.compile(model)
-                logger.info(
-                    f"Using FP16 precision and compile optimization: {model_name}"
-                )
+                logger.info(f"Applied FP16 precision: {model_name}")
             except Exception as e:
-                logger.warning(f"FP16 or compile optimization failed: {e}")
+                logger.warning(f"FP16 optimization failed: {e}")

+        # Apply torch.compile optimization
+        if device in ["cuda", "mps"]:
+            try:
+                model = torch.compile(model, mode="reduce-overhead", dynamic=True)
+                logger.info(f"Applied torch.compile optimization: {model_name}")
+            except Exception as e:
+                logger.warning(f"torch.compile optimization failed: {e}")
+
+        # Set model to eval mode and disable gradients for inference
+        model.eval()
+        for param in model.parameters():
+            param.requires_grad_(False)
+
         # Cache the model
         _model_cache[cache_key] = model
         logger.info(f"Model cached: {cache_key}")

-    # Compute embeddings
-    logger.info("Starting embedding computation...")
+    # Compute embeddings with optimized inference mode
+    logger.info(f"Starting embedding computation... (batch_size: {batch_size})")

-    embeddings = model.encode(
-        texts,
-        batch_size=batch_size,
-        show_progress_bar=is_build,  # Don't show progress bar in server environment
-        convert_to_numpy=True,
-        normalize_embeddings=False,
-        device=device,
-    )
+    # Use torch.inference_mode for optimal performance
+    with torch.inference_mode():
+        embeddings = model.encode(
+            texts,
+            batch_size=batch_size,
+            show_progress_bar=is_build,  # Don't show progress bar in server environment
+            convert_to_numpy=True,
+            normalize_embeddings=False,
+            device=device,
+        )

     logger.info(
         f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
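
torch.inference_mode() is a stricter, cheaper variant of torch.no_grad(): it also disables view tracking and version counters, so tensors created inside it cannot later participate in autograd. A toy comparison:

```python
import torch

x = torch.randn(4, 8)
w = torch.randn(8, 3, requires_grad=True)

with torch.no_grad():
    y1 = x @ w  # grad disabled, but the result is still autograd-aware

with torch.inference_mode():
    y2 = x @ w  # faster: no view tracking or version counters

print(y1.requires_grad, y2.requires_grad)  # False False
# y2 is an "inference tensor"; using it later in a grad-enabled graph raises.
```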
@@ -13,7 +13,7 @@ import psutil
 LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
 logging.basicConfig(
     level=getattr(logging, LOG_LEVEL, logging.INFO),
-    format="%(asctime)s - %(levelname)s - %(message)s",
+    format="%(levelname)s - %(name)s - %(message)s",
 )
 logger = logging.getLogger(__name__)

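
The format change drops the timestamp and adds the logger name, so log lines from the various server modules identify their source. Its effect in isolation (illustrative output):

```python
import logging

logging.basicConfig(
    level=logging.INFO, format="%(levelname)s - %(name)s - %(message)s"
)
logging.getLogger("leann.api").info("searching...")
# Old format: 2025-01-01 12:00:00,000 - INFO - searching...
# New format: INFO - leann.api - searching...
```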
@@ -339,4 +339,10 @@ class EmbeddingServerManager:
             )
             self.server_process.kill()

+        # Clean up process resources to prevent resource tracker warnings
+        try:
+            self.server_process.wait()  # Ensure process is fully cleaned up
+        except Exception:
+            pass
+
         self.server_process = None
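
Calling wait() after kill() reaps the child process, so it does not linger as a zombie and the interpreter's resource tracker does not warn about a leaked process. A stand-alone sketch of the same pattern with subprocess:

```python
import subprocess
import sys

# Start a throwaway child, kill it, then reap it.
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
proc.kill()
try:
    proc.wait()  # collect the exit status; without this the child stays a zombie
except Exception:
    pass
print(f"child exited with {proc.returncode}")
```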