Merge branch 'main' into readme-polish

docs: follow yichuan's suggestion
2025-07-19 21:47:17 -07:00 · 2025-07-19 21:44:31 -07:00 · 2025-07-19 21:21:41 -07:00 · 2025-07-19 21:02:25 -07:00 · 2025-07-19 20:45:50 -07:00 · 2025-07-19 20:42:52 -07:00
69 changed files with 26692 additions and 4959 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -84,6 +84,4 @@ test_*.py
 packages/leann-backend-diskann/third_party/DiskANN/_deps/

 *.meta.json
-*.passages.json
-
-batchtest.py
+*.passages.json
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "packages/leann-backend-diskann/third_party/DiskANN"]
 	path = packages/leann-backend-diskann/third_party/DiskANN
-	url = https://github.com/yichuan-w/DiskANN.git
+	url = https://github.com/yichuan520030910320/DiskANN.git
 [submodule "packages/leann-backend-hnsw/third_party/faiss"]
 	path = packages/leann-backend-hnsw/third_party/faiss
-	url = https://github.com/yichuan-w/faiss.git
+	url = https://github.com/yichuan520030910320/faiss.git
 [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
 	path = packages/leann-backend-hnsw/third_party/msgpack-c
 	url = https://github.com/msgpack/msgpack-c.git
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -0,0 +1,9 @@
+{
+    "recommendations": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "ms-python.python",
+        "ms-vscode.cmake-tools",
+        "vadimcn.vscode-lldb",
+        "eamodio.gitlens",
+    ]
+}
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,283 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        // new embedder
+        {
+            "name": "New Embedder",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "demo/main.py",
+            "console": "integratedTerminal",
+            "args": [
+                "--search",
+                "--use-original",
+                "--domain",
+                "dpr",
+                "--nprobe",
+                "5000",
+                "--load",
+                "flat",
+                "--embedder",
+                "intfloat/multilingual-e5-small"
+            ]
+        },
+        //python /home/ubuntu/Power-RAG/faiss/demo/simple_build.py
+        {
+            "name": "main.py",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "demo/main.py",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "--query",
+                "1000",
+                "--load",
+                "bm25"
+            ]
+        },
+        {
+            "name": "Simple Build",
+            "type": "lldb",
+            "request": "launch",
+            "program": "${workspaceFolder}/.venv/bin/python",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "faiss/demo/simple_build.py"
+            ],
+            "env": {
+                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
+            }
+        },
+        //# Fix for Intel MKL error
+        //export LD_PRELOAD=/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so
+        //python faiss/demo/build_demo.py
+        {
+            "name": "Build Demo",
+            "type": "lldb",
+            "request": "launch",
+            "program": "${workspaceFolder}/.venv/bin/python",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "faiss/demo/build_demo.py"
+            ],
+            "env": {
+                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
+            }
+        },
+        {
+            "name": "DiskANN Serve",
+            "type": "lldb",
+            "request": "launch",
+            "program": "${workspaceFolder}/.venv/bin/python",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "demo/main.py",
+                "--mode",
+                "serve",
+                "--engine",
+                "sglang",
+                "--load-indices",
+                "diskann",
+                "--domain",
+                "rpj_wiki",
+                "--lazy-load",
+                "--recompute-beighbor-embeddings",
+                "--port",
+                "8082",
+                "--diskann-search-memory-maximum",
+                "2",
+                "--diskann-graph",
+                "240",
+                "--search-only"
+            ],
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}/faiss_repo/build/faiss/python:$PYTHONPATH"
+            },
+            "preLaunchTask": "CMake: build",
+        },
+        {
+            "name": "DiskANN Serve MAC",
+            "type": "lldb",
+            "request": "launch",
+            "program": "${workspaceFolder}/.venv/bin/python",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "demo/main.py",
+                "--mode",
+                "serve",
+                "--engine",
+                "ollama",
+                "--load-indices",
+                "diskann",
+                "--domain",
+                "rpj_wiki",
+                "--lazy-load",
+                "--recompute-beighbor-embeddings"
+            ],
+            "preLaunchTask": "CMake: build",
+            "env": {
+                "KMP_DUPLICATE_LIB_OK": "TRUE",
+                "OMP_NUM_THREADS": "1",
+                "MKL_NUM_THREADS": "1",
+                "DYLD_INSERT_LIBRARIES": "/Users/ec2-user/Power-RAG/.venv/lib/python3.10/site-packages/torch/lib/libomp.dylib",
+                "KMP_BLOCKTIME": "0"
+            }
+        },
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "ric/main_ric.py",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "--config-name",
+                "${input:configSelection}"
+            ],
+            "justMyCode": false
+        },
+        //python ./demo/validate_equivalence.py sglang
+        {
+            "name": "Validate Equivalence",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "demo/validate_equivalence.py",
+            "console": "integratedTerminal",
+            "args": [
+                "sglang"
+            ],
+        },
+        //python demo/retrieval_demo.py --engine sglang  --skip-embeddings --domain dpr --load-indices flat ivf_flat
+        {
+            "name": "Retrieval Demo",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "demo/retrieval_demo.py",
+            "console": "integratedTerminal",
+            "args": [
+                "--engine",
+                "vllm",
+                "--skip-embeddings",
+                "--domain",
+                "dpr",
+                "--load-indices",
+                // "flat",
+                "ivf_flat"
+            ],
+        },
+        //python demo/retrieval_demo.py --engine sglang  --skip-embeddings --domain dpr  --load-indices  diskann --hnsw-M 64 --hnsw-efConstruction 150 --hnsw-efSearch 128  --hnsw-sq-bits 8 
+        {
+            "name": "Retrieval Demo DiskANN",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "demo/retrieval_demo.py",
+            "console": "integratedTerminal",
+            "args": [
+                "--engine",
+                "sglang",
+                "--skip-embeddings",
+                "--domain",
+                "dpr",
+                "--load-indices",
+                "diskann",
+                "--hnsw-M",
+                "64",
+                "--hnsw-efConstruction",
+                "150",
+                "--hnsw-efSearch",
+                "128",
+                "--hnsw-sq-bits",
+                "8"
+            ],
+        },
+        {
+            "name": "Find Probe",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "find_probe.py",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+        },
+        {
+            "name": "Python: Attach",
+            "type": "debugpy",
+            "request": "attach",
+            "processId": "${command:pickProcess}",
+            "justMyCode": true
+        },
+        {
+            "name": "Edge RAG",
+            "type": "lldb",
+            "request": "launch",
+            "program": "${workspaceFolder}/.venv/bin/python",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "edgerag_demo.py"
+            ],
+            "env": {
+                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libiomp5.so /lib/x86_64-linux-gnu/libmkl_core.so /lib/x86_64-linux-gnu/libmkl_intel_lp64.so /lib/x86_64-linux-gnu/libmkl_intel_thread.so",
+                "MKL_NUM_THREADS": "1",
+                "OMP_NUM_THREADS": "1",
+            }
+        },
+        {
+            "name": "Launch Embedding Server",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "demo/embedding_server.py",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "--domain",
+                "rpj_wiki",
+                "--zmq-port",
+                "5556",
+            ]
+        },
+        {
+            "name": "HNSW Serve",
+            "type": "lldb",
+            "request": "launch",
+            "program": "${workspaceFolder}/.venv/bin/python",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "args": [
+                "demo/main.py",
+                "--domain",
+                "rpj_wiki",
+                "--load",
+                "hnsw",
+                "--mode",
+                "serve",
+                "--search",
+                "--skip-pa",
+                "--recompute",
+                "--hnsw-old"
+            ],
+            "env": {
+                "LD_PRELOAD": "/lib/x86_64-linux-gnu/libmkl_core.so:/lib/x86_64-linux-gnu/libmkl_intel_thread.so:/lib/x86_64-linux-gnu/libmkl_intel_lp64.so:/lib/x86_64-linux-gnu/libiomp5.so"
+            }
+        },
+    ],
+    "inputs": [
+        {
+            "id": "configSelection",
+            "type": "pickString",
+            "description": "Select a configuration",
+            "options": [
+                "example_config",
+                "vllm_gritlm"
+            ],
+            "default": "example_config"
+        }
+    ],
+}
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,43 @@
+{
+    "python.analysis.extraPaths": [
+        "./sglang_repo/python"
+    ],
+    "cmake.sourceDirectory": "${workspaceFolder}/DiskANN",
+    "cmake.configureArgs": [
+        "-DPYBIND=True",
+        "-DUPDATE_EDITABLE_INSTALL=ON",
+    ],
+    "cmake.environment": {
+        "PATH": "/Users/ec2-user/Power-RAG/.venv/bin:${env:PATH}"
+    },
+    "cmake.buildDirectory": "${workspaceFolder}/build",
+    "files.associations": {
+        "*.tcc": "cpp",
+        "deque": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "map": "cpp",
+        "unordered_set": "cpp",
+        "atomic": "cpp",
+        "inplace_vector": "cpp",
+        "*.ipp": "cpp",
+        "forward_list": "cpp",
+        "list": "cpp",
+        "any": "cpp",
+        "system_error": "cpp",
+        "__hash_table": "cpp",
+        "__split_buffer": "cpp",
+        "__tree": "cpp",
+        "ios": "cpp",
+        "set": "cpp",
+        "__string": "cpp",
+        "string_view": "cpp",
+        "ranges": "cpp",
+        "iosfwd": "cpp"
+    },
+    "lldb.displayFormat": "auto",
+    "lldb.showDisassembly": "auto",
+    "lldb.dereferencePointers": true,
+    "lldb.consoleMode": "commands",
+}
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -0,0 +1,16 @@
+{
+	"version": "2.0.0",
+	"tasks": [
+		{
+			"type": "cmake",
+			"label": "CMake: build",
+			"command": "build",
+			"targets": [
+				"all"
+			],
+			"group": "build",
+			"problemMatcher": [],
+			"detail": "CMake template build task"
+		}
+	]
+}
--- a/README.md
+++ b/README.md
@@ -12,75 +12,62 @@
    The smallest vector index in the world. RAG Everything with LEANN!
 </h2>

-LEANN is a revolutionary vector database that democratizes personal AI. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using **[97% less storage]** than traditional solutions **without accuracy loss**.
-
-LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Illustration →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)
-
-**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can search your **[file system](#process-any-documents-pdf-txt-md)**, **[emails](#search-your-entire-life)**, **[browser history](#time-machine-for-the-web)**, **[chat history](#wechat-detective)**, or external knowledge bases (i.e., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.
+LEANN is a revolutionary vector database that makes personal AI accessible to everyone. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using **97% less storage** than traditional solutions **without accuracy loss**.

+RAG your **[emails](#-search-your-entire-life)**, **[browser history](#-time-machine-for-the-web)**, **[WeChat](#-wechat-detective)**, or 60M documents on your laptop, at nearly zero cost. No cloud, no API keys, completely private.

+LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Read more →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)

 ## Why LEANN?

 <p align="center">
-  <img src="assets/effects.png" alt="LEANN vs Traditional Vector DB Storage Comparison" width="70%">
+  <img src="assets/effects.png" alt="LEANN vs Traditional Vector DB Storage Comparison" width="100%">
 </p>

-> **The numbers speak for themselves:** Index 60 million Wikipedia chunks in just 6GB instead of 201GB. From emails to browser history, everything fits on your laptop. [See detailed benchmarks for different applications below ↓](#storage-usage-comparison)
+**The numbers speak for themselves:** Index 60 million Wikipedia articles in just 6GB instead of 201GB. From emails to browser history, everything fits on your laptop. [See detailed benchmarks below ↓](#storage-usage-comparison)

+## Why This Matters

 🔒 **Privacy:** Your data never leaves your laptop. No OpenAI, no cloud, no "terms of service".

-🪶 **Lightweight:** Graph-based recomputation eliminates heavy embedding storage, while smart graph pruning and CSR format minimize graph storage overhead. Always less storage, less memory usage!
+🪶 **Lightweight:** Smart graph pruning means less storage, less memory usage, better performance on your existing hardware.

-📈 **Scalability:** Handle messy personal data that would crash traditional vector DBs, easily managing your growing personalized data and agent generated memory!
+📈 **Scalability:** Organize your messy personal data that would crash traditional vector DBs, with performance that gets better as your data grows more personalized.

 ✨ **No Accuracy Loss:** Maintain the same search quality as heavyweight solutions while using 97% less storage.

 ## Quick Start in 1 minute

 ```bash
-git clone git@github.com:yichuan-w/LEANN.git leann
+git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann
 cd leann
 git submodule update --init --recursive
 ```

 **macOS:**
 ```bash
-brew install llvm libomp boost protobuf zeromq
+brew install llvm libomp boost protobuf
 export CC=$(brew --prefix llvm)/bin/clang
 export CXX=$(brew --prefix llvm)/bin/clang++
-
-# Install with HNSW backend (default, recommended for most users)
 uv sync
-
-# Or add DiskANN backend if you want to test more options
-uv sync --extra diskann
 ```

 **Linux (Ubuntu/Debian):**
 ```bash
-sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
-
-# Install with HNSW backend (default, recommended for most users)
+sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev
 uv sync
-
-# Or add DiskANN backend if you want to test more options
-uv sync --extra diskann
 ```

+**Ollama Setup (Optional for Local LLM):**

-
-**Ollama Setup (Recommended for full privacy):**
-
-> *You can skip this installation if you only want to use OpenAI API for generation.*
-
+*We support both hf-transformers and Ollama for local LLMs. Ollama is recommended for faster performance.*

 *macOS:*

-First, [download Ollama for macOS](https://ollama.com/download/mac).
-
 ```bash
+# Install Ollama
+brew install ollama
+
 # Pull a lightweight model (recommended for consumer hardware)
 ollama pull llama3.2:1b
 ```
@@ -97,37 +84,28 @@ ollama serve &
 ollama pull llama3.2:1b
 ```

+You can also replace `llama3.2:1b` with `deepseek-r1:1.5b` or `qwen3:4b` for better performance but higher memory usage.
+
 ## Dead Simple API

 Just 3 lines of code. Our declarative API makes RAG as easy as writing a config file:

 ```python
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
+from leann.api import LeannBuilder, LeannSearcher

-# 1. Build the index (no embeddings stored!)
+# 1. Build index (no embeddings stored!)
 builder = LeannBuilder(backend_name="hnsw")
 builder.add_text("C# is a powerful programming language")
-builder.add_text("Python is a powerful programming language and it is very popular")
-builder.add_text("Machine learning transforms industries")
+builder.add_text("Python is a powerful programming language")
+builder.add_text("Machine learning transforms industries")  
 builder.add_text("Neural networks process complex data")
-builder.add_text("Leann is a great storage saving engine for RAG on your MacBook")
+builder.add_text("Leann is a great storage saving engine for RAG on your macbook")
 builder.build_index("knowledge.leann")

 # 2. Search with real-time embeddings
 searcher = LeannSearcher("knowledge.leann")
-results = searcher.search("programming languages", top_k=2)
-
-# 3. Chat with LEANN using retrieved results
-llm_config = {
-    "type": "ollama",
-    "model": "llama3.2:1b"
-}
-
-chat = LeannChat(index_path="knowledge.leann", llm_config=llm_config)
-response = chat.ask(
-    "Compare the two retrieved programming languages and say which one is more popular today.",
-    top_k=2,
-)
+results = searcher.search("C++ programming languages", top_k=2, recompute_beighbor_embeddings=True)
+print(results)
 ```

 **That's it.** No cloud setup, no API keys, no "fine-tuning". Just your data, your questions, your laptop.
@@ -138,28 +116,28 @@ response = chat.ask(

 LEANN supports RAGing a lot of data sources, like .pdf, .txt, .md, and also supports RAGing your WeChat, Google Search History, and more.

-### Process Any Documents (.pdf, .txt, .md)
+### 📚 Process Any Documents (.pdf, .txt, .md)

-Above we showed the Python API, while this CLI script demonstrates the same concepts while directly processing PDFs and documents, and even any directory that stores your personal files!
-
-The following scripts use Ollama `qwen3:8b` by default, so you need `ollama pull qwen3:8b` first. For other models: `--llm openai --model gpt-4o` (requires `OPENAI_API_KEY` environment variable) or `--llm hf --model Qwen/Qwen3-4B`.
+Above we showed the Python API; this CLI script demonstrates the same concepts by directly processing PDFs and documents.

 ```bash
-# Drop your PDFs, .txt, .md files into apps/documents/data/
-python -m apps.documents
+# Drop your PDFs, .txt, .md files into examples/data/
+uv run ./examples/main_cli_example.py

-# Or with uv
-uv run python -m apps.documents
+# Or use python directly
+source .venv/bin/activate
+python ./examples/main_cli_example.py
 ```

-
+Uses Ollama `qwen3:8b` by default. For other models: `--llm openai --model gpt-4o` (requires `OPENAI_API_KEY` environment variable) or `--llm hf --model Qwen/Qwen3-4B`.

 **Works with any text format** - research papers, personal notes, presentations. Built with LlamaIndex for document parsing.

-### Search Your Entire Life
+### 🕵️ Search Your Entire Life
 ```bash
-python -m apps.email
-# "What's the number of class recommend to take per semester for incoming EECS students?"
+python examples/mail_reader_leann.py
+# "What did my boss say about the Christmas party last year?"
+# "Find all emails from my mom about birthday plans"
 ```
 **90K emails → 14MB.** Finally, search your email like you search Google.

@@ -168,19 +146,19 @@ python -m apps.email

 ```bash
 # Use default mail path (works for most macOS setups)
-python -m apps.email
+python examples/mail_reader_leann.py

 # Run with custom index directory
-python -m apps.email --index-dir "./my_mail_index"
+python examples/mail_reader_leann.py --index-dir "./my_mail_index"

 # Process all emails (may take time but indexes everything)
-python -m apps.email --max-emails -1
+python examples/mail_reader_leann.py --max-emails -1

 # Limit number of emails processed (useful for testing)
-python -m apps.email --max-emails 1000
+python examples/mail_reader_leann.py --max-emails 1000

 # Run a single query
-python -m apps.email --query "What did my boss say about deadlines?"
+python examples/mail_reader_leann.py --query "What did my boss say about deadlines?"
 ```

 </details>
@@ -194,10 +172,11 @@ Once the index is built, you can ask questions like:
 - "Show me emails about travel expenses"
 </details>

-### Time Machine for the Web  
+### 🌐 Time Machine for the Web  
 ```bash
-python -m apps.browser
-# "Tell me my browser history about machine learning system stuff?"
+python examples/google_history_reader_leann.py
+# "What was that AI paper I read last month?"
+# "Show me all the cooking videos I watched"
 ```
 **38K browser entries → 6MB.** Your browser history becomes your personal search engine.

@@ -206,16 +185,16 @@ python -m apps.browser

 ```bash
 # Use default Chrome profile (auto-finds all profiles)
-python -m apps.browser
+python examples/google_history_reader_leann.py

 # Run with custom index directory
-python -m apps.browser --index-dir "./my_chrome_index"
+python examples/google_history_reader_leann.py --index-dir "./my_chrome_index"

 # Limit number of history entries processed (useful for testing)
-python -m apps.browser --max-entries 500
+python examples/google_history_reader_leann.py --max-entries 500

 # Run a single query
-python -m apps.browser --query "What websites did I visit about machine learning?"
+python examples/google_history_reader_leann.py --query "What websites did I visit about machine learning?"
 ```

 </details>
@@ -248,10 +227,10 @@ Once the index is built, you can ask questions like:

 </details>

-### WeChat Detective
+### 💬 WeChat Detective

 ```bash
-python -m apps.wechat
+python examples/wechat_history_reader_leann.py
 # "Show me all group chats about weekend plans"
 ```
 **400K messages → 64MB.** Search years of chat history in any language.
@@ -273,19 +252,19 @@ sudo packages/wechat-exporter/wechattweak-cli install

 ```bash
 # Use default settings (recommended for first run)
-python -m apps.wechat
+python examples/wechat_history_reader_leann.py

 # Run with custom export directory and wehn we run the first time, LEANN will export all chat history automatically for you
-python -m apps.wechat --export-dir "./my_wechat_exports"
+python examples/wechat_history_reader_leann.py --export-dir "./my_wechat_exports"

 # Run with custom index directory
-python -m apps.wechat --index-dir "./my_wechat_index"
+python examples/wechat_history_reader_leann.py --index-dir "./my_wechat_index"

 # Limit number of chat entries processed (useful for testing)
-python -m apps.wechat --max-entries 1000
+python examples/wechat_history_reader_leann.py --max-entries 1000

 # Run a single query
-python -m apps.wechat --query "Show me conversations about travel plans"
+python examples/wechat_history_reader_leann.py --query "Show me conversations about travel plans"
 ```

 </details>
@@ -300,73 +279,6 @@ Once the index is built, you can ask questions like:
 </details>


-
-## 🖥️ Command Line Interface
-
-LEANN includes a powerful CLI for document processing and search. Perfect for quick document indexing and interactive chat.
-
-```bash
-# Build an index from documents
-leann build my-docs --docs ./documents
-
-# Search your documents  
-leann search my-docs "machine learning concepts"
-
-# Interactive chat with your documents
-leann ask my-docs --interactive
-
-# List all your indexes
-leann list
-```
-
-**Key CLI features:**
- Auto-detects document formats (PDF, TXT, MD, DOCX)
- Smart text chunking with overlap
- Multiple LLM providers (Ollama, OpenAI, HuggingFace)
- Organized index storage in `~/.leann/indexes/`
- Support for advanced search parameters
-
-<details>
-<summary><strong>📋 Click to expand: Complete CLI Reference</strong></summary>
-
-**Build Command:**
-```bash
-leann build INDEX_NAME --docs DIRECTORY [OPTIONS]
-
-Options:
-  --backend {hnsw,diskann}     Backend to use (default: hnsw)
-  --embedding-model MODEL      Embedding model (default: facebook/contriever)
-  --graph-degree N            Graph degree (default: 32)
-  --complexity N              Build complexity (default: 64)
-  --force                     Force rebuild existing index
-  --compact                   Use compact storage (default: true)
-  --recompute                 Enable recomputation (default: true)
-```
-
-**Search Command:**
-```bash
-leann search INDEX_NAME QUERY [OPTIONS]
-
-Options:
-  --top-k N                   Number of results (default: 5)
-  --complexity N              Search complexity (default: 64)
-  --recompute-embeddings      Use recomputation for highest accuracy
-  --pruning-strategy {global,local,proportional}
-```
-
-**Ask Command:**
-```bash
-leann ask INDEX_NAME [OPTIONS]
-
-Options:
-  --llm {ollama,openai,hf}    LLM provider (default: ollama)
-  --model MODEL               Model name (default: qwen3:8b)
-  --interactive              Interactive chat mode
-  --top-k N                  Retrieval count (default: 20)
-```
-
-</details>
-
 ## 🏗️ Architecture & How It Works

 <p align="center">
@@ -387,7 +299,7 @@ Options:

 Run the comparison yourself:
 ```bash
-python -m apps.benchmarks
+python examples/compare_faiss_vs_leann.py
 ```

 | System | Storage | 
@@ -398,7 +310,15 @@ python -m apps.benchmarks

 Same dataset, same hardware, same embedding model. LEANN just works better.

+## Reproduce Our Results

+```bash
+uv pip install -e ".[dev]"  # Install dev dependencies
+python examples/run_evaluation.py data/indices/dpr/dpr_diskann      # DPR dataset
+python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index  # Wikipedia
+```
+
+The evaluation script downloads data automatically on first run.

 ### Storage Usage Comparison

@@ -425,15 +345,6 @@ Same dataset, same hardware, same embedding model. LEANN just works better.

 *Benchmarks run on Apple M3 Pro 36 GB*

-## Reproduce Our Results
-
-```bash
-uv pip install -e ".[dev]"  # Install dev dependencies
-python -m apps.evaluation data/indices/dpr/dpr_diskann      # DPR dataset
-python -m apps.evaluation data/indices/rpj_wiki/rpj_wiki.index  # Wikipedia
-```
-
-The evaluation script downloads data automatically on first run. The last three results were tested with partial personal data, and you can reproduce them with your own data!
 ## 🔬 Paper

 If you find Leann useful, please cite:
@@ -510,17 +421,6 @@ export NCCL_IB_DISABLE=1
 export NCCL_NET_PLUGIN=none
 export NCCL_SOCKET_IFNAME=ens5
 ``` -->
-## FAQ
-
-### 1. My building time seems long
-
-You can speed up the process by using a lightweight embedding model. Add this to your arguments:
-
-```bash
--embedding-model sentence-transformers/all-MiniLM-L6-v2
-```
-**Model sizes:** `all-MiniLM-L6-v2` (30M parameters), `facebook/contriever` (~100M parameters), `Qwen3-0.6B` (600M parameters)
-

 ## 📈 Roadmap

--- a/apps/init.py
+++ b/apps/init.py
--- a/apps/benchmarks/init.py
+++ b/apps/benchmarks/init.py
--- a/apps/benchmarks/main.py
+++ b/apps/benchmarks/main.py
@@ -1,338 +0,0 @@
-#!/usr/bin/env python3
-"""
-Memory comparison between Faiss HNSW and LEANN HNSW backend
-"""
-
-import logging
-import os
-import sys
-import time
-import psutil
-import gc
-import subprocess
-from pathlib import Path
-from llama_index.core.node_parser import SentenceSplitter
-
-# Setup logging
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def get_memory_usage():
-    """Get current memory usage in MB"""
-    process = psutil.Process()
-    return process.memory_info().rss / 1024 / 1024
-
-
-def print_memory_stats(stage: str, start_mem: float):
-    """Print memory statistics"""
-    current_mem = get_memory_usage()
-    diff = current_mem - start_mem
-    print(f"[{stage}] Memory: {current_mem:.1f} MB (+{diff:.1f} MB)")
-    return current_mem
-
-
-class MemoryTracker:
-    def __init__(self, name: str):
-        self.name = name
-        self.start_mem = get_memory_usage()
-        self.stages = []
-
-    def checkpoint(self, stage: str):
-        current_mem = print_memory_stats(f"{self.name} - {stage}", self.start_mem)
-        self.stages.append((stage, current_mem))
-        return current_mem
-
-    def summary(self):
-        print(f"\n=== {self.name} Memory Summary ===")
-        for stage, mem in self.stages:
-            print(f"{stage}: {mem:.1f} MB")
-        peak_mem = max(mem for _, mem in self.stages)
-        print(f"Peak Memory: {peak_mem:.1f} MB")
-        print(f"Total Memory Increase: {peak_mem - self.start_mem:.1f} MB")
-        return peak_mem
-
-
-def test_faiss_hnsw():
-    """Test Faiss HNSW Vector Store in subprocess"""
-    print("\n" + "=" * 50)
-    print("TESTING FAISS HNSW VECTOR STORE")
-    print("=" * 50)
-
-    try:
-        # Get the directory of this script
-        script_dir = Path(__file__).parent
-        faiss_script = script_dir / "faiss_only.py"
-        result = subprocess.run(
-            [sys.executable, str(faiss_script)],
-            capture_output=True,
-            text=True,
-            timeout=300,
-        )
-
-        print(result.stdout)
-        if result.stderr:
-            print("Stderr:", result.stderr)
-
-        if result.returncode != 0:
-            return {
-                "peak_memory": float("inf"),
-                "error": f"Process failed with code {result.returncode}",
-            }
-
-        # Parse peak memory from output
-        lines = result.stdout.split("\n")
-        peak_memory = 0.0
-
-        for line in lines:
-            if "Peak Memory:" in line:
-                peak_memory = float(
-                    line.split("Peak Memory:")[1].split("MB")[0].strip()
-                )
-
-        return {"peak_memory": peak_memory}
-
-    except Exception as e:
-        return {
-            "peak_memory": float("inf"),
-            "error": str(e),
-        }
-
-
-def test_leann_hnsw():
-    """Test LEANN HNSW Search Memory (load existing index)"""
-    print("\n" + "=" * 50)
-    print("TESTING LEANN HNSW SEARCH MEMORY")
-    print("=" * 50)
-
-    tracker = MemoryTracker("LEANN HNSW Search")
-
-    # Import and setup
-    tracker.checkpoint("Initial")
-
-    from leann.api import LeannSearcher
-
-    tracker.checkpoint("After imports")
-
-    from llama_index.core import SimpleDirectoryReader
-    from leann.api import LeannBuilder, LeannSearcher
-
-
-    # Load and parse documents
-    documents = SimpleDirectoryReader(
-        "../documents/data",
-        recursive=True,
-        encoding="utf-8",
-        required_exts=[".pdf", ".txt", ".md"],
-    ).load_data()
-
-    tracker.checkpoint("After document loading")
-
-    # Parse into chunks
-    node_parser = SentenceSplitter(
-        chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
-    )
-
-    all_texts = []
-    for doc in documents:
-        nodes = node_parser.get_nodes_from_documents([doc])
-        for node in nodes:
-            all_texts.append(node.get_content())
-
-    tracker.checkpoint("After text chunking")
-
-    # Build LEANN index
-    INDEX_DIR = Path("./test_leann_comparison")
-    INDEX_PATH = str(INDEX_DIR / "comparison.leann")
-
-    # Check if index already exists
-    if os.path.exists(INDEX_PATH + ".meta.json"):
-        print("Loading existing LEANN HNSW index...")
-        tracker.checkpoint("After loading existing index")
-    else:
-        print("Building new LEANN HNSW index...")
-        # Clean up previous index
-        import shutil
-
-        if INDEX_DIR.exists():
-            shutil.rmtree(INDEX_DIR)
-
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model="facebook/contriever",
-            graph_degree=32,
-            complexity=64,
-            is_compact=True,
-            is_recompute=True,
-            num_threads=1,
-        )
-
-        tracker.checkpoint("After builder setup")
-
-        print("Building LEANN HNSW index...")
-
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
-
-        builder.build_index(INDEX_PATH)
-        del builder
-        gc.collect()
-
-        tracker.checkpoint("After index building")
-
-    # Find existing LEANN index
-    index_paths = [
-        "./test_leann_comparison/comparison.leann",
-    ]
-    index_path = None
-    for path in index_paths:
-        if os.path.exists(path + ".meta.json"):
-            index_path = path
-            break
-
-    if not index_path:
-        print("❌ LEANN index not found. Please build it first")
-        return {"peak_memory": float("inf"), "error": "Index not found"}
-
-    # Measure runtime memory overhead
-    print("\nMeasuring runtime memory overhead...")
-    runtime_start_mem = get_memory_usage()
-    print(f"Before load memory: {runtime_start_mem:.1f} MB")
-    tracker.checkpoint("Before load memory")
-    
-    # Load searcher
-    searcher = LeannSearcher(index_path)
-    tracker.checkpoint("After searcher loading")
-
-
-
-    print("Running search queries...")
-    queries = [
-        "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面，任务令一般在什么城市颁发",
-        "What is LEANN and how does it work?",
-        "华为诺亚方舟实验室的主要研究内容",
-    ]
-
-    for i, query in enumerate(queries):
-        start_time = time.time()
-        # Use same parameters as Faiss: top_k=20, ef=120 (complexity parameter)
-        _ = searcher.search(query, top_k=20, ef=120)
-        query_time = time.time() - start_time
-        print(f"Query {i + 1} time: {query_time:.3f}s")
-        tracker.checkpoint(f"After query {i + 1}")
-
-    runtime_end_mem = get_memory_usage()
-    runtime_overhead = runtime_end_mem - runtime_start_mem
-
-    peak_memory = tracker.summary()
-    print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
-
-    # Get storage size before cleanup
-    storage_size = 0
-    INDEX_DIR = Path(index_path).parent
-    if INDEX_DIR.exists():
-        total_size = 0
-        for dirpath, _, filenames in os.walk(str(INDEX_DIR)):
-            for filename in filenames:
-                # Only count actual index files, skip text data and backups
-                if filename.endswith((".old", ".tmp", ".bak", ".jsonl", ".json")):
-                    continue
-                # Count .index, .idx, .map files (actual index structures)
-                if filename.endswith((".index", ".idx", ".map")):
-                    filepath = os.path.join(dirpath, filename)
-                    total_size += os.path.getsize(filepath)
-        storage_size = total_size / (1024 * 1024)  # Convert to MB
-
-    # Clean up
-    del searcher
-    gc.collect()
-
-    return {
-        "peak_memory": peak_memory,
-        "storage_size": storage_size,
-    }
-
-
-def main():
-    """Run comparison tests"""
-    print("Storage + Search Memory Comparison: Faiss HNSW vs LEANN HNSW")
-    print("=" * 60)
-
-    # Test Faiss HNSW
-    faiss_results = test_faiss_hnsw()
-
-    # Force garbage collection
-    gc.collect()
-    time.sleep(2)
-
-    # Test LEANN HNSW
-    leann_results = test_leann_hnsw()
-
-    # Final comparison
-    print("\n" + "=" * 60)
-    print("STORAGE + SEARCH MEMORY COMPARISON")
-    print("=" * 60)
-
-    # Get storage sizes
-    faiss_storage_size = 0
-    leann_storage_size = leann_results.get("storage_size", 0)
-
-    # Get Faiss storage size using Python
-    if os.path.exists("./storage_faiss"):
-        total_size = 0
-        for dirpath, _, filenames in os.walk("./storage_faiss"):
-            for filename in filenames:
-                filepath = os.path.join(dirpath, filename)
-                total_size += os.path.getsize(filepath)
-        faiss_storage_size = total_size / (1024 * 1024)  # Convert to MB
-
-    print("Faiss HNSW:")
-    if "error" in faiss_results:
-        print(f"  ❌ Failed: {faiss_results['error']}")
-    else:
-        print(f"  Search Memory: {faiss_results['peak_memory']:.1f} MB")
-        print(f"  Storage Size: {faiss_storage_size:.1f} MB")
-
-    print("\nLEANN HNSW:")
-    if "error" in leann_results:
-        print(f"  ❌ Failed: {leann_results['error']}")
-    else:
-        print(f"  Search Memory: {leann_results['peak_memory']:.1f} MB")
-        print(f"  Storage Size: {leann_storage_size:.1f} MB")
-
-    # Calculate improvements only if both tests succeeded
-    if "error" not in faiss_results and "error" not in leann_results:
-        memory_ratio = faiss_results["peak_memory"] / leann_results["peak_memory"]
-
-        print("\nLEANN vs Faiss Performance:")
-        memory_saving = faiss_results["peak_memory"] - leann_results["peak_memory"]
-        print(
-            f"  Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)"
-        )
-
-        # Storage comparison
-        if leann_storage_size > faiss_storage_size:
-            storage_ratio = leann_storage_size / faiss_storage_size
-            print(
-                f"  Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)"
-            )
-        elif faiss_storage_size > leann_storage_size:
-            storage_ratio = faiss_storage_size / leann_storage_size
-            print(
-                f"  Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)"
-            )
-        else:
-            print("  Storage Size: similar")
-    else:
-        if "error" not in leann_results:
-            print("\n✅ LEANN HNSW completed successfully!")
-            print(f"📊 Search Memory: {leann_results['peak_memory']:.1f} MB")
-            print(f"📊 Storage Size: {leann_storage_size:.1f} MB")
-        if "error" not in faiss_results:
-            print("\n✅ Faiss HNSW completed successfully!")
-            print(f"📊 Search Memory: {faiss_results['peak_memory']:.1f} MB")
-            print(f"📊 Storage Size: {faiss_storage_size:.1f} MB")
-
-
-if __name__ == "__main__":
-    main()
--- a/apps/benchmarks/faiss_only.py
+++ b/apps/benchmarks/faiss_only.py
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-"""Test only Faiss HNSW"""
-
-import sys
-import time
-import psutil
-import gc
-import os
-
-
-def get_memory_usage():
-    process = psutil.Process()
-    return process.memory_info().rss / 1024 / 1024
-
-
-class MemoryTracker:
-    def __init__(self, name: str):
-        self.name = name
-        self.start_mem = get_memory_usage()
-        self.stages = []
-
-    def checkpoint(self, stage: str):
-        current_mem = get_memory_usage()
-        diff = current_mem - self.start_mem
-        print(f"[{self.name} - {stage}] Memory: {current_mem:.1f} MB (+{diff:.1f} MB)")
-        self.stages.append((stage, current_mem))
-        return current_mem
-
-    def summary(self):
-        peak_mem = max(mem for _, mem in self.stages)
-        print(f"Peak Memory: {peak_mem:.1f} MB")
-        return peak_mem
-
-
-def main():
-    try:
-        import faiss
-    except ImportError:
-        print("Faiss is not installed.")
-        print("Please install it with `uv pip install faiss-cpu`")
-        sys.exit(1)
-
-    from llama_index.core import (
-        SimpleDirectoryReader,
-        VectorStoreIndex,
-        StorageContext,
-        Settings,
-        node_parser,
-        Document,
-    )
-    from llama_index.core.node_parser import SentenceSplitter
-    from llama_index.vector_stores.faiss import FaissVectorStore
-    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-
-    tracker = MemoryTracker("Faiss HNSW")
-    tracker.checkpoint("Initial")
-
-    embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
-    Settings.embed_model = embed_model
-    tracker.checkpoint("After embedding model setup")
-
-    d = 768
-    faiss_index = faiss.IndexHNSWFlat(d, 32)
-    faiss_index.hnsw.efConstruction = 64
-    tracker.checkpoint("After Faiss index creation")
-
-    documents = SimpleDirectoryReader(
-        "../documents/data",
-        recursive=True,
-        encoding="utf-8",
-        required_exts=[".pdf", ".txt", ".md"],
-    ).load_data()
-    tracker.checkpoint("After document loading")
-
-    # Parse into chunks using the same splitter as LEANN
-    node_parser = SentenceSplitter(
-        chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
-    )
-
-    tracker.checkpoint("After text splitter setup")
-
-    # Check if index already exists and try to load it
-    index_loaded = False
-    if os.path.exists("./storage_faiss"):
-        print("Loading existing Faiss HNSW index...")
-        try:
-            # Use the correct Faiss loading pattern from the example
-            vector_store = FaissVectorStore.from_persist_dir("./storage_faiss")
-            storage_context = StorageContext.from_defaults(
-                vector_store=vector_store, persist_dir="./storage_faiss"
-            )
-            from llama_index.core import load_index_from_storage
-            index = load_index_from_storage(storage_context=storage_context)
-            print(f"Index loaded from ./storage_faiss")
-            tracker.checkpoint("After loading existing index")
-            index_loaded = True
-        except Exception as e:
-            print(f"Failed to load existing index: {e}")
-            print("Cleaning up corrupted index and building new one...")
-            # Clean up corrupted index
-            import shutil
-            if os.path.exists("./storage_faiss"):
-                shutil.rmtree("./storage_faiss")
-    
-    if not index_loaded:
-        print("Building new Faiss HNSW index...")
-        
-        # Use the correct Faiss building pattern from the example
-        vector_store = FaissVectorStore(faiss_index=faiss_index)
-        storage_context = StorageContext.from_defaults(vector_store=vector_store)
-        index = VectorStoreIndex.from_documents(
-            documents, 
-            storage_context=storage_context,
-            transformations=[node_parser]
-        )
-        tracker.checkpoint("After index building")
-
-        # Save index to disk using the correct pattern
-        index.storage_context.persist(persist_dir="./storage_faiss")
-        tracker.checkpoint("After index saving")
-
-    # Measure runtime memory overhead
-    print("\nMeasuring runtime memory overhead...")
-    runtime_start_mem = get_memory_usage()
-    print(f"Before load memory: {runtime_start_mem:.1f} MB")
-    tracker.checkpoint("Before load memory")
-    
-    query_engine = index.as_query_engine(similarity_top_k=20)
-    queries = [
-        "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面，任务令一般在什么城市颁发",
-        "What is LEANN and how does it work?",
-        "华为诺亚方舟实验室的主要研究内容",
-    ]
-
-    for i, query in enumerate(queries):
-        start_time = time.time()
-        _ = query_engine.query(query)
-        query_time = time.time() - start_time
-        print(f"Query {i + 1} time: {query_time:.3f}s")
-        tracker.checkpoint(f"After query {i + 1}")
-
-    runtime_end_mem = get_memory_usage()
-    runtime_overhead = runtime_end_mem - runtime_start_mem
-    
-    peak_memory = tracker.summary()
-    print(f"Peak Memory: {peak_memory:.1f} MB")
-    print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
-
-
-if __name__ == "__main__":
-    main()
--- a/apps/browser/__init__.py
+++ b/apps/browser/__init__.py
--- a/apps/browser/main.py
+++ b/apps/browser/main.py
@@ -1,201 +0,0 @@
-import os
-import asyncio
-import argparse
-try:
-    import dotenv
-    dotenv.load_dotenv()
-except ModuleNotFoundError:
-    # python-dotenv is not installed; skip loading environment variables
-    dotenv = None
-from pathlib import Path
-from typing import List, Any
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
-from llama_index.core.node_parser import SentenceSplitter
-
-# Default Chrome profile path
-DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
-
-def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1):
-    """
-    Create LEANN index from multiple Chrome profile data sources.
-    
-    Args:
-        profile_dirs: List of Path objects pointing to Chrome profile directories
-        index_path: Path to save the LEANN index
-        max_count: Maximum number of history entries to process per profile
-    """
-    print("Creating LEANN index from multiple Chrome profile data sources...")
-    
-    # Load documents using ChromeHistoryReader from local readers module
-    from .readers import ChromeHistoryReader
-    reader = ChromeHistoryReader()
-    
-    INDEX_DIR = Path(index_path).parent
-    
-    if not INDEX_DIR.exists():
-        print(f"--- Index directory not found, building new index ---")
-        all_documents = []
-        total_processed = 0
-        
-        # Process each Chrome profile directory
-        for i, profile_dir in enumerate(profile_dirs):
-            print(f"\nProcessing Chrome profile {i+1}/{len(profile_dirs)}: {profile_dir}")
-            
-            try:
-                documents = reader.load_data(
-                    chrome_profile_path=str(profile_dir),
-                    max_count=max_count
-                )
-                if documents:
-                    print(f"Loaded {len(documents)} history documents from {profile_dir}")
-                    all_documents.extend(documents)
-                    total_processed += len(documents)
-                    
-                    # Check if we've reached the max count
-                    if max_count > 0 and total_processed >= max_count:
-                        print(f"Reached max count of {max_count} documents")
-                        break
-                else:
-                    print(f"No documents loaded from {profile_dir}")
-            except Exception as e:
-                print(f"Error processing {profile_dir}: {e}")
-                continue
-        
-        if not all_documents:
-            print("No documents loaded from any source. Exiting.")
-            return None
-        
-        print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles")
-        
-        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
-        
-        # Convert Documents to text strings and chunk them
-        all_texts = []
-        for doc in all_documents:
-            # Split the document into chunks
-            nodes = text_splitter.get_nodes_from_documents([doc])
-            for node in nodes:
-                all_texts.append(node.get_content())
-        
-        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
-        
-        # Create LEANN index directory
-        print(f"--- Index directory not found, building new index ---")
-        INDEX_DIR.mkdir(exist_ok=True)
-
-        print(f"--- Building new LEANN index ---")
-        
-        print(f"\n[PHASE 1] Building Leann index...")
-
-        # Use HNSW backend for better macOS compatibility
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model="facebook/contriever",
-            graph_degree=32, 
-            complexity=64,
-            is_compact=True,
-            is_recompute=True,
-            num_threads=1  # Force single-threaded mode
-        )
-
-        print(f"Adding {len(all_texts)} history chunks to index...")
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
-            
-        builder.build_index(index_path)
-        print(f"\nLEANN index built at {index_path}!")
-    else:
-        print(f"--- Using existing index at {INDEX_DIR} ---")
-    
-    return index_path
-
-async def query_leann_index(index_path: str, query: str):
-    """
-    Query the LEANN index.
-    
-    Args:
-        index_path: Path to the LEANN index
-        query: The query string
-    """
-    print(f"\n[PHASE 2] Starting Leann chat session...")
-    chat = LeannChat(index_path=index_path)
-    
-    print(f"You: {query}")
-    chat_response = chat.ask(
-        query, 
-        top_k=10, 
-        recompute_beighbor_embeddings=True,
-        complexity=32,
-        beam_width=1,
-        llm_config={
-            "type": "openai",
-            "model": "gpt-4o",
-            "api_key": os.getenv("OPENAI_API_KEY"),
-        },
-        llm_kwargs={
-            "temperature": 0.0,
-            "max_tokens": 1000
-        }
-    )
-    print(f"Leann: {chat_response}")
-
-async def main():
-    # Parse command line arguments
-    parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
-    parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
-                       help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
-    parser.add_argument('--index-dir', type=str, default="./chrome_history_index_leann_test",
-                       help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
-    parser.add_argument('--max-entries', type=int, default=1000,
-                       help='Maximum number of history entries to process (default: 1000)')
-    parser.add_argument('--query', type=str, default=None,
-                       help='Single query to run (default: runs example queries)')
-    parser.add_argument('--auto-find-profiles', action='store_true', default=True,
-                       help='Automatically find all Chrome profiles (default: True)')
-    
-    args = parser.parse_args()
-    
-    INDEX_DIR = Path(args.index_dir)
-    INDEX_PATH = str(INDEX_DIR / "chrome_history.leann")
-    
-    print(f"Using Chrome profile: {args.chrome_profile}")
-    print(f"Index directory: {INDEX_DIR}")
-    print(f"Max entries: {args.max_entries}")
-    
-    # Find Chrome profile directories
-    from .readers import ChromeHistoryReader
-    
-    if args.auto_find_profiles:
-        profile_dirs = ChromeHistoryReader.find_chrome_profiles()
-        if not profile_dirs:
-            print("No Chrome profiles found automatically. Exiting.")
-            return
-    else:
-        # Use single specified profile
-        profile_path = Path(args.chrome_profile)
-        if not profile_path.exists():
-            print(f"Chrome profile not found: {profile_path}")
-            return
-        profile_dirs = [profile_path]
-    
-    # Create or load the LEANN index from all sources
-    index_path = create_leann_index_from_multiple_chrome_profiles(profile_dirs, INDEX_PATH, args.max_entries)
-    
-    if index_path:
-        if args.query:
-            # Run single query
-            await query_leann_index(index_path, args.query)
-        else:
-            # Example queries
-            queries = [
-                "What websites did I visit about machine learning?",
-                "Find my search history about programming"
-            ]
-            
-            for query in queries:
-                print("\n" + "="*60)
-                await query_leann_index(index_path, query)
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/apps/browser/readers.py
+++ b/apps/browser/readers.py
@@ -1,176 +0,0 @@
-import sqlite3
-import os
-from pathlib import Path
-from typing import List, Any
-from llama_index.core import Document
-from llama_index.core.readers.base import BaseReader
-
-class ChromeHistoryReader(BaseReader):
-    """
-    Chrome browser history reader that extracts browsing data from SQLite database.
-    
-    Reads Chrome history from the default Chrome profile location and creates documents
-    with embedded metadata similar to the email reader structure.
-    """
-    
-    def __init__(self) -> None:
-        """Initialize."""
-        pass
-    
-    def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
-        """
-        Load Chrome history data from the default Chrome profile location.
-        
-        Args:
-            input_dir: Not used for Chrome history (kept for compatibility)
-            **load_kwargs:
-                max_count (int): Maximum amount of history entries to read.
-                chrome_profile_path (str): Custom path to Chrome profile directory.
-        """
-        docs: List[Document] = []
-        max_count = load_kwargs.get('max_count', 1000)
-        chrome_profile_path = load_kwargs.get('chrome_profile_path', None)
-        
-        # Default Chrome profile path on macOS
-        if chrome_profile_path is None:
-            chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
-        
-        history_db_path = os.path.join(chrome_profile_path, "History")
-        
-        if not os.path.exists(history_db_path):
-            print(f"Chrome history database not found at: {history_db_path}")
-            return docs
-        
-        try:
-            # Connect to the Chrome history database
-            print(f"Connecting to database: {history_db_path}")
-            conn = sqlite3.connect(history_db_path)
-            cursor = conn.cursor()
-            
-            # Query to get browsing history with metadata (removed created_time column)
-            query = """
-            SELECT 
-                datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
-                url, 
-                title, 
-                visit_count, 
-                typed_count, 
-                hidden
-            FROM urls 
-            ORDER BY last_visit_time DESC
-            """
-            
-            print(f"Executing query on database: {history_db_path}")
-            cursor.execute(query)
-            rows = cursor.fetchall()
-            print(f"Query returned {len(rows)} rows")
-            
-            count = 0
-            for row in rows:
-                if count >= max_count and max_count > 0:
-                    break
-                
-                last_visit, url, title, visit_count, typed_count, hidden = row
-                
-                # Create document content with metadata embedded in text
-                doc_content = f"""
-[BROWSING HISTORY METADATA]
-URL: {url}
-Title: {title}
-Last Visit: {last_visit}
-Visit Count: {visit_count}
-Typed Count: {typed_count}
-Hidden: {hidden}
-[END METADATA]
-
-Title: {title}
-URL: {url}
-Last visited: {last_visit}
-"""
-                
-                # Create document with embedded metadata
-                doc = Document(text=doc_content, metadata={})
-                docs.append(doc)
-                count += 1
-            
-            conn.close()
-            print(f"Loaded {len(docs)} Chrome history documents")
-            
-        except Exception as e:
-            print(f"Error reading Chrome history: {e}")
-            return docs
-        
-        return docs
-
-    @staticmethod
-    def find_chrome_profiles() -> List[Path]:
-        """
-        Find all Chrome profile directories.
-        
-        Returns:
-            List of Path objects pointing to Chrome profile directories
-        """
-        chrome_base_path = Path(os.path.expanduser("~/Library/Application Support/Google/Chrome"))
-        profile_dirs = []
-        
-        if not chrome_base_path.exists():
-            print(f"Chrome directory not found at: {chrome_base_path}")
-            return profile_dirs
-        
-        # Find all profile directories
-        for profile_dir in chrome_base_path.iterdir():
-            if profile_dir.is_dir() and profile_dir.name != "System Profile":
-                history_path = profile_dir / "History"
-                if history_path.exists():
-                    profile_dirs.append(profile_dir)
-                    print(f"Found Chrome profile: {profile_dir}")
-        
-        print(f"Found {len(profile_dirs)} Chrome profiles")
-        return profile_dirs
-
-    @staticmethod
-    def export_history_to_file(output_file: str = "chrome_history_export.txt", max_count: int = 1000):
-        """
-        Export Chrome history to a text file using the same SQL query format.
-        
-        Args:
-            output_file: Path to the output file
-            max_count: Maximum number of entries to export
-        """
-        chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
-        history_db_path = os.path.join(chrome_profile_path, "History")
-        
-        if not os.path.exists(history_db_path):
-            print(f"Chrome history database not found at: {history_db_path}")
-            return
-        
-        try:
-            conn = sqlite3.connect(history_db_path)
-            cursor = conn.cursor()
-            
-            query = """
-            SELECT 
-                datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
-                url, 
-                title, 
-                visit_count, 
-                typed_count, 
-                hidden
-            FROM urls 
-            ORDER BY last_visit_time DESC
-            LIMIT ?
-            """
-            
-            cursor.execute(query, (max_count,))
-            rows = cursor.fetchall()
-            
-            with open(output_file, 'w', encoding='utf-8') as f:
-                for row in rows:
-                    last_visit, url, title, visit_count, typed_count, hidden = row
-                    f.write(f"{last_visit}\t{url}\t{title}\t{visit_count}\t{typed_count}\t{hidden}\n")
-            
-            conn.close()
-            print(f"Exported {len(rows)} history entries to {output_file}")
-            
-        except Exception as e:
-            print(f"Error exporting Chrome history: {e}") 
--- a/apps/documents/__init__.py
+++ b/apps/documents/__init__.py
--- a/apps/documents/main.py
+++ b/apps/documents/main.py
@@ -1,113 +0,0 @@
-import argparse
-from llama_index.core import SimpleDirectoryReader
-from llama_index.core.node_parser import SentenceSplitter
-import asyncio
-import dotenv
-from leann.api import LeannBuilder, LeannChat
-from pathlib import Path
-import os
-
-dotenv.load_dotenv()
-
-
-async def main(args):
-    INDEX_DIR = Path(args.index_dir)
-    INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
-
-    if not INDEX_DIR.exists():
-        node_parser = SentenceSplitter(
-            chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
-        )
-
-        print("Loading documents...")
-        # Get the data directory relative to this module
-        current_dir = Path(__file__).parent
-        data_dir = current_dir / "data"
-        
-        documents = SimpleDirectoryReader(
-            str(data_dir),
-            recursive=True,
-            encoding="utf-8",
-            required_exts=[".pdf", ".txt", ".md"],
-        ).load_data(show_progress=True)
-        print("Documents loaded.")
-        all_texts = []
-        for doc in documents:
-            nodes = node_parser.get_nodes_from_documents([doc])
-            for node in nodes:
-                all_texts.append(node.get_content())
-
-        print("--- Index directory not found, building new index ---")
-
-        print("\n[PHASE 1] Building Leann index...")
-
-        # Use HNSW backend for better macOS compatibility
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model="facebook/contriever",
-            graph_degree=32,
-            complexity=64,
-            is_compact=True,
-            is_recompute=True,
-            num_threads=1,  # Force single-threaded mode
-        )
-
-        print(f"Loaded {len(all_texts)} text chunks from documents.")
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
-
-        builder.build_index(INDEX_PATH)
-        print(f"\nLeann index built at {INDEX_PATH}!")
-    else:
-        print(f"--- Using existing index at {INDEX_DIR} ---")
-
-    print(f"\n[PHASE 2] Starting Leann chat session...")
-
-    # llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
-    llm_config = {"type": "ollama", "model": "qwen3:8b"}
-
-    chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
-
-    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
-
-    # query = (
-    #     "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面，任务令一般在什么城市颁发"
-    # )
-
-    print(f"You: {query}")
-    chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32)
-    print(f"Leann: {chat_response}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Run Leann Chat with various LLM backends."
-    )
-    parser.add_argument(
-        "--llm",
-        type=str,
-        default="hf",
-        choices=["simulated", "ollama", "hf", "openai"],
-        help="The LLM backend to use.",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="Qwen/Qwen3-0.6B",
-        help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
-    )
-    parser.add_argument(
-        "--host",
-        type=str,
-        default="http://localhost:11434",
-        help="The host for the Ollama API.",
-    )
-    parser.add_argument(
-        "--index-dir",
-        type=str,
-        default="./test_doc_files",
-        help="Directory where the Leann index will be stored.",
-    )
-    args = parser.parse_args()
-
-    asyncio.run(main(args))
--- a/apps/documents/data/pangu.md
+++ b/apps/documents/data/pangu.md
@@ -1,82 +0,0 @@
-# 盘古之殇：华为诺亚盘古大模型研发历程的心酸与黑暗
-
-各位好，
-
-我是一名盘古大模型团队，华为诺亚方舟实验室的员工。
-
-首先为自证身份，列举一些细节：
-
-1. 现诺亚主任，前算法应用部部长，后改名为小模型实验室的主任王云鹤。前诺亚主任：姚骏（大家称姚老师）。几个实验室主任：唐睿明（明哥，明队，已离职），尚利峰，张维（维哥），郝建业（郝老师），刘武龙（称呼为武龙所）等。其他骨干成员和专家陆续有很多人离职。
-2. 我们隶属于“四野”这个组织。四野下属有许多纵队，基础语言大模型是四纵。王云鹤的小模型是十六纵队。我们参加过苏州的集结，有各种月份的时间节点。在苏州攻关会颁发任务令，需要在节点前达成目标。苏州集结会把各地的人员都集中在苏州研究所，平常住宾馆，比如在甪直的酒店，与家人孩子天各一方。
-3. 在苏州集结的时候周六默认上班，非常辛苦，不过周六有下午茶，有一次还有小龙虾。在苏州研究所的工位搬迁过一次，从一栋楼换到了另一栋。苏州研究所楼栋都是欧式装修，门口有大坡，里面景色很不错。去苏州集结一般至少要去一周，甚至更久，多的人甚至一两个月都回不了家。
-4. 诺亚曾经传说是研究型的，但是来了之后因为在四野做大模型项目，项目成员完全变成了交付型的，且充满了例会，评审，汇报。很多时候做实验都要申请。团队需要对接终端小艺，华为云，ICT等诸多业务线，交付压力不小。
-5. 诺亚研发的盘古模型早期内部代号叫做“盘古智子”，一开始只有内部需要申请试用的网页版，到后续迫于压力在welink上接入和公测开放。
-
-这些天发生关于质疑盘古大模型抄袭千问的事情闹的沸沸扬扬。作为一个盘古团队的成员，我最近夜夜辗转反侧，难以入眠。盘古的品牌受到如此大的影响，一方面，我自私的为我的职业发展担忧，也为自己过去的努力工作感到不值。另一方面，由于有人开始揭露这些事情我内心又感到大快人心。在多少个日日夜夜，我们对内部某些人一次次靠着造假而又获得了无数利益的行为咬牙切齿而又无能为力。这种压抑和羞辱也逐渐消磨了我对华为的感情，让我在这里的时日逐渐浑浑噩噩，迷茫无措，时常怀疑自己的人生和自我价值。
-
-我承认我是一个懦弱的人，作为一个小小的打工人，我不仅不敢和王云鹤等内部手眼通天的人做对，更不敢和华为这样的庞然大物做对。我很怕失去我的工作，毕竟我也有家人和孩子，所以我打心眼里很佩服揭露者。但是，看到内部还在试图洗地掩盖事实，蒙蔽公众的时候，我实在不能容忍了。我也希望勇敢一次，顺从自己本心。就算自损八百，我也希望能伤敌一千。我决定把我在这里的所见所闻（部分来自于同事口述）公布出来，关于盘古大模型的“传奇故事”：
-
-华为确实主要在昇腾卡上训练大模型（小模型实验室有不少英伟达的卡，他们之前也会用来训练，后面转移到昇腾）。曾经我被华为“打造世界第二选择”的决心而折服，我本身也曾经对华为有深厚的感情。我们陪着昇腾一步步摸爬滚打，从充满bug到现在能训出模型，付出了巨大的心血和代价。
-
-最初我们的算力非常有限，在910A上训练模型。那会只支持fp16，训练的稳定性远不如bf16。盘古的moe开始很早，23年就主要是训练38Bmoe模型和后续的71B dense模型。71B的dense模型通过扩增变成了第一代的135Bdense模型，后面主力模型也逐渐在910B上训练。
-
-71B和135B模型都有一个巨大的硬伤就是tokenizer。当时使用的tokenizer编码效率极低，每个单个的符号，数字，空格，乃至汉字都会占用一个token。可想而知这会非常浪费算力，且使得模型的效果很差。这时候小模型实验室正好有个自己训的词表。姚老师当时怀疑是不是模型的tokenizer不好（虽然事后来看，他的怀疑是无疑正确的），于是就决定，让71B和135B换tokenizer，因为小模型实验室曾经尝试过。团队缝合了两个tokenizer，开始了tokenizer的更换。71B模型的更换失败了，而135B因为采用了更精细的embedding初始化策略，续训了至少1T的数据后词表总算更换成功，但可想而知，效果并不会变好。
-
-于此同期，阿里和智谱等国内其他公司在GPU上训练，且已经摸索出了正确的方法，盘古和竞品的差距越来越大。内部一个230B从头训练的dense模型又因为各种原因训练失败，导致项目的状况几乎陷入绝境。面临几个节点的压力以及内部对盘古的强烈质疑时，团队的士气低迷到了极点。团队在算力极其有限的时候，做出了很多努力和挣扎。比如，团队偶然发现当时的38B moe并没有预期moe的效果。于是去掉了moe参数，还原为了13B的dense模型。由于38B的moe源自很早的pangu alpha 13B，架构相对落后，团队进行了一系列的操作，比如切换绝对位置编码到rope，去掉bias，切换为rmsnorm。同时鉴于tokenizer的一些失败和换词表的经验，这个模型的词表也更换为了王云鹤的小模型实验室7B模型所使用的词表。后面这个13B模型进行了扩增续训，变成了第二代38B dense模型（在几个月内这个模型都是主要的盘古中档位模型），曾经具有一定的竞争力。但是，由于更大的135B模型架构落后，且更换词表模型损伤巨大（后续分析发现当时更换的缝合词表有更严重的bug），续训后也与千问等当时国内领先模型存在很大差距。这时由于内部的质疑声和领导的压力也越来越大。团队的状态几乎陷入了绝境。
-
-在这种情况下，王云鹤和他的小模型实验室出手了。他们声称是从旧的135B参数继承改造而来，通过训练短短的几百B数据，各项指标平均提升了十个点左右。实际上，这就是他们套壳应用到大模型的第一次杰作。华为的外行领导内行，使得领导完全对于这种扯淡的事情没有概念，他们只会觉得肯定是有什么算法创新。经过内部的分析，他们实际上是使用Qwen 1.5 110B续训而来，通过加层，扩增ffn维度，添加盘古pi论文的一些机制得来，凑够了大概135B的参数。实际上，旧的135B有107层，而这个模型只有82层，各种配置也都不一样。新的来路不明的135B训练完很多参数的分布也和Qwen 110B几乎一模一样。连模型代码的类名当时都是Qwen，甚至懒得改名。后续这个模型就是所谓的135B V2。而这个模型当时也提供给了很多下游，甚至包括外部客户。
-
-这件事对于我们这些认真诚实做事的同事们带来了巨大的冲击，内部很多人其实都知道这件事，甚至包括终端和华为云。我们都戏称以后别叫盘古模型了，叫千古吧。当时团队成员就想向bcg举报了，毕竟这已经是重大的业务造假了。但是后面据说被领导拦了下来，因为更高级别的领导（比如姚老师，以及可能熊总和查老）其实后面也知道了，但是并不管，因为通过套壳拿出好的结果，对他们也是有利的。这件事使得当时团队几位最强的同事开始心灰意冷，离职跑路也逐渐成为挂在嘴边的事。
-
-此时，盘古似乎迎来了转机。由于前面所述的这些盘古模型基本都是续训和改造而来，当时诺亚完全没有掌握从头训练的技术，何况还是在昇腾的NPU上进行训练。在当时团队的核心成员的极力争取下，盘古开始了第三代模型的训练，付出了巨大的努力后，在数据架构和训练算法方面都与业界逐渐接轨，而这其中的艰辛和小模型实验室的人一点关系都没有。
-
-一开始团队成员毫无信心，只从一个13B的模型开始训练，但是后面发现效果还不错，于是这个模型后续再次进行了一次参数扩增，变成了第三代的38B，代号38B V3。想必很多产品线的兄弟都对这个模型很熟悉。当时这个模型的tokenizer是基于llama的词表进行扩展的（也是业界常见的做法）。而当时王云鹤的实验室做出来了另一个词表（也就是后续pangu系列的词表）。当时两个词表还被迫进行了一次赛马，最终没有明显的好坏结论。于是，领导当即决定，应该统一词表，使用王云鹤他们的。于是，在后续从头训练的135B V3（也就是对外的Pangu Ultra），便是采用了这个tokenizer。这也解释了很多使用我们模型的兄弟的疑惑，为什么当时同为V3代的两个不同档位的模型，会使用不同的tokenizer。
-
-
-我们打心眼里觉得，135B V3是我们四纵团队当时的骄傲。这是第一个真正意义上的，华为全栈自研，正经从头训练的千亿级别的模型，且效果与24年同期竞品可比的。写到这里我已经热泪盈眶，太不容易了。当时为了稳定训练，团队做了大量实验对比，并且多次在模型梯度出现异常的时候进行及时回退重启。这个模型真正做到了后面技术报告所说的训练全程没有一个loss spike。我们克服了不知道多少困难，我们做到了，我们愿用生命和荣誉保证这个模型训练的真实性。多少个凌晨，我们为了它的训练而不眠。在被内部心声骂的一文不值的时候，我们有多么不甘，有多少的委屈，我们挺住了。
-
-我们这帮人是真的在为打磨国产算力底座燃烧自己的青春啊……客居他乡，我们放弃了家庭，放弃了假期，放弃了健康，放弃了娱乐，抛头颅洒热血，其中的艰辛与困苦，寥寥数笔不足以概括其万一。在各种动员大会上，当时口号中喊出的盘古必胜，华为必胜，我们心里是真的深深被感动。
-
-然而，我们的所有辛苦的成果，经常被小模型实验室轻飘飘的拿走了。数据，直接要走。代码，直接要走，还要求我们配合适配到能一键运行。我们当时戏称小模型实验室为点鼠标实验室。我们付出辛苦，他们取得荣耀。果然应了那句话，你在负重前行是因为有人替你岁月静好。在这种情况下，越来越多的战友再也坚持不下去了，选择了离开。看到身边那些优秀的同事一个个离职，我的内心又感叹又难过。在这种作战一样的环境下，我们比起同事来说更像是战友。他们在技术上也有无数值得我学习的地方，堪称良师。看到他们去了诸如字节Seed，Deepseek，月之暗面，腾讯和快手等等很多出色的团队，我打心眼里为他们高兴和祝福，脱离了这个辛苦却肮脏的地方。我至今还对一位离职同事的话记忆犹新，ta说：“来这里是我技术生涯中的耻辱，在这里再呆每一天都是浪费生命”。话虽难听却让我无言以对。我担心我自己技术方面的积累不足，以及没法适应互联网公司高淘汰的环境，让我多次想离职的心始终没有迈出这一步。
-
-盘古除了dense模型，后续也启动了moe的探索。一开始训练的是一个224B的moe模型。而与之平行的，小模型实验室也开启了第二次主要的套壳行动（次要的插曲可能还包括一些别的模型，比如math模型），即这次流传甚广的pangu pro moe 72B。这个模型内部自称是从小模型实验室的7B扩增上来的（就算如此，这也与技术报告不符，何况是套壳qwen 2.5的14b续训）。还记得他们训了没几天，内部的评测就立刻追上了当时的38B V3。AI系统实验室很多兄弟因为需要适配模型，都知道他们的套壳行动，只是迫于各种原因，无法伸张正义。实际上，对于后续训了很久很久的这个模型，Honestagi能够分析出这个量级的相似性我已经很诧异了，因为这个模型为了续训洗参数，所付出的算力甚至早就足够从头训一个同档位的模型了。听同事说他们为了洗掉千问的水印，采取了不少办法，甚至包括故意训了脏数据。这也为学术界研究模型血缘提供了一个前所未有的特殊模范吧。以后新的血缘方法提出可以拿出来溜溜。
-
-24年底和25年初，在Deepseek v3和r1发布之后，由于其惊艳的技术水平，团队受到了巨大的冲击，也受到了更大的质疑。于是为了紧跟潮流，盘古模仿Deepseek的模型尺寸，开启了718B moe的训练。这个时候，小模型实验室再次出手了。他们选择了套壳Deepseekv3续训。他们通过冻住Deepseek加载的参数，进行训练。连任务加载ckpt的目录都是deepseekv3，改都不改，何其嚣张？与之相反，一些有真正技术信仰的同事，在从头训练另一个718B的moe。但其中出现了各种各样的问题。但是很显然，这个模型怎么可能比直接套壳的好呢？如果不是团队leader坚持，早就被叫停了。
-
-华为的流程管理之繁重，严重拖累了大模型的研发节奏，例如版本管理，模型血缘，各种流程化，各种可追溯。讽刺的是，小模型实验室的模型似乎从来不受这些流程的约束，想套壳就套壳，想续训就续训，算力源源不断的伸手拿走。这种强烈到近乎魔幻的对比，说明了当前流程管理的情况：只许州官放火，不许百姓点灯。何其可笑？何其可悲？何其可恶？何其可耻！
-
-HonestAGI的事情出来后，内部让大家不停的研讨分析，如何公关和“回应”。诚然，这个原文的分析也许不够有力，给了王云鹤与小模型实验室他们狡辩和颠倒黑白的机会。为此，这两天我内心感到作呕，时时怀疑自己的人生意义以及苍天无眼。我不奉陪了，我要离职了，同时我也在申请从盘古部分技术报告的作者名单中移除。曾经在这些技术报告上署名是我一生都无法抹除的污点。当时我没想到，他们竟然猖狂到敢开源。我没想到，他们敢如此愚弄世人，大肆宣发。当时，我也许是存了侥幸心理，没有拒绝署名。我相信很多扎实做事的战友，也只是被迫上了贼船，或者不知情。但这件事已经无法挽回，我希望我的余生能够坚持扎实做真正有意义的事，为我当时的软弱和不坚定赎罪。
-
-深夜写到这里，我已经泪流满面，泣不成声。还记得一些出色的同事离职时，我苦笑问他们要不要发个长长的心声惯例帖，揭露一下现状。对方说：不了，浪费时间，而且我也怕揭露出来你们过的更糟。我当时一下黯然神伤，因为曾经共同为了理想奋斗过的战友已经彻底对华为彻底灰心了。当时大家调侃，我们用着当年共产党的小米加步枪，组织却有着堪比当年国民党的作风。
-
-曾几何时，我为我们用着小米加步枪打败洋枪洋炮而自豪。
-
-现在，我累了，我想投降。
-
-其实时至今日，我还是真心希望华为能认真吸取教训，能做好盘古，把盘古做到世界一流，把昇腾变成英伟达的水平。内部的劣币驱逐良币，使得诺亚乃至华为在短时间内急剧流失了大量出色的大模型人才。相信他们也正在如Deepseek等各个团队闪耀着，施展着他们的抱负才华，为中美在AI的激烈竞赛中奉献力量。我时常感叹，华为不是没有人才，而是根本不知道怎么留住人才。如果给这些人合适的环境，合适的资源，更少的枷锁，更少的政治斗争，盘古何愁不成？
-
-最后：我以生命，人格和荣誉发誓，我写的以上所有内容均为真实（至少在我有限的认知范围内）。我没有那么高的技术水平以及机会去做详尽扎实的分析，也不敢直接用内部记录举证，怕因为信息安全抓到。但是我相信我很多曾经的战友，会为我作证。在华为内部的兄弟，包括我们曾经服务过的产品线兄弟们，相信本文的无数细节能和你们的印象对照，印证我的说法。你们可能也曾经被蒙骗，但这些残酷的真相不会被尘封。我们奋战过的痕迹，也不应该被扭曲和埋葬。
-
-写了这么多，某些人肯定想把我找出来，抹杀掉。公司搞不好也想让我噤声乃至追责。如果真的这样，我，乃至我的家人的人身乃至生命安全可能都会受到威胁。为了自我保护，我近期每天会跟大家报平安。
-
-如果我消失了，就当是我为了真理和理想，为了华为乃至中国能够更好地发展算力和AI而牺牲了吧，我愿埋葬于那片曾经奋斗过的地方。
-
-诺亚，再见
-
-2025年7月6日凌晨      写于深圳
-
---
-
-各位好，
-
-感谢大家的关心与祝福。我目前暂时安全，但公司应该在进行排查与某些名单收集，后续情况未知。
-
-我补充一些细节，以免某些人继续颠倒黑白。
-
-关于135B V2，小模型实验室在迅速地完成套壳并拿完所有套壳带来的好处后（比如任务令表彰和及时激励），因为不想继续支撑下游应用和模型迭代，又把这个烫手山芋甩给了四纵。确实技高一筹，直接把四纵的兄弟们拉下水。同事提供过去一个老旧的模型，最终拿回了一个当时一个魔改的先进的千问。做大模型的人，自己做的模型就像自己孩子一样熟悉，不要把别人都当傻子。就像自家儿子出门一趟，回来个别人家孩子。
-
-盘古report的署名是不符合学术规范的。例如，135B V3有不少有技术贡献的人，因为作者名额数量限制，劳动成果没有得到应有的回报，团队内曾经有不小的意见。这个模型当时是大家智慧和汗水的结晶，甚至是团队当时的精神支柱，支撑着不少兄弟们继续留在诺亚。所谓的名额限制，以及挂名了一些毫无技术贡献的人（如一些小模型实验室的人），让兄弟们何其心寒。
-
---
-
-暂时平安。另外，支持我勇于说出真相的战友们 https://github.com/HW-whistleblower/True-Story-of-Pangu/issues/317
--- a/apps/email/init.py
+++ b/apps/email/init.py
--- a/apps/email/main.py
+++ b/apps/email/main.py
@@ -1,193 +0,0 @@
-import os
-import sys
-import asyncio
-import dotenv
-import argparse
-from pathlib import Path
-from typing import List, Any
-
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
-from llama_index.core.node_parser import SentenceSplitter
-
-dotenv.load_dotenv()
-
-# Auto-detect user's mail path
-def get_mail_path():
-    """Get the mail path for the current user"""
-    home_dir = os.path.expanduser("~")
-    return os.path.join(home_dir, "Library", "Mail")
-
-def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False, embedding_model: str = "facebook/contriever"):
-    """
-    Create LEANN index from multiple mail data sources.
-    
-    Args:
-        messages_dirs: List of Path objects pointing to Messages directories
-        index_path: Path to save the LEANN index
-        max_count: Maximum number of emails to process per directory
-        include_html: Whether to include HTML content in email processing
-    """
-    print("Creating LEANN index from multiple mail data sources...")
-    
-    # Load documents using EmlxReader from local readers module
-    from .readers import EmlxReader, find_all_messages_directories
-    reader = EmlxReader(include_html=include_html)
-    INDEX_DIR = Path(index_path).parent
-    
-    if not INDEX_DIR.exists():
-        print(f"--- Index directory not found, building new index ---")
-        all_documents = []
-        total_processed = 0
-        
-        # Process each Messages directory
-        for i, messages_dir in enumerate(messages_dirs):
-            print(f"\nProcessing Messages directory {i+1}/{len(messages_dirs)}: {messages_dir}")
-            
-            try:
-                documents = reader.load_data(messages_dir)
-                if documents:
-                    print(f"Loaded {len(documents)} email documents from {messages_dir}")
-                    all_documents.extend(documents)
-                    total_processed += len(documents)
-                    
-                    # Check if we've reached the max count
-                    if max_count > 0 and total_processed >= max_count:
-                        print(f"Reached max count of {max_count} documents")
-                        break
-                else:
-                    print(f"No documents loaded from {messages_dir}")
-            except Exception as e:
-                print(f"Error processing {messages_dir}: {e}")
-                continue
-        
-        if not all_documents:
-            print("No documents loaded from any source. Exiting.")
-            return None
-        
-        print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories")
-        
-        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
-        
-        # Convert Documents to text strings and chunk them
-        all_texts = []
-        for doc in all_documents:
-            # Split the document into chunks
-            nodes = text_splitter.get_nodes_from_documents([doc])
-            for node in nodes:
-                all_texts.append(node.get_content())
-        
-        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
-        
-        # Create LEANN index directory
-        print(f"--- Index directory not found, building new index ---")
-        INDEX_DIR.mkdir(exist_ok=True)
-
-        print(f"--- Building new LEANN index ---")
-        
-        print(f"\n[PHASE 1] Building Leann index...")
-
-        # Use HNSW backend for better macOS compatibility
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model=embedding_model,
-            graph_degree=32, 
-            complexity=64,
-            is_compact=True,
-            is_recompute=True,
-            num_threads=1  # Force single-threaded mode
-        )
-
-        print(f"Adding {len(all_texts)} email chunks to index...")
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
-            
-        builder.build_index(index_path)
-        print(f"\nLEANN index built at {index_path}!")
-    else:
-        print(f"--- Using existing index at {INDEX_DIR} ---")
-    
-    return index_path
-
-async def query_leann_index(index_path: str, query: str):
-    """
-    Query the LEANN index.
-    
-    Args:
-        index_path: Path to the LEANN index
-        query: The query string
-    """
-    print(f"\n[PHASE 2] Starting Leann chat session...")
-    chat = LeannChat(index_path=index_path,
-                     llm_config={"type": "openai", "model": "gpt-4o"})
-    
-    print(f"You: {query}")
-    import time
-    start_time = time.time()
-    chat_response = chat.ask(
-        query, 
-        top_k=10, 
-        recompute_beighbor_embeddings=True,
-        complexity=12,
-        beam_width=1,
-        
-    )
-    end_time = time.time()
-    print(f"Time taken: {end_time - start_time} seconds")
-    print(f"Leann: {chat_response}")
-
-async def main():
-    # Parse command line arguments
-    parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
-    parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
-                       help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
-    parser.add_argument('--max-emails', type=int, default=1000,
-                       help='Maximum number of emails to process (-1 means all)')
-    parser.add_argument('--query', type=str, default="Give me some funny advertisement about apple or other companies",
-                       help='Single query to run (default: runs example queries)')
-    parser.add_argument('--include-html', action='store_true', default=False,
-                       help='Include HTML content in email processing (default: False)')
-    parser.add_argument('--embedding-model', type=str, default="facebook/contriever",
-                       help='Embedding model to use (default: facebook/contriever)')
-    
-    args = parser.parse_args()
-
-    print(f"args: {args}")
-    
-    # Automatically find all Messages directories under the current user's Mail directory
-    from .readers import find_all_messages_directories
-    mail_path = get_mail_path()
-    print(f"Searching for email data in: {mail_path}")
-    messages_dirs = find_all_messages_directories(mail_path)
-    
-    print('len(messages_dirs): ', len(messages_dirs))
-    
-    if not messages_dirs:
-        print("No Messages directories found. Exiting.")
-        return
-    
-    INDEX_DIR = Path(args.index_dir)
-    INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
-    print(f"Index directory: {INDEX_DIR}")
-    print(f"Found {len(messages_dirs)} Messages directories.")
-    
-    # Create or load the LEANN index from all sources
-    index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model)
-    
-    if index_path:
-        if args.query:
-            # Run single query
-            await query_leann_index(index_path, args.query)
-        else:
-            # Example queries
-            queries = [
-                "Hows Berkeley Graduate Student Instructor",
-                "how's the icloud related advertisement saying",
-                "Whats the number of class recommend to take per semester for incoming EECS students"
-            ]
-            for query in queries:
-                print("\n" + "="*60)
-                await query_leann_index(index_path, query)
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/apps/email/email.py
+++ b/apps/email/email.py
@@ -1,192 +0,0 @@
-"""
-Mbox parser.
-
-Contains simple parser for mbox files.
-
-"""
-
-import logging
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-from fsspec import AbstractFileSystem
-
-from llama_index.core.readers.base import BaseReader
-from llama_index.core.schema import Document
-
-logger = logging.getLogger(__name__)
-
-
-class MboxReader(BaseReader):
-    """
-    Mbox parser.
-
-    Extract messages from mailbox files.
-    Returns string including date, subject, sender, receiver and
-    content for each message.
-
-    """
-
-    DEFAULT_MESSAGE_FORMAT: str = (
-        "Date: {_date}\n"
-        "From: {_from}\n"
-        "To: {_to}\n"
-        "Subject: {_subject}\n"
-        "Content: {_content}"
-    )
-
-    def __init__(
-        self,
-        *args: Any,
-        max_count: int = 0,
-        message_format: str = DEFAULT_MESSAGE_FORMAT,
-        **kwargs: Any,
-    ) -> None:
-        """Init params."""
-        try:
-            from bs4 import BeautifulSoup  # noqa
-        except ImportError:
-            raise ImportError(
-                "`beautifulsoup4` package not found: `pip install beautifulsoup4`"
-            )
-
-        super().__init__(*args, **kwargs)
-        self.max_count = max_count
-        self.message_format = message_format
-
-    def load_data(
-        self,
-        file: Path,
-        extra_info: Optional[Dict] = None,
-        fs: Optional[AbstractFileSystem] = None,
-    ) -> List[Document]:
-        """Parse file into string."""
-        # Import required libraries
-        import mailbox
-        from email.parser import BytesParser
-        from email.policy import default
-
-        from bs4 import BeautifulSoup
-
-        if fs:
-            logger.warning(
-                "fs was specified but MboxReader doesn't support loading "
-                "from fsspec filesystems. Will load from local filesystem instead."
-            )
-
-        i = 0
-        results: List[str] = []
-        # Load file using mailbox
-        bytes_parser = BytesParser(policy=default).parse
-        mbox = mailbox.mbox(file, factory=bytes_parser)  # type: ignore
-
-        # Iterate through all messages
-        for _, _msg in enumerate(mbox):
-            try:
-                msg: mailbox.mboxMessage = _msg
-                # Parse multipart messages
-                if msg.is_multipart():
-                    for part in msg.walk():
-                        ctype = part.get_content_type()
-                        cdispo = str(part.get("Content-Disposition"))
-                        if "attachment" in cdispo:
-                            print(f"Attachment found: {part.get_filename()}")
-                        if ctype == "text/plain" and "attachment" not in cdispo:
-                            content = part.get_payload(decode=True)  # decode
-                            break
-                # Get plain message payload for non-multipart messages
-                else:
-                    content = msg.get_payload(decode=True)
-
-                # Parse message HTML content and remove unneeded whitespace
-                soup = BeautifulSoup(content)
-                stripped_content = " ".join(soup.get_text().split())
-                # Format message to include date, sender, receiver and subject
-                msg_string = self.message_format.format(
-                    _date=msg["date"],
-                    _from=msg["from"],
-                    _to=msg["to"],
-                    _subject=msg["subject"],
-                    _content=stripped_content,
-                )
-                # Add message string to results
-                results.append(msg_string)
-            except Exception as e:
-                logger.warning(f"Failed to parse message:\n{_msg}\n with exception {e}")
-
-            # Increment counter and return if max count is met
-            i += 1
-            if self.max_count > 0 and i >= self.max_count:
-                break
-
-        return [Document(text=result, metadata=extra_info or {}) for result in results]
-
-
-class EmlxMboxReader(MboxReader):
-    """
-    EmlxMboxReader - Modified MboxReader that handles directories of .emlx files.
-    
-    Extends MboxReader to work with Apple Mail's .emlx format by:
-    1. Reading .emlx files from a directory
-    2. Converting them to mbox format in memory
-    3. Using the parent MboxReader's parsing logic
-    """
-
-    def load_data(
-        self,
-        directory: Path,
-        extra_info: Optional[Dict] = None,
-        fs: Optional[AbstractFileSystem] = None,
-    ) -> List[Document]:
-        """Parse .emlx files from directory into strings using MboxReader logic."""
-        import tempfile
-        import os
-        
-        if fs:
-            logger.warning(
-                "fs was specified but EmlxMboxReader doesn't support loading "
-                "from fsspec filesystems. Will load from local filesystem instead."
-            )
-
-        # Find all .emlx files in the directory
-        emlx_files = list(directory.glob("*.emlx"))
-        logger.info(f"Found {len(emlx_files)} .emlx files in {directory}")
-        
-        if not emlx_files:
-            logger.warning(f"No .emlx files found in {directory}")
-            return []
-
-        # Create a temporary mbox file
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.mbox', delete=False) as temp_mbox:
-            temp_mbox_path = temp_mbox.name
-            
-            # Convert .emlx files to mbox format
-            for emlx_file in emlx_files:
-                try:
-                    # Read the .emlx file
-                    with open(emlx_file, 'r', encoding='utf-8', errors='ignore') as f:
-                        content = f.read()
-                    
-                    # .emlx format: first line is length, rest is email content
-                    lines = content.split('\n', 1)
-                    if len(lines) >= 2:
-                        email_content = lines[1]  # Skip the length line
-                        
-                        # Write to mbox format (each message starts with "From " and ends with blank line)
-                        temp_mbox.write(f"From {emlx_file.name} {email_content}\n\n")
-                    
-                except Exception as e:
-                    logger.warning(f"Failed to process {emlx_file}: {e}")
-                    continue
-            
-            # Close the temporary file so MboxReader can read it
-            temp_mbox.close()
-            
-            try:
-                # Use the parent MboxReader's logic to parse the mbox file
-                return super().load_data(Path(temp_mbox_path), extra_info, fs)
-            finally:
-                # Clean up temporary file
-                try:
-                    os.unlink(temp_mbox_path)
-                except:
-                    pass
--- a/apps/email/readers.py
+++ b/apps/email/readers.py
@@ -1,124 +0,0 @@
-import os
-import email
-from pathlib import Path
-from typing import List, Any
-from llama_index.core import Document
-from llama_index.core.readers.base import BaseReader
-
-def find_all_messages_directories(root: str = None) -> List[Path]:
-    """
-    Recursively find all 'Messages' directories under the given root.
-    Returns a list of Path objects.
-    """
-    if root is None:
-        # Auto-detect user's mail path
-        home_dir = os.path.expanduser("~")
-        root = os.path.join(home_dir, "Library", "Mail")
-    
-    messages_dirs = []
-    for dirpath, dirnames, filenames in os.walk(root):
-        if os.path.basename(dirpath) == "Messages":
-            messages_dirs.append(Path(dirpath))
-    return messages_dirs
-
-class EmlxReader(BaseReader):
-    """
-    Apple Mail .emlx file reader with embedded metadata.
-    
-    Reads individual .emlx files from Apple Mail's storage format.
-    """
-    
-    def __init__(self, include_html: bool = False) -> None:
-        """
-        Initialize.
-        
-        Args:
-            include_html: Whether to include HTML content in the email body (default: False)
-        """
-        self.include_html = include_html
-    
-    def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
-        """
-        Load data from the input directory containing .emlx files.
-        
-        Args:
-            input_dir: Directory containing .emlx files
-            **load_kwargs:
-                max_count (int): Maximum amount of messages to read.
-        """
-        docs: List[Document] = []
-        max_count = load_kwargs.get('max_count', 1000)
-        count = 0
-        
-        # Walk through the directory recursively
-        for dirpath, dirnames, filenames in os.walk(input_dir):
-            # Skip hidden directories
-            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
-            
-            for filename in filenames:
-                if count >= max_count:
-                    break
-                    
-                if filename.endswith(".emlx"):
-                    filepath = os.path.join(dirpath, filename)
-                    try:
-                        # Read the .emlx file
-                        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
-                            content = f.read()
-                        
-                        # .emlx files have a length prefix followed by the email content
-                        # The first line contains the length, followed by the email
-                        lines = content.split('\n', 1)
-                        if len(lines) >= 2:
-                            email_content = lines[1]
-                            
-                            # Parse the email using Python's email module
-                            try:
-                                msg = email.message_from_string(email_content)
-                                
-                                # Extract email metadata
-                                subject = msg.get('Subject', 'No Subject')
-                                from_addr = msg.get('From', 'Unknown')
-                                to_addr = msg.get('To', 'Unknown')
-                                date = msg.get('Date', 'Unknown')
-                                
-                                # Extract email body
-                                body = ""
-                                if msg.is_multipart():
-                                    for part in msg.walk():
-                                        if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
-                                            if part.get_content_type() == "text/html" and not self.include_html:
-                                                continue
-                                            body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
-                                            # break
-                                else:
-                                    body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
-                                
-                                # Create document content with metadata embedded in text
-                                doc_content = f"""
-[EMAIL METADATA]
-File: {filename}
-From: {from_addr}
-To: {to_addr}
-Subject: {subject}
-Date: {date}
-[END METADATA]
-
-{body}
-"""
-                                
-                                # No separate metadata - everything is in the text
-                                doc = Document(text=doc_content, metadata={})
-                                docs.append(doc)
-                                count += 1
-                                
-                            except Exception as e:
-                                print(f"Error parsing email from {filepath}: {e}")
-                                continue
-                                
-                    except Exception as e:
-                        print(f"Error reading file {filepath}: {e}")
-                        continue
-        
-        print(f"Loaded {len(docs)} email documents")
-        return docs 
--- a/apps/evaluation/init.py
+++ b/apps/evaluation/init.py
--- a/apps/evaluation/main.py
+++ b/apps/evaluation/main.py
@@ -1,382 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script runs a recall evaluation on a given LEANN index.
-It correctly compares results by fetching the text content for both the new search
-results and the golden standard results, making the comparison robust to ID changes.
-"""
-
-import json
-import argparse
-import time
-from pathlib import Path
-import sys
-import numpy as np
-from typing import List
-
-from leann.api import LeannSearcher, LeannBuilder
-
-
-def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
-    """Checks if the data directory exists, and if not, downloads it from HF Hub."""
-    if not data_root.exists():
-        print(f"Data directory '{data_root}' not found.")
-        print(
-            "Downloading evaluation data from Hugging Face Hub... (this may take a moment)"
-        )
-        try:
-            from huggingface_hub import snapshot_download
-
-            if download_embeddings:
-                # Download everything including embeddings (large files)
-                snapshot_download(
-                    repo_id="LEANN-RAG/leann-rag-evaluation-data",
-                    repo_type="dataset",
-                    local_dir=data_root,
-                    local_dir_use_symlinks=False,
-                )
-                print("Data download complete (including embeddings)!")
-            else:
-                # Download only specific folders, excluding embeddings
-                allow_patterns = [
-                    "ground_truth/**",
-                    "indices/**",
-                    "queries/**",
-                    "*.md",
-                    "*.txt",
-                ]
-                snapshot_download(
-                    repo_id="LEANN-RAG/leann-rag-evaluation-data",
-                    repo_type="dataset",
-                    local_dir=data_root,
-                    local_dir_use_symlinks=False,
-                    allow_patterns=allow_patterns,
-                )
-                print("Data download complete (excluding embeddings)!")
-        except ImportError:
-            print(
-                "Error: huggingface_hub is not installed. Please install it to download the data:"
-            )
-            print("uv pip install -e '.[dev]'")
-            sys.exit(1)
-        except Exception as e:
-            print(f"An error occurred during data download: {e}")
-            sys.exit(1)
-
-
-def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):
-    """Download embeddings files specifically."""
-    embeddings_dir = data_root / "embeddings"
-
-    if dataset_type:
-        # Check if specific dataset embeddings exist
-        target_file = embeddings_dir / dataset_type / "passages_00.pkl"
-        if target_file.exists():
-            print(f"Embeddings for {dataset_type} already exist")
-            return str(target_file)
-
-    print("Downloading embeddings from HuggingFace Hub...")
-    try:
-        from huggingface_hub import snapshot_download
-
-        # Download only embeddings folder
-        snapshot_download(
-            repo_id="LEANN-RAG/leann-rag-evaluation-data",
-            repo_type="dataset",
-            local_dir=data_root,
-            local_dir_use_symlinks=False,
-            allow_patterns=["embeddings/**/*.pkl"],
-        )
-        print("Embeddings download complete!")
-
-        if dataset_type:
-            target_file = embeddings_dir / dataset_type / "passages_00.pkl"
-            if target_file.exists():
-                return str(target_file)
-
-        return str(embeddings_dir)
-
-    except Exception as e:
-        print(f"Error downloading embeddings: {e}")
-        sys.exit(1)
-
-
-# --- Helper Function to get Golden Passages ---
-def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
-    """
-    Retrieves the text for golden passage IDs directly from the LeannSearcher's
-    passage manager.
-    """
-    golden_texts = set()
-    for gid in golden_ids:
-        try:
-            # PassageManager uses string IDs
-            passage_data = searcher.passage_manager.get_passage(str(gid))
-            golden_texts.add(passage_data["text"])
-        except KeyError:
-            print(
-                f"Warning: Golden passage ID '{gid}' not found in the index's passage data."
-            )
-    return golden_texts
-
-
-def load_queries(file_path: Path) -> List[str]:
-    queries = []
-    with open(file_path, "r", encoding="utf-8") as f:
-        for line in f:
-            data = json.loads(line)
-            queries.append(data["query"])
-    return queries
-
-
-def build_index_from_embeddings(
-    embeddings_file: str, output_path: str, backend: str = "hnsw"
-):
-    """
-    Build a LEANN index from pre-computed embeddings.
-
-    Args:
-        embeddings_file: Path to pickle file with (ids, embeddings) tuple
-        output_path: Path where to save the index
-        backend: Backend to use ("hnsw" or "diskann")
-    """
-    print(f"Building {backend} index from embeddings: {embeddings_file}")
-
-    # Create builder with appropriate parameters
-    if backend == "hnsw":
-        builder_kwargs = {
-            "M": 32,  # Graph degree
-            "efConstruction": 256,  # Construction complexity
-            "is_compact": True,  # Use compact storage
-            "is_recompute": True,  # Enable pruning for better recall
-        }
-    elif backend == "diskann":
-        builder_kwargs = {
-            "complexity": 64,
-            "graph_degree": 32,
-            "search_memory_maximum": 8.0,  # GB
-            "build_memory_maximum": 16.0,  # GB
-        }
-    else:
-        builder_kwargs = {}
-
-    builder = LeannBuilder(
-        backend_name=backend,
-        embedding_model="facebook/contriever-msmarco",  # Model used to create embeddings
-        dimensions=768,  # Will be auto-detected from embeddings
-        **builder_kwargs,
-    )
-
-    # Build index from precomputed embeddings
-    builder.build_index_from_embeddings(output_path, embeddings_file)
-    print(f"Index saved to: {output_path}")
-    return output_path
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Run recall evaluation on a LEANN index."
-    )
-    parser.add_argument(
-        "index_path",
-        type=str,
-        nargs="?",
-        help="Path to the LEANN index to evaluate or build (optional).",
-    )
-    parser.add_argument(
-        "--mode",
-        choices=["evaluate", "build"],
-        default="evaluate",
-        help="Mode: 'evaluate' existing index or 'build' from embeddings",
-    )
-    parser.add_argument(
-        "--embeddings-file",
-        type=str,
-        help="Path to embeddings pickle file (optional for build mode)",
-    )
-    parser.add_argument(
-        "--backend",
-        choices=["hnsw", "diskann"],
-        default="hnsw",
-        help="Backend to use for building index (default: hnsw)",
-    )
-    parser.add_argument(
-        "--num-queries", type=int, default=10, help="Number of queries to evaluate."
-    )
-    parser.add_argument(
-        "--top-k", type=int, default=3, help="The 'k' value for recall@k."
-    )
-    parser.add_argument(
-        "--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW."
-    )
-    args = parser.parse_args()
-
-    # --- Path Configuration ---
-    # Assumes a project structure where the script is in 'examples/'
-    # and data is in 'data/' at the project root.
-    project_root = Path(__file__).resolve().parent.parent
-    data_root = project_root / "data"
-
-    # Download data based on mode
-    if args.mode == "build":
-        # For building mode, we need embeddings
-        download_data_if_needed(
-            data_root, download_embeddings=False
-        )  # Basic data first
-
-        # Auto-detect dataset type and download embeddings
-        if args.embeddings_file:
-            embeddings_file = args.embeddings_file
-            # Try to detect dataset type from embeddings file path
-            if "rpj_wiki" in str(embeddings_file):
-                dataset_type = "rpj_wiki"
-            elif "dpr" in str(embeddings_file):
-                dataset_type = "dpr"
-            else:
-                dataset_type = "dpr"  # Default
-        else:
-            # Auto-detect from index path if provided, otherwise default to DPR
-            if args.index_path:
-                index_path_str = str(args.index_path)
-                if "rpj_wiki" in index_path_str:
-                    dataset_type = "rpj_wiki"
-                elif "dpr" in index_path_str:
-                    dataset_type = "dpr"
-                else:
-                    dataset_type = "dpr"  # Default to DPR
-            else:
-                dataset_type = "dpr"  # Default to DPR
-
-            embeddings_file = download_embeddings_if_needed(data_root, dataset_type)
-
-        # Auto-generate index path if not provided
-        if not args.index_path:
-            indices_dir = data_root / "indices" / dataset_type
-            indices_dir.mkdir(parents=True, exist_ok=True)
-            args.index_path = str(indices_dir / f"{dataset_type}_from_embeddings")
-            print(f"Auto-generated index path: {args.index_path}")
-
-        print(f"Building index from embeddings: {embeddings_file}")
-        built_index_path = build_index_from_embeddings(
-            embeddings_file, args.index_path, args.backend
-        )
-        print(f"Index built successfully: {built_index_path}")
-
-        # Ask if user wants to run evaluation
-        eval_response = (
-            input("Run evaluation on the built index? (y/n): ").strip().lower()
-        )
-        if eval_response != "y":
-            print("Index building complete. Exiting.")
-            return
-    else:
-        # For evaluation mode, don't need embeddings
-        download_data_if_needed(data_root, download_embeddings=False)
-
-        # Auto-detect index path if not provided
-        if not args.index_path:
-            # Default to using downloaded indices
-            indices_dir = data_root / "indices"
-
-            # Try common datasets in order of preference
-            for dataset in ["dpr", "rpj_wiki"]:
-                dataset_dir = indices_dir / dataset
-                if dataset_dir.exists():
-                    # Look for index files
-                    index_files = list(dataset_dir.glob("*.index")) + list(
-                        dataset_dir.glob("*_disk.index")
-                    )
-                    if index_files:
-                        args.index_path = str(
-                            index_files[0].with_suffix("")
-                        )  # Remove .index extension
-                        print(f"Using index: {args.index_path}")
-                        break
-
-            if not args.index_path:
-                print(
-                    "No indices found. The data download should have included pre-built indices."
-                )
-                print(
-                    "Please check the data/indices/ directory or provide --index-path manually."
-                )
-                sys.exit(1)
-
-    # Detect dataset type from index path to select the correct ground truth
-    index_path_str = str(args.index_path)
-    if "rpj_wiki" in index_path_str:
-        dataset_type = "rpj_wiki"
-    elif "dpr" in index_path_str:
-        dataset_type = "dpr"
-    else:
-        # Fallback: try to infer from the index directory name
-        dataset_type = Path(args.index_path).name
-        print(
-            f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'."
-        )
-
-    queries_file = data_root / "queries" / "nq_open.jsonl"
-    golden_results_file = (
-        data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
-    )
-
-    print(f"INFO: Detected dataset type: {dataset_type}")
-    print(f"INFO: Using queries file: {queries_file}")
-    print(f"INFO: Using ground truth file: {golden_results_file}")
-
-    try:
-        searcher = LeannSearcher(args.index_path)
-        queries = load_queries(queries_file)
-
-        with open(golden_results_file, "r") as f:
-            golden_results_data = json.load(f)
-
-        num_eval_queries = min(args.num_queries, len(queries))
-        queries = queries[:num_eval_queries]
-
-        print(f"\nRunning evaluation on {num_eval_queries} queries...")
-        recall_scores = []
-        search_times = []
-
-        for i in range(num_eval_queries):
-            start_time = time.time()
-            new_results = searcher.search(
-                queries[i], top_k=args.top_k, ef=args.ef_search
-            )
-            search_times.append(time.time() - start_time)
-
-            # Correct Recall Calculation: Based on TEXT content
-            new_texts = {result.text for result in new_results}
-
-            # Get golden texts directly from the searcher's passage manager
-            golden_ids = golden_results_data["indices"][i][: args.top_k]
-            golden_texts = get_golden_texts(searcher, golden_ids)
-
-            overlap = len(new_texts & golden_texts)
-            recall = overlap / len(golden_texts) if golden_texts else 0
-            recall_scores.append(recall)
-
-            print("\n--- EVALUATION RESULTS ---")
-            print(f"Query: {queries[i]}")
-            print(f"New Results: {new_texts}")
-            print(f"Golden Results: {golden_texts}")
-            print(f"Overlap: {overlap}")
-            print(f"Recall: {recall}")
-            print(f"Search Time: {search_times[-1]:.4f}s")
-            print("--------------------------------")
-
-        avg_recall = np.mean(recall_scores) if recall_scores else 0
-        avg_time = np.mean(search_times) if search_times else 0
-
-        print("\n🎉 --- Evaluation Complete ---")
-        print(f"Avg. Recall@{args.top_k} (efSearch={args.ef_search}): {avg_recall:.4f}")
-        print(f"Avg. Search Time: {avg_time:.4f}s")
-
-    except Exception as e:
-        print(f"\n❌ An error occurred during evaluation: {e}")
-        import traceback
-
-        traceback.print_exc()
-
-
-if __name__ == "__main__":
-    main()
--- a/apps/wechat/init.py
+++ b/apps/wechat/init.py
--- a/apps/wechat/main.py
+++ b/apps/wechat/main.py
@@ -1,230 +0,0 @@
-import os
-import asyncio
-import dotenv
-import argparse
-from pathlib import Path
-from typing import List, Any, Optional
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
-from llama_index.core.node_parser import SentenceSplitter
-import requests
-import time
-
-dotenv.load_dotenv()
-
-# Default WeChat export directory
-DEFAULT_WECHAT_EXPORT_DIR = "./wechat_export_direct"
-
-def create_leann_index_from_multiple_wechat_exports(
-    export_dirs: List[Path],
-    index_path: str = "wechat_history_index.leann",
-    max_count: int = -1,
-):
-    """
-    Create LEANN index from multiple WeChat export data sources.
-
-    Args:
-        export_dirs: List of Path objects pointing to WeChat export directories
-        index_path: Path to save the LEANN index
-        max_count: Maximum number of chat entries to process per export
-    """
-    print("Creating LEANN index from multiple WeChat export data sources...")
-
-    # Load documents using WeChatHistoryReader from local readers module
-    from .readers import WeChatHistoryReader
-
-    reader = WeChatHistoryReader()
-
-    INDEX_DIR = Path(index_path).parent
-
-    if not INDEX_DIR.exists():
-        print(f"--- Index directory not found, building new index ---")
-        all_documents = []
-        total_processed = 0
-
-        # Process each WeChat export directory
-        for i, export_dir in enumerate(export_dirs):
-            print(
-                f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}"
-            )
-
-            try:
-                documents = reader.load_data(
-                    wechat_export_dir=str(export_dir),
-                    max_count=max_count,
-                    concatenate_messages=True,  # Disable concatenation - one message per document
-                )
-                if documents:
-                    print(f"Loaded {len(documents)} chat documents from {export_dir}")
-                    all_documents.extend(documents)
-                    total_processed += len(documents)
-
-                    # Check if we've reached the max count
-                    if max_count > 0 and total_processed >= max_count:
-                        print(f"Reached max count of {max_count} documents")
-                        break
-                else:
-                    print(f"No documents loaded from {export_dir}")
-            except Exception as e:
-                print(f"Error processing {export_dir}: {e}")
-                continue
-
-        if not all_documents:
-            print("No documents loaded from any source. Exiting.")
-            return None
-
-        print(
-            f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports"
-        )
-
-        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
-
-        # Convert Documents to text strings and chunk them
-        all_texts = []
-        for doc in all_documents:
-            # Split the document into chunks
-            nodes = text_splitter.get_nodes_from_documents([doc])
-            for node in nodes:
-                text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
-                all_texts.append(text)
-
-        print(
-            f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
-        )
-
-        # Create LEANN index directory
-        print(f"--- Index directory not found, building new index ---")
-        INDEX_DIR.mkdir(exist_ok=True)
-
-        print(f"--- Building new LEANN index ---")
-
-        print(f"\n[PHASE 1] Building Leann index...")
-
-        # Use HNSW backend for better macOS compatibility
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model="Qwen/Qwen3-Embedding-0.6B",
-            graph_degree=32,
-            complexity=64,
-            is_compact=True,
-            is_recompute=True,
-            num_threads=1,  # Force single-threaded mode
-        )
-
-        print(f"Adding {len(all_texts)} chat chunks to index...")
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
-
-        builder.build_index(index_path)
-        print(f"\nLEANN index built at {index_path}!")
-    else:
-        print(f"--- Using existing index at {INDEX_DIR} ---")
-
-    return index_path
-
-async def query_leann_index(index_path: str, query: str):
-    """
-    Query the LEANN index.
-
-    Args:
-        index_path: Path to the LEANN index
-        query: The query string
-    """
-    print(f"\n[PHASE 2] Starting Leann chat session...")
-    chat = LeannChat(index_path=index_path)
-
-    print(f"You: {query}")
-    chat_response = chat.ask(
-        query,
-        top_k=20,
-        recompute_beighbor_embeddings=True,
-        complexity=16,
-        beam_width=1,
-        llm_config={
-            "type": "openai",
-            "model": "gpt-4o",
-            "api_key": os.getenv("OPENAI_API_KEY"),
-        },
-        llm_kwargs={"temperature": 0.0, "max_tokens": 1000},
-    )
-    print(f"Leann: {chat_response}")
-
-async def main():
-    """Main function with integrated WeChat export functionality."""
-
-    # Parse command line arguments
-    parser = argparse.ArgumentParser(
-        description="LEANN WeChat History Reader - Create and query WeChat chat history index"
-    )
-    parser.add_argument(
-        "--export-dir",
-        type=str,
-        default=DEFAULT_WECHAT_EXPORT_DIR,
-        help=f"Directory to store WeChat exports (default: {DEFAULT_WECHAT_EXPORT_DIR})",
-    )
-    parser.add_argument(
-        "--index-dir",
-        type=str,
-        default="./wechat_history_magic_test_11Debug_new",
-        help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
-    )
-    parser.add_argument(
-        "--max-entries",
-        type=int,
-        default=50,
-        help="Maximum number of chat entries to process (default: 5000)",
-    )
-    parser.add_argument(
-        "--query",
-        type=str,
-        default=None,
-        help="Single query to run (default: runs example queries)",
-    )
-    parser.add_argument(
-        "--force-export",
-        action="store_true",
-        default=False,
-        help="Force re-export of WeChat data even if exports exist",
-    )
-
-    args = parser.parse_args()
-
-    INDEX_DIR = Path(args.index_dir)
-    INDEX_PATH = str(INDEX_DIR / "wechat_history.leann")
-
-    print(f"Using WeChat export directory: {args.export_dir}")
-    print(f"Index directory: {INDEX_DIR}")
-    print(f"Max entries: {args.max_entries}")
-
-    # Initialize WeChat reader with export capabilities
-    from .readers import WeChatHistoryReader
-
-    reader = WeChatHistoryReader()
-
-    # Find existing exports or create new ones using the centralized method
-    export_dirs = reader.find_or_export_wechat_data(args.export_dir)
-    if not export_dirs:
-        print("Failed to find or export WeChat data. Exiting.")
-        return
-
-    # Create or load the LEANN index from all sources
-    index_path = create_leann_index_from_multiple_wechat_exports(
-        export_dirs, INDEX_PATH, max_count=args.max_entries
-    )
-
-    if index_path:
-        if args.query:
-            # Run single query
-            await query_leann_index(index_path, args.query)
-        else:
-            # Example queries
-            queries = [
-                "我想买魔术师约翰逊的球衣，给我一些对应聊天记录?",
-            ]
-
-            for query in queries:
-                print("\n" + "=" * 60)
-                await query_leann_index(index_path, query)
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/apps/wechat/readers.py
+++ b/apps/wechat/readers.py
@@ -1,719 +0,0 @@
-import json
-import os
-import re
-import subprocess
-import sys
-import time
-from pathlib import Path
-from typing import List, Any, Dict, Optional
-from llama_index.core import Document
-from llama_index.core.readers.base import BaseReader
-from datetime import datetime
-
-class WeChatHistoryReader(BaseReader):
-    """
-    WeChat chat history reader that extracts chat data from exported JSON files.
-    
-    Reads WeChat chat history from exported JSON files (from wechat-exporter tool)
-    and creates documents with embedded metadata similar to the Chrome history reader structure.
-    
-    Also includes utilities for automatic WeChat chat history export.
-    """
-    
-    def __init__(self) -> None:
-        """Initialize."""
-        self.packages_dir = Path(__file__).parent.parent.parent / "packages"
-        self.wechat_exporter_dir = self.packages_dir / "wechat-exporter"
-        self.wechat_decipher_dir = self.packages_dir / "wechat-decipher-macos"
-    
-    def check_wechat_running(self) -> bool:
-        """Check if WeChat is currently running."""
-        try:
-            result = subprocess.run(["pgrep", "-f", "WeChat"], capture_output=True, text=True)
-            return result.returncode == 0
-        except Exception:
-            return False
-    
-    def install_wechattweak(self) -> bool:
-        """Install WeChatTweak CLI tool."""
-        try:
-            # Create wechat-exporter directory if it doesn't exist
-            self.wechat_exporter_dir.mkdir(parents=True, exist_ok=True)
-            
-            wechattweak_path = self.wechat_exporter_dir / "wechattweak-cli"
-            if not wechattweak_path.exists():
-                print("Downloading WeChatTweak CLI...")
-                subprocess.run([
-                    "curl", "-L", "-o", str(wechattweak_path),
-                    "https://github.com/JettChenT/WeChatTweak-CLI/releases/latest/download/wechattweak-cli"
-                ], check=True)
-            
-            # Make executable
-            wechattweak_path.chmod(0o755)
-            
-            # Install WeChatTweak
-            print("Installing WeChatTweak...")
-            subprocess.run(["sudo", str(wechattweak_path), "install"], check=True)
-            return True
-        except Exception as e:
-            print(f"Error installing WeChatTweak: {e}")
-            return False
-    
-    def restart_wechat(self):
-        """Restart WeChat to apply WeChatTweak."""
-        try:
-            print("Restarting WeChat...")
-            subprocess.run(["pkill", "-f", "WeChat"], check=False)
-            time.sleep(2)
-            subprocess.run(["open", "-a", "WeChat"], check=True)
-            time.sleep(5)  # Wait for WeChat to start
-        except Exception as e:
-            print(f"Error restarting WeChat: {e}")
-    
-    def check_api_available(self) -> bool:
-        """Check if WeChatTweak API is available."""
-        try:
-            result = subprocess.run([
-                "curl", "-s", "http://localhost:48065/wechat/allcontacts"
-            ], capture_output=True, text=True, timeout=5)
-            return result.returncode == 0 and result.stdout.strip()
-        except Exception:
-            return False
-    
-
-
-    
-    def _extract_readable_text(self, content: str) -> str:
-        """
-        Extract readable text from message content, removing XML and system messages.
-        
-        Args:
-            content: The raw message content (can be string or dict)
-            
-        Returns:
-            Cleaned, readable text
-        """
-        if not content:
-            return ""
-        
-        # Handle dictionary content (like quoted messages)
-        if isinstance(content, dict):
-            # Extract text from dictionary structure
-            text_parts = []
-            if 'title' in content:
-                text_parts.append(str(content['title']))
-            if 'quoted' in content:
-                text_parts.append(str(content['quoted']))
-            if 'content' in content:
-                text_parts.append(str(content['content']))
-            if 'text' in content:
-                text_parts.append(str(content['text']))
-            
-            if text_parts:
-                return " | ".join(text_parts)
-            else:
-                # If we can't extract meaningful text from dict, return empty
-                return ""
-        
-        # Handle string content
-        if not isinstance(content, str):
-            return ""
-        
-        # Remove common prefixes like "wxid_xxx:\n"
-        clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
-        clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
-        
-        # If it's just XML or system message, return empty
-        if clean_content.strip().startswith('<') or 'recalled a message' in clean_content:
-            return ""
-        
-        return clean_content.strip()
-    
-    def _is_text_message(self, content: str) -> bool:
-        """
-        Check if a message contains readable text content.
-        
-        Args:
-            content: The message content (can be string or dict)
-            
-        Returns:
-            True if the message contains readable text, False otherwise
-        """
-        if not content:
-            return False
-        
-        # Handle dictionary content
-        if isinstance(content, dict):
-            # Check if dict has any readable text fields
-            text_fields = ['title', 'quoted', 'content', 'text']
-            for field in text_fields:
-                if field in content and content[field]:
-                    return True
-            return False
-        
-        # Handle string content
-        if not isinstance(content, str):
-            return False
-        
-        # Skip image messages (contain XML with img tags)
-        if '<img' in content and 'cdnurl' in content:
-            return False
-        
-        # Skip emoji messages (contain emoji XML tags)
-        if '<emoji' in content and 'productid' in content:
-            return False
-        
-        # Skip voice messages
-        if '<voice' in content:
-            return False
-        
-        # Skip video messages
-        if '<video' in content:
-            return False
-        
-        # Skip file messages
-        if '<appmsg' in content and 'appid' in content:
-            return False
-        
-        # Skip system messages (like "recalled a message")
-        if 'recalled a message' in content:
-            return False
-        
-        # Check if there's actual readable text (not just XML or system messages)
-        # Remove common prefixes like "wxid_xxx:\n" and check for actual content
-        clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
-        clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
-        
-        # If after cleaning we have meaningful text, consider it readable
-        if len(clean_content.strip()) > 0 and not clean_content.strip().startswith('<'):
-            return True
-        
-        return False
-    
-    def _concatenate_messages(self, messages: List[Dict], max_length: int = 128, 
-                             time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
-        """
-        Concatenate messages based on length and time rules.
-        
-        Args:
-            messages: List of message dictionaries
-            max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
-            time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
-            overlap_messages: Number of messages to overlap between consecutive groups
-            
-        Returns:
-            List of concatenated message groups
-        """
-        if not messages:
-            return []
-        
-        concatenated_groups = []
-        current_group = []
-        current_length = 0
-        last_timestamp = None
-        
-        for message in messages:
-            # Extract message info
-            content = message.get('content', '')
-            message_text = message.get('message', '')
-            create_time = message.get('createTime', 0)
-            from_user = message.get('fromUser', '')
-            to_user = message.get('toUser', '')
-            is_sent_from_self = message.get('isSentFromSelf', False)
-            
-            # Extract readable text
-            readable_text = self._extract_readable_text(content)
-            if not readable_text:
-                readable_text = message_text
-            
-            # Skip empty messages
-            if not readable_text.strip():
-                continue
-            
-            # Check time window constraint (only if time_window_minutes != -1)
-            if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
-                time_diff_minutes = (create_time - last_timestamp) / 60
-                if time_diff_minutes > time_window_minutes:
-                    # Time gap too large, start new group
-                    if current_group:
-                        concatenated_groups.append({
-                            'messages': current_group,
-                            'total_length': current_length,
-                            'start_time': current_group[0].get('createTime', 0),
-                            'end_time': current_group[-1].get('createTime', 0)
-                        })
-                        # Keep last few messages for overlap
-                        if overlap_messages > 0 and len(current_group) > overlap_messages:
-                            current_group = current_group[-overlap_messages:]
-                            current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
-                        else:
-                            current_group = []
-                            current_length = 0
-            
-            # Check length constraint (only if max_length != -1)
-            message_length = len(readable_text)
-            if max_length != -1 and current_length + message_length > max_length and current_group:
-                # Current group would exceed max length, save it and start new
-                concatenated_groups.append({
-                    'messages': current_group,
-                    'total_length': current_length,
-                    'start_time': current_group[0].get('createTime', 0),
-                    'end_time': current_group[-1].get('createTime', 0)
-                })
-                # Keep last few messages for overlap
-                if overlap_messages > 0 and len(current_group) > overlap_messages:
-                    current_group = current_group[-overlap_messages:]
-                    current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
-                else:
-                    current_group = []
-                    current_length = 0
-            
-            # Add message to current group
-            current_group.append(message)
-            current_length += message_length
-            last_timestamp = create_time
-        
-        # Add the last group if it exists
-        if current_group:
-            concatenated_groups.append({
-                'messages': current_group,
-                'total_length': current_length,
-                'start_time': current_group[0].get('createTime', 0),
-                'end_time': current_group[-1].get('createTime', 0)
-            })
-        
-        return concatenated_groups
-    
-    def _create_concatenated_content(self, message_group: Dict, contact_name: str) -> str:
-        """
-        Create concatenated content from a group of messages.
-        
-        Args:
-            message_group: Dictionary containing messages and metadata
-            contact_name: Name of the contact
-            
-        Returns:
-            Formatted concatenated content
-        """
-        messages = message_group['messages']
-        start_time = message_group['start_time']
-        end_time = message_group['end_time']
-        
-        # Format timestamps
-        if start_time:
-            try:
-                start_timestamp = datetime.fromtimestamp(start_time)
-                start_time_str = start_timestamp.strftime('%Y-%m-%d %H:%M:%S')
-            except:
-                start_time_str = str(start_time)
-        else:
-            start_time_str = "Unknown"
-        
-        if end_time:
-            try:
-                end_timestamp = datetime.fromtimestamp(end_time)
-                end_time_str = end_timestamp.strftime('%Y-%m-%d %H:%M:%S')
-            except:
-                end_time_str = str(end_time)
-        else:
-            end_time_str = "Unknown"
-        
-        # Build concatenated message content
-        message_parts = []
-        for message in messages:
-            content = message.get('content', '')
-            message_text = message.get('message', '')
-            create_time = message.get('createTime', 0)
-            is_sent_from_self = message.get('isSentFromSelf', False)
-            
-            # Extract readable text
-            readable_text = self._extract_readable_text(content)
-            if not readable_text:
-                readable_text = message_text
-            
-            # Format individual message
-            if create_time:
-                try:
-                    timestamp = datetime.fromtimestamp(create_time)
-                    # change to YYYY-MM-DD HH:MM:SS
-                    time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
-                except:
-                    time_str = str(create_time)
-            else:
-                time_str = "Unknown"
-            
-            sender = "[Me]" if is_sent_from_self else "[Contact]"
-            message_parts.append(f"({time_str}) {sender}: {readable_text}")
-        
-        concatenated_text = "\n".join(message_parts)
-        
-        # Create final document content
-        doc_content = f"""
-Contact: {contact_name}
-Time Range: {start_time_str} - {end_time_str}
-Messages ({len(messages)} messages, {message_group['total_length']} chars):
-
-{concatenated_text}
-"""
-        # TODO @yichuan give better format and rich info here!    
-        doc_content = f"""
-{concatenated_text}
-"""
-        return doc_content, contact_name
-    
-    def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
-        """
-        Load WeChat chat history data from exported JSON files.
-        
-        Args:
-            input_dir: Directory containing exported WeChat JSON files
-            **load_kwargs:
-                max_count (int): Maximum amount of chat entries to read.
-                wechat_export_dir (str): Custom path to WeChat export directory.
-                include_non_text (bool): Whether to include non-text messages (images, emojis, etc.)
-                concatenate_messages (bool): Whether to concatenate messages based on length rules.
-                max_length (int): Maximum length for concatenated message groups (default: 1000).
-                time_window_minutes (int): Time window in minutes to group messages together (default: 30).
-                overlap_messages (int): Number of messages to overlap between consecutive groups (default: 2).
-        """
-        docs: List[Document] = []
-        max_count = load_kwargs.get('max_count', 1000)
-        wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
-        include_non_text = load_kwargs.get('include_non_text', False)
-        concatenate_messages = load_kwargs.get('concatenate_messages', False)
-        max_length = load_kwargs.get('max_length', 1000)
-        time_window_minutes = load_kwargs.get('time_window_minutes', 30)
-        
-        # Default WeChat export path
-        if wechat_export_dir is None:
-            wechat_export_dir = "./wechat_export_test"
-        
-        if not os.path.exists(wechat_export_dir):
-            print(f"WeChat export directory not found at: {wechat_export_dir}")
-            return docs
-        
-        try:
-            # Find all JSON files in the export directory
-            json_files = list(Path(wechat_export_dir).glob("*.json"))
-            print(f"Found {len(json_files)} WeChat chat history files")
-            
-            count = 0
-            for json_file in json_files:
-                if count >= max_count and max_count > 0:
-                    break
-                
-                try:
-                    with open(json_file, 'r', encoding='utf-8') as f:
-                        chat_data = json.load(f)
-                    
-                    # Extract contact name from filename
-                    contact_name = json_file.stem
-                    
-                    if concatenate_messages:
-                        # Filter messages to only include readable text messages
-                        readable_messages = []
-                        for message in chat_data:
-                            try:
-                                content = message.get('content', '')
-                                if not include_non_text and not self._is_text_message(content):
-                                    continue
-                                
-                                readable_text = self._extract_readable_text(content)
-                                if not readable_text and not include_non_text:
-                                    continue
-                                
-                                readable_messages.append(message)
-                            except Exception as e:
-                                print(f"Error processing message in {json_file}: {e}")
-                                continue
-                        
-                        # Concatenate messages based on rules
-                        message_groups = self._concatenate_messages(
-                            readable_messages, 
-                            max_length=-1, 
-                            time_window_minutes=-1,
-                            overlap_messages=0  # Keep 2 messages overlap between groups
-                        )
-                        
-                        # Create documents from concatenated groups
-                        for message_group in message_groups:
-                            if count >= max_count and max_count > 0:
-                                break
-                            
-                            doc_content, contact_name  = self._create_concatenated_content(message_group, contact_name)
-                            doc = Document(text=doc_content, metadata={"contact_name": contact_name})
-                            docs.append(doc)
-                            count += 1
-                        
-                        print(f"Created {len(message_groups)} concatenated message groups for {contact_name}")
-                        
-                    else:
-                        # Original single-message processing
-                        for message in chat_data:
-                            if count >= max_count and max_count > 0:
-                                break
-                            
-                            # Extract message information
-                            from_user = message.get('fromUser', '')
-                            to_user = message.get('toUser', '')
-                            content = message.get('content', '')
-                            message_text = message.get('message', '')
-                            create_time = message.get('createTime', 0)
-                            is_sent_from_self = message.get('isSentFromSelf', False)
-                            
-                            # Handle content that might be dict or string
-                            try:
-                                # Check if this is a readable text message
-                                if not include_non_text and not self._is_text_message(content):
-                                    continue
-                                
-                                # Extract readable text
-                                readable_text = self._extract_readable_text(content)
-                                if not readable_text and not include_non_text:
-                                    continue
-                            except Exception as e:
-                                # Skip messages that cause processing errors
-                                print(f"Error processing message in {json_file}: {e}")
-                                continue
-                            
-                            # Convert timestamp to readable format
-                            if create_time:
-                                try:
-                                    timestamp = datetime.fromtimestamp(create_time)
-                                    time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
-                                except:
-                                    time_str = str(create_time)
-                            else:
-                                time_str = "Unknown"
-                            
-                            # Create document content with metadata header and contact info
-                            doc_content = f"""
-Contact: {contact_name}
-Is sent from self: {is_sent_from_self}
-Time: {time_str}
-Message: {readable_text if readable_text else message_text}
-"""
-                            
-                            # Create document with embedded metadata
-                            doc = Document(text=doc_content, metadata={})
-                            docs.append(doc)
-                            count += 1
-                        
-                except Exception as e:
-                    print(f"Error reading {json_file}: {e}")
-                    continue
-            
-            print(f"Loaded {len(docs)} WeChat chat documents")
-            
-        except Exception as e:
-            print(f"Error reading WeChat history: {e}")
-            return docs
-        
-        return docs
-
-    @staticmethod
-    def find_wechat_export_dirs() -> List[Path]:
-        """
-        Find all WeChat export directories.
-        
-        Returns:
-            List of Path objects pointing to WeChat export directories
-        """
-        export_dirs = []
-        
-        # Look for common export directory names
-        possible_dirs = [
-            Path("./wechat_export_test"),
-            Path("./wechat_export"),
-            Path("./wechat_chat_history"),
-            Path("./chat_export")
-        ]
-        
-        for export_dir in possible_dirs:
-            if export_dir.exists() and export_dir.is_dir():
-                json_files = list(export_dir.glob("*.json"))
-                if json_files:
-                    export_dirs.append(export_dir)
-                    print(f"Found WeChat export directory: {export_dir} with {len(json_files)} files")
-        
-        print(f"Found {len(export_dirs)} WeChat export directories")
-        return export_dirs
-
-    @staticmethod
-    def export_chat_to_file(output_file: str = "wechat_chat_export.txt", max_count: int = 1000, export_dir: str = None, include_non_text: bool = False):
-        """
-        Export WeChat chat history to a text file.
-        
-        Args:
-            output_file: Path to the output file
-            max_count: Maximum number of entries to export
-            export_dir: Directory containing WeChat JSON files
-            include_non_text: Whether to include non-text messages
-        """
-        if export_dir is None:
-            export_dir = "./wechat_export_test"
-        
-        if not os.path.exists(export_dir):
-            print(f"WeChat export directory not found at: {export_dir}")
-            return
-        
-        try:
-            json_files = list(Path(export_dir).glob("*.json"))
-            
-            with open(output_file, 'w', encoding='utf-8') as f:
-                count = 0
-                for json_file in json_files:
-                    if count >= max_count and max_count > 0:
-                        break
-                    
-                    try:
-                        with open(json_file, 'r', encoding='utf-8') as json_f:
-                            chat_data = json.load(json_f)
-                        
-                        contact_name = json_file.stem
-                        f.write(f"\n=== Chat with {contact_name} ===\n")
-                        
-                        for message in chat_data:
-                            if count >= max_count and max_count > 0:
-                                break
-                            
-                            from_user = message.get('fromUser', '')
-                            content = message.get('content', '')
-                            message_text = message.get('message', '')
-                            create_time = message.get('createTime', 0)
-                            
-                            # Skip non-text messages unless requested
-                            if not include_non_text:
-                                reader = WeChatHistoryReader()
-                                if not reader._is_text_message(content):
-                                    continue
-                                readable_text = reader._extract_readable_text(content)
-                                if not readable_text:
-                                    continue
-                                message_text = readable_text
-                            
-                            if create_time:
-                                try:
-                                    timestamp = datetime.fromtimestamp(create_time)
-                                    time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
-                                except:
-                                    time_str = str(create_time)
-                            else:
-                                time_str = "Unknown"
-                            
-                            f.write(f"[{time_str}] {from_user}: {message_text}\n")
-                            count += 1
-                            
-                    except Exception as e:
-                        print(f"Error processing {json_file}: {e}")
-                        continue
-            
-            print(f"Exported {count} chat entries to {output_file}")
-            
-        except Exception as e:
-            print(f"Error exporting WeChat chat history: {e}")
-
-    def export_wechat_chat_history(self, export_dir: str = "./wechat_export_direct") -> Optional[Path]:
-        """
-        Export WeChat chat history using wechat-exporter tool.
-        
-        Args:
-            export_dir: Directory to save exported chat history
-            
-        Returns:
-            Path to export directory if successful, None otherwise
-        """
-        try:
-            import subprocess
-            import sys
-            
-            # Create export directory
-            export_path = Path(export_dir)
-            export_path.mkdir(exist_ok=True)
-            
-            print(f"Exporting WeChat chat history to {export_path}...")
-            
-            # Check if wechat-exporter directory exists
-            if not self.wechat_exporter_dir.exists():
-                print(f"wechat-exporter directory not found at: {self.wechat_exporter_dir}")
-                return None
-            
-            # Install requirements if needed
-            requirements_file = self.wechat_exporter_dir / "requirements.txt"
-            if requirements_file.exists():
-                print("Installing wechat-exporter requirements...")
-                subprocess.run([
-                    "uv", "pip", "install", "-r", str(requirements_file)
-                ], check=True)
-            
-            # Run the export command
-            print("Running wechat-exporter...")
-            result = subprocess.run([
-                sys.executable, str(self.wechat_exporter_dir / "main.py"), 
-                "export-all", str(export_path)
-            ], capture_output=True, text=True, check=True)
-            
-            print("Export command output:")
-            print(result.stdout)
-            if result.stderr:
-                print("Export errors:")
-                print(result.stderr)
-            
-            # Check if export was successful
-            if export_path.exists() and any(export_path.glob("*.json")):
-                json_files = list(export_path.glob("*.json"))
-                print(f"Successfully exported {len(json_files)} chat history files to {export_path}")
-                return export_path
-            else:
-                print("Export completed but no JSON files found")
-                return None
-                
-        except subprocess.CalledProcessError as e:
-            print(f"Export command failed: {e}")
-            print(f"Command output: {e.stdout}")
-            print(f"Command errors: {e.stderr}")
-            return None
-        except Exception as e:
-            print(f"Export failed: {e}")
-            print("Please ensure WeChat is running and WeChatTweak is installed.")
-            return None
-
-    def find_or_export_wechat_data(self, export_dir: str = "./wechat_export_direct") -> List[Path]:
-        """
-        Find existing WeChat exports or create new ones.
-        
-        Args:
-            export_dir: Directory to save exported chat history if needed
-            
-        Returns:
-            List of Path objects pointing to WeChat export directories
-        """
-        export_dirs = []
-        
-        # Look for existing exports in common locations
-        possible_export_dirs = [
-            Path("./wechat_database_export"),
-            Path("./wechat_export_test"),
-            Path("./wechat_export"),
-            Path("./wechat_export_direct"),
-            Path("./wechat_chat_history"),
-            Path("./chat_export")
-        ]
-        
-        for export_dir_path in possible_export_dirs:
-            if export_dir_path.exists() and any(export_dir_path.glob("*.json")):
-                export_dirs.append(export_dir_path)
-                print(f"Found existing export: {export_dir_path}")
-        
-        # If no existing exports, try to export automatically
-        if not export_dirs:
-            print("No existing WeChat exports found. Starting direct export...")
-            
-            # Try to export using wechat-exporter
-            exported_path = self.export_wechat_chat_history(export_dir)
-            if exported_path:
-                export_dirs = [exported_path]
-            else:
-                print("Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.")
-        
-        return export_dirs 
--- a/assets/effects.png
+++ b/assets/effects.png
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -7,31 +7,29 @@
   "outputs": [],
   "source": [
    "from leann.api import LeannBuilder, LeannSearcher, LeannChat\n",
-    "\n",
-    "# 1. Build the index (no embeddings stored!)\n",
+    "# 1. Build index (no embeddings stored!)\n",
    "builder = LeannBuilder(backend_name=\"hnsw\")\n",
-    "builder.add_text(\"C# is a powerful programming language\")\n",
+    "builder.add_text(\"C# is a powerful programming language but it is not very popular\")\n",
    "builder.add_text(\"Python is a powerful programming language and it is very popular\")\n",
-    "builder.add_text(\"Machine learning transforms industries\")\n",
+    "builder.add_text(\"Machine learning transforms industries\")  \n",
    "builder.add_text(\"Neural networks process complex data\")\n",
-    "builder.add_text(\"Leann is a great storage saving engine for RAG on your MacBook\")\n",
+    "builder.add_text(\"Leann is a great storage saving engine for RAG on your macbook\")\n",
    "builder.build_index(\"knowledge.leann\")\n",
-    "\n",
    "# 2. Search with real-time embeddings\n",
    "searcher = LeannSearcher(\"knowledge.leann\")\n",
-    "results = searcher.search(\"programming languages\", top_k=2)\n",
+    "results = searcher.search(\"programming languages\", top_k=2, recompute_beighbor_embeddings=True)\n",
+    "print(results)\n",
    "\n",
-    "# 3. Chat with LEANN using retrieved results\n",
-    "llm_config = {\n",
-    "    \"type\": \"ollama\",\n",
-    "    \"model\": \"llama3.2:1b\"\n",
-    "}\n",
+    "llm_config = {\"type\": \"ollama\", \"model\": \"qwen3:8b\"}\n",
    "\n",
    "chat = LeannChat(index_path=\"knowledge.leann\", llm_config=llm_config)\n",
+    "\n",
    "response = chat.ask(\n",
-    "    \"Compare the two retrieved programming languages and say which one is more popular today.\",\n",
+    "    \"Compare the two retrieved programming languages and say which one is more popular today. Respond in a single well-formed sentence.\",\n",
    "    top_k=2,\n",
-    ")"
+    "    recompute_beighbor_embeddings=True,\n",
+    ")\n",
+    "print(response)"
   ]
  }
 ],
--- a/examples/data/2501.14312v1
+++ b/examples/data/2501.14312v1
--- a/examples/data/2506.08276v1.pdf
+++ b/examples/data/2506.08276v1.pdf
--- a/examples/data/PrideandPrejudice.txt
+++ b/examples/data/PrideandPrejudice.txt
--- a/apps/documents/data/README.md
+++ b/apps/documents/data/README.md
--- a/examples/document_search.py
+++ b/examples/document_search.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""
+Document search demo with recompute mode
+"""
+
+import os
+from pathlib import Path
+import shutil
+import time
+
+# Import backend packages to trigger plugin registration
+try:
+    import leann_backend_diskann
+    import leann_backend_hnsw
+    print("INFO: Backend packages imported successfully.")
+except ImportError as e:
+    print(f"WARNING: Could not import backend packages. Error: {e}")
+
+# Import upper-level API from leann-core
+from leann.api import LeannBuilder, LeannSearcher, LeannChat
+
+
+def load_sample_documents():
+    """Return the three hard-coded demo documents as dicts with "title" and "content" keys."""
+    samples = (
+        ("Intro to Python", "Python is a high-level, interpreted language known for simplicity."),
+        ("ML Basics", "Machine learning builds systems that learn from data."),
+        ("Data Structures", "Data structures like arrays, lists, and graphs organize data."),
+    )
+    return [{"title": title, "content": content} for title, content in samples]
+
+def main():
+    """Run the demo: build a DiskANN index, time a basic and a recompute search, then chat.
+
+    A recompute failure is caught and reported so the chat phase still runs, and the
+    closing banner then reports that failure instead of claiming success.
+    """
+    print("==========================================================")
+    print("=== Leann Document Search Demo (DiskANN + Recompute) ===")
+    print("==========================================================")
+
+    INDEX_DIR = Path("./test_indices")
+    INDEX_PATH = str(INDEX_DIR / "documents.diskann")
+    BACKEND_TO_TEST = "diskann"
+
+    if INDEX_DIR.exists():
+        print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
+        shutil.rmtree(INDEX_DIR)
+
+    # --- 1. Build index ---
+    print(f"\n[PHASE 1] Building index using '{BACKEND_TO_TEST}' backend...")
+    builder = LeannBuilder(backend_name=BACKEND_TO_TEST, graph_degree=32, complexity=64)
+
+    documents = load_sample_documents()
+    print(f"Loaded {len(documents)} sample documents.")
+    for doc in documents:
+        builder.add_text(doc["content"], metadata={"title": doc["title"]})
+
+    builder.build_index(INDEX_PATH)
+    print("\nIndex built!")
+
+    # --- 2. Basic search demo ---
+    print(f"\n[PHASE 2] Basic search using '{BACKEND_TO_TEST}' backend...")
+    searcher = LeannSearcher(index_path=INDEX_PATH)
+
+    query = "What is machine learning?"
+    print(f"\nQuery: '{query}'")
+
+    print("\n--- Basic search mode (PQ computation) ---")
+    start_time = time.time()
+    results = searcher.search(query, top_k=2)
+    basic_time = time.time() - start_time
+
+    print(f"⏱️  Basic search time: {basic_time:.3f} seconds")
+    print(">>> Basic search results <<<")
+    for i, res in enumerate(results, 1):
+        print(f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}")
+
+    # --- 3. Recompute search demo ---
+    print("\n[PHASE 3] Recompute search using embedding server...")
+    print("\n--- Recompute search mode (get real embeddings via network) ---")
+
+    # Configure recompute parameters
+    recompute_params = {
+        "recompute_beighbor_embeddings": True,  # Enable network recomputation
+        "USE_DEFERRED_FETCH": False,           # Don't use deferred fetch
+        "skip_search_reorder": True,           # Skip search reordering
+        "dedup_node_dis": True,               # Enable node distance deduplication
+        "prune_ratio": 0.1,                   # Pruning ratio 10%
+        "batch_recompute": False,             # Don't use batch recomputation
+        "global_pruning": False,              # Don't use global pruning
+        "zmq_port": 5555,                     # ZMQ port
+        "embedding_model": "sentence-transformers/all-mpnet-base-v2"
+    }
+
+    print("Recompute parameter configuration:")
+    for key, value in recompute_params.items():
+        print(f"  {key}: {value}")
+
+    print("\n🔄 Executing Recompute search...")
+    recompute_ok = False  # flipped only once phase 3 completes without raising
+    try:
+        start_time = time.time()
+        recompute_results = searcher.search(query, top_k=2, **recompute_params)
+        recompute_time = time.time() - start_time
+
+        print(f"⏱️  Recompute search time: {recompute_time:.3f} seconds")
+        print(">>> Recompute search results <<<")
+        for i, res in enumerate(recompute_results, 1):
+            print(f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}")
+
+        # Compare results
+        print("\n--- Result comparison ---")
+        print(f"Basic search time: {basic_time:.3f} seconds")
+        print(f"Recompute time: {recompute_time:.3f} seconds")
+
+        print("\nBasic search vs Recompute results:")
+        for i in range(min(len(results), len(recompute_results))):
+            basic_score = results[i].score
+            recompute_score = recompute_results[i].score
+            score_diff = abs(basic_score - recompute_score)
+            print(f"  Position {i+1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}")
+
+        if recompute_time > basic_time:
+            print("✅ Recompute mode working correctly (more accurate but slower)")
+        else:
+            print("ℹ️  Recompute time is unusually fast, network recomputation may not be enabled")
+        recompute_ok = True
+    except Exception as e:
+        print(f"❌ Recompute search failed: {e}")
+        print("This usually indicates an embedding server connection issue")
+
+    # --- 4. Chat demo ---
+    print("\n[PHASE 4] Starting chat session...")
+    chat = LeannChat(index_path=INDEX_PATH)
+    chat_response = chat.ask(query)
+    print(f"You: {query}")
+    print(f"Leann: {chat_response}")
+
+    print("\n==========================================================")
+    print("✅ Demo finished successfully!" if recompute_ok else "⚠️  Demo finished, but the recompute search failed (see above).")
+    print("==========================================================")
+
+
+if __name__ == "__main__":
+    main()  # run the demo only when this file is executed as a script, not on import
--- a/examples/email_data/LEANN_email_reader.py
+++ b/examples/email_data/LEANN_email_reader.py
@@ -96,12 +96,14 @@ class EmlxReader(BaseReader):
                                
                                # Create document content with metadata embedded in text
                                doc_content = f"""
-[File]: {filename}
-[From]: {from_addr}
-[To]: {to_addr}
-[Subject]: {subject}
-[Date]: {date}
-[EMAIL BODY Start]:
+[EMAIL METADATA]
+File: {filename}
+From: {from_addr}
+To: {to_addr}
+Subject: {subject}
+Date: {date}
+[END METADATA]
+
 {body}
 """
                                
--- a/examples/google_history_reader_leann.py
+++ b/examples/google_history_reader_leann.py
@@ -65,14 +65,12 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
        
        if not all_documents:
            print("No documents loaded from any source. Exiting.")
-            # highlight info that you need to close all chrome browser before running this script and high light the instruction!!
-            print("\033[91mYou need to close or quit all chrome browser before running this script\033[0m")
            return None
        
        print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles")
        
        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
+        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
        
        # Convert Documents to text strings and chunk them
        all_texts = []
@@ -80,9 +78,7 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
            # Split the document into chunks
            nodes = text_splitter.get_nodes_from_documents([doc])
            for node in nodes:
-                text = node.get_content()
-                # text = '[Title] ' + doc.metadata["title"] + '\n' + text
-                all_texts.append(text)
+                all_texts.append(node.get_content())
        
        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
        
@@ -229,7 +225,7 @@ async def main():
    parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
    parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
                       help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
-    parser.add_argument('--index-dir', type=str, default="./all_google_new",
+    parser.add_argument('--index-dir', type=str, default="./chrome_history_index_leann_test",
                       help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
    parser.add_argument('--max-entries', type=int, default=1000,
                       help='Maximum number of history entries to process (default: 1000)')
--- a/examples/history_data/history.py
+++ b/examples/history_data/history.py
@@ -74,17 +74,22 @@ class ChromeHistoryReader(BaseReader):
                
                # Create document content with metadata embedded in text
                doc_content = f"""
-[Title]: {title}
-[URL of the page]: {url}
-[Last visited time]: {last_visit}
-[Visit times]: {visit_count}
-[Typed times]: {typed_count}
+[BROWSING HISTORY METADATA]
+URL: {url}
+Title: {title}
+Last Visit: {last_visit}
+Visit Count: {visit_count}
+Typed Count: {typed_count}
+Hidden: {hidden}
+[END METADATA]
+
+Title: {title}
+URL: {url}
+Last visited: {last_visit}
 """
                
                # Create document with embedded metadata
-                doc = Document(text=doc_content, metadata={ "title": title[0:150]})
-                # if len(title) > 150:
-                #     print(f"Title is too long: {title}")
+                doc = Document(text=doc_content, metadata={})
                docs.append(doc)
                count += 1
            
--- a/examples/history_data/wechat_history.py
+++ b/examples/history_data/wechat_history.py
@@ -197,8 +197,8 @@ class WeChatHistoryReader(BaseReader):
        
        Args:
            messages: List of message dictionaries
-            max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
-            time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
+            max_length: Maximum length for concatenated message groups
+            time_window_minutes: Time window in minutes to group messages together
            overlap_messages: Number of messages to overlap between consecutive groups
            
        Returns:
@@ -230,8 +230,8 @@ class WeChatHistoryReader(BaseReader):
            if not readable_text.strip():
                continue
            
-            # Check time window constraint (only if time_window_minutes != -1)
-            if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
+            # Check time window constraint
+            if last_timestamp is not None and create_time > 0:
                time_diff_minutes = (create_time - last_timestamp) / 60
                if time_diff_minutes > time_window_minutes:
                    # Time gap too large, start new group
@@ -250,9 +250,9 @@ class WeChatHistoryReader(BaseReader):
                            current_group = []
                            current_length = 0
            
-            # Check length constraint (only if max_length != -1)
+            # Check length constraint
            message_length = len(readable_text)
-            if max_length != -1 and current_length + message_length > max_length and current_group:
+            if current_length + message_length > max_length and current_group:
                # Current group would exceed max length, save it and start new
                concatenated_groups.append({
                    'messages': current_group,
@@ -335,15 +335,14 @@ class WeChatHistoryReader(BaseReader):
            if create_time:
                try:
                    timestamp = datetime.fromtimestamp(create_time)
-                    # change to YYYY-MM-DD HH:MM:SS
-                    time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
+                    time_str = timestamp.strftime('%H:%M:%S')
                except:
                    time_str = str(create_time)
            else:
                time_str = "Unknown"
            
-            sender = "[Me]" if is_sent_from_self else "[Contact]"
-            message_parts.append(f"({time_str}) {sender}: {readable_text}")
+            sender = "Me" if is_sent_from_self else "Contact"
+            message_parts.append(f"[{time_str}] {sender}: {readable_text}")
        
        concatenated_text = "\n".join(message_parts)
        
@@ -355,11 +354,13 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):

 {concatenated_text}
 """
-        # TODO @yichuan give better format and rich info here!    
+        
        doc_content = f"""
+Contact: {contact_name}
+
 {concatenated_text}
 """
-        return doc_content, contact_name
+        return doc_content
    
    def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
        """
@@ -430,9 +431,9 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
                        # Concatenate messages based on rules
                        message_groups = self._concatenate_messages(
                            readable_messages, 
-                            max_length=-1, 
-                            time_window_minutes=-1,
-                            overlap_messages=0  # Keep 2 messages overlap between groups
+                            max_length=max_length, 
+                            time_window_minutes=time_window_minutes,
+                            overlap_messages=2  # Keep 2 messages overlap between groups
                        )
                        
                        # Create documents from concatenated groups
@@ -440,8 +441,8 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
                            if count >= max_count and max_count > 0:
                                break
                            
-                            doc_content, contact_name  = self._create_concatenated_content(message_group, contact_name)
-                            doc = Document(text=doc_content, metadata={"contact_name": contact_name})
+                            doc_content = self._create_concatenated_content(message_group, contact_name)
+                            doc = Document(text=doc_content, metadata={})
                            docs.append(doc)
                            count += 1
                        
--- a/examples/mail_reader_leann.py
+++ b/examples/mail_reader_leann.py
@@ -74,10 +74,10 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
            print("No documents loaded from any source. Exiting.")
            return None
        
-        print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks")
+        print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories")
        
        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
+        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
        
        # Convert Documents to text strings and chunk them
        all_texts = []
@@ -85,11 +85,9 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
            # Split the document into chunks
            nodes = text_splitter.get_nodes_from_documents([doc])
            for node in nodes:
-                text = node.get_content()
-                # text = '[subject] ' + doc.metadata["subject"] + '\n' + text
-                all_texts.append(text)
+                all_texts.append(node.get_content())
        
-        print(f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks")
+        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
        
        # Create LEANN index directory

@@ -233,7 +231,7 @@ async def main():
    parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
    # Remove --mail-path argument and auto-detect all Messages directories
    # Remove DEFAULT_MAIL_PATH
-    parser.add_argument('--index-dir', type=str, default="./mail_index_leann_debug",
+    parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
                       help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
    parser.add_argument('--max-emails', type=int, default=1000,
                       help='Maximum number of emails to process (-1 means all)')
--- a/examples/mail_reader_llamaindex.py
+++ b/examples/mail_reader_llamaindex.py
@@ -0,0 +1,108 @@
+import os
+import sys
+import argparse
+from pathlib import Path
+from typing import List, Any
+
+# Add the project root to Python path so we can import from examples
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+from llama_index.core import VectorStoreIndex, StorageContext
+from llama_index.core.node_parser import SentenceSplitter
+
+# --- EMBEDDING MODEL ---
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+import torch
+
+# --- END EMBEDDING MODEL ---
+
+# Import EmlxReader from the new module
+from examples.email_data.LEANN_email_reader import EmlxReader
+
+def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000, include_html: bool = False):
+    """Build a vector index from the mail data read by EmlxReader and persist it to ``save_dir``.
+
+    Returns the new index, or None when the reader yields no documents.
+    """
+    print("Creating index from mail data with embedded metadata...")
+    documents = EmlxReader(include_html=include_html).load_data(mail_path, max_count=max_count)
+    if not documents:
+        print("No documents loaded. Exiting.")
+        return None
+    text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
+    # Pick the best available device: CUDA first, then Apple MPS, then CPU.
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
+    # Use facebook/contriever as the embedder; `device=` avoids touching the private `_model`.
+    embed_model = HuggingFaceEmbedding(model_name="facebook/contriever", device=device)
+    index = VectorStoreIndex.from_documents(
+        documents, transformations=[text_splitter], embed_model=embed_model
+    )
+    os.makedirs(save_dir, exist_ok=True)
+    index.storage_context.persist(persist_dir=save_dir)
+    print(f"Index saved to {save_dir}")
+    return index
+
+def load_index(save_dir: str = "mail_index_embedded"):
+    """Load the index persisted in ``save_dir``; print the error and return None on failure."""
+    from llama_index.core import load_index_from_storage  # loader for persisted indexes
+    try:
+        storage_context = StorageContext.from_defaults(persist_dir=save_dir)
+        # Reuse the build-time embedder so queries are not embedded with the default model.
+        index = load_index_from_storage(storage_context, embed_model=HuggingFaceEmbedding(model_name="facebook/contriever"))
+        print(f"Index loaded from {save_dir}")
+        return index
+    except Exception as e:
+        print(f"Error loading index: {e}")
+        return None
+
+def query_index(index, query: str):
+    """Run ``query`` against ``index`` and print both the question and the answer."""
+    if index is None:
+        print("No index available for querying.")
+        return
+    response = index.as_query_engine().query(query)
+    print(f"Query: {query}")
+    print(f"Response: {response}")
+
+def main():
+    """CLI entry point: load the saved mail index, or build it, then run the sample queries."""
+    parser = argparse.ArgumentParser(description='LlamaIndex Mail Reader - Create and query email index')
+    parser.add_argument('--mail-path', type=str,
+                       default="/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages",
+                       help='Path to mail data directory')
+    parser.add_argument('--save-dir', type=str, default="mail_index_embedded",
+                       help='Directory to store the index (default: mail_index_embedded)')
+    parser.add_argument('--max-emails', type=int, default=10000,
+                       help='Maximum number of emails to process')
+    parser.add_argument('--include-html', action='store_true', default=False,
+                       help='Include HTML content in email processing (default: False)')
+
+    args = parser.parse_args()
+    save_dir = args.save_dir
+
+    # llama_index.core persists the vector store as "default__vector_store.json", so probing
+    # for "vector_store.json" never matched; "index_store.json" is what loading relies on.
+    if os.path.exists(os.path.join(save_dir, "index_store.json")):
+        print("Loading existing index...")
+        index = load_index(save_dir)
+    else:
+        print("Creating new index...")
+        index = create_and_save_index(args.mail_path, save_dir, max_count=args.max_emails, include_html=args.include_html)
+    if index:
+        queries = [
+            "Hows Berkeley Graduate Student Instructor",
+            "how's the icloud related advertisement saying",
+            "Whats the number of class recommend to take per semester for incoming EECS students"
+        ]
+        for query in queries:
+            print("\n" + "="*50)
+            query_index(index, query)
+
+if __name__ == "__main__":
+    main() 
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -1,40 +1,40 @@
 import argparse
-from llama_index.core import SimpleDirectoryReader
+from llama_index.core import SimpleDirectoryReader, Settings
 from llama_index.core.node_parser import SentenceSplitter
 import asyncio
 import dotenv
-from leann.api import LeannBuilder, LeannChat
+from leann.api import LeannBuilder, LeannSearcher, LeannChat
+import shutil
 from pathlib import Path

 dotenv.load_dotenv()

+node_parser = SentenceSplitter(
+    chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
+)
+print("Loading documents...")
+documents = SimpleDirectoryReader(
+    "examples/data",
+    recursive=True,
+    encoding="utf-8",
+    required_exts=[".pdf", ".txt", ".md"],
+).load_data(show_progress=True)
+print("Documents loaded.")
+all_texts = []
+for doc in documents:
+    nodes = node_parser.get_nodes_from_documents([doc])
+    for node in nodes:
+        all_texts.append(node.get_content())
+

 async def main(args):
    INDEX_DIR = Path(args.index_dir)
    INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")

    if not INDEX_DIR.exists():
-        node_parser = SentenceSplitter(
-            chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
-        )
+        print(f"--- Index directory not found, building new index ---")

-        print("Loading documents...")
-        documents = SimpleDirectoryReader(
-            args.data_dir,
-            recursive=True,
-            encoding="utf-8",
-            required_exts=[".pdf", ".txt", ".md"],
-        ).load_data(show_progress=True)
-        print("Documents loaded.")
-        all_texts = []
-        for doc in documents:
-            nodes = node_parser.get_nodes_from_documents([doc])
-            for node in nodes:
-                all_texts.append(node.get_content())
-
-        print("--- Index directory not found, building new index ---")
-
-        print("\n[PHASE 1] Building Leann index...")
+        print(f"\n[PHASE 1] Building Leann index...")

        # Use HNSW backend for better macOS compatibility
        builder = LeannBuilder(
@@ -58,9 +58,8 @@ async def main(args):

    print(f"\n[PHASE 2] Starting Leann chat session...")

-    llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
+    # llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
    llm_config = {"type": "ollama", "model": "qwen3:8b"}
-    llm_config = {"type": "openai", "model": "gpt-4o"}

    chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)

@@ -71,7 +70,9 @@ async def main(args):
    # )

    print(f"You: {query}")
-    chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32)
+    chat_response = chat.ask(
+        query, top_k=20, recompute_beighbor_embeddings=True, complexity=32
+    )
    print(f"Leann: {chat_response}")


@@ -104,12 +105,6 @@ if __name__ == "__main__":
        default="./test_doc_files",
        help="Directory where the Leann index will be stored.",
    )
-    parser.add_argument(
-        "--data-dir",
-        type=str,
-        default="examples/data",
-        help="Directory containing documents to index (PDF, TXT, MD files).",
-    )
    args = parser.parse_args()

    asyncio.run(main(args))
--- a/examples/multi_vector_aggregator.py
+++ b/examples/multi_vector_aggregator.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""
+Multi-Vector Aggregator for Fat Embeddings
+==========================================
+
+This module implements aggregation strategies for multi-vector embeddings,
+similar to ColPali's approach where multiple patch vectors represent a single document.
+
+Key features:
+- MaxSim aggregation (take maximum similarity across patches)
+- Voting-based aggregation (count patch matches)
+- Weighted aggregation (attention-score weighted)
+- Spatial clustering of matching patches
+- Document-level result consolidation
+"""
+
+import numpy as np
+from typing import List, Dict, Any, Tuple, Optional
+from dataclasses import dataclass
+from collections import defaultdict
+import json
+
+@dataclass
+class PatchResult:
+    """Represents a single patch search result."""
+    patch_id: int  # taken from the result's metadata["patch_id"] by the aggregator
+    image_name: str  # key the aggregator uses to group patches of the same image
+    image_path: str
+    coordinates: Tuple[int, int, int, int]  # (x1, y1, x2, y2)
+    score: float  # search score of this patch (the result's .score)
+    attention_score: float  # weight for "weighted" aggregation; aggregator uses 0.0 if absent
+    scale: float  # aggregator uses 1.0 when metadata has no "scale" entry
+    metadata: Dict[str, Any]  # the result's full metadata dict, kept as-is
+
+@dataclass
+class AggregatedResult:
+    """Represents an aggregated document-level result."""
+    image_name: str
+    image_path: str  # path taken from the first patch of the image's group
+    doc_score: float  # document score computed by the selected aggregation method
+    patch_count: int  # number of patches aggregated for this image
+    best_patch: PatchResult  # top patch by score (by score * attention_score for "weighted")
+    all_patches: List[PatchResult]  # every patch of the image, highest score first
+    aggregation_method: str  # "maxsim", "voting", "weighted", or "mean"
+    spatial_clusters: Optional[List[List[PatchResult]]] = None  # None when clustering is disabled
+
+class MultiVectorAggregator:
+    """
+    Aggregates multiple patch-level results into document-level results.
+    """
+
+    def __init__(self,
+                 aggregation_method: str = "maxsim",
+                 spatial_clustering: bool = True,
+                 cluster_distance_threshold: float = 100.0):
+        """
+        Initialize the aggregator.
+
+        Args:
+            aggregation_method: "maxsim", "voting", "weighted", or "mean"
+            spatial_clustering: Whether to cluster spatially close patches
+            cluster_distance_threshold: Max centre-to-centre distance for two patches to share a cluster
+        """
+        self.aggregation_method = aggregation_method
+        self.spatial_clustering = spatial_clustering
+        self.cluster_distance_threshold = cluster_distance_threshold
+
+    def aggregate_results(self,
+                         search_results: List[Any],
+                         top_k: int = 10) -> List[AggregatedResult]:
+        """
+        Aggregate patch-level search results into document-level results.
+
+        Args:
+            search_results: Search results (e.g. from LeannSearcher); each must expose
+                ``.score`` and a ``.metadata`` dict as attributes
+            top_k: Number of top documents to return
+
+        Returns:
+            Aggregated results sorted by ``doc_score`` (best first), at most ``top_k``
+        """
+        # All of these are read below; a result missing any is skipped, not a KeyError.
+        required_keys = ("image_name", "patch_id", "image_path", "coordinates")
+
+        # Group results by image
+        image_groups = defaultdict(list)
+        for result in search_results:
+            metadata = result.metadata
+            if not all(key in metadata for key in required_keys):
+                continue
+            image_groups[metadata["image_name"]].append(PatchResult(
+                patch_id=metadata["patch_id"],
+                image_name=metadata["image_name"],
+                image_path=metadata["image_path"],
+                coordinates=tuple(metadata["coordinates"]),
+                score=result.score,
+                attention_score=metadata.get("attention_score", 0.0),
+                scale=metadata.get("scale", 1.0),
+                metadata=metadata
+            ))
+
+        # Aggregate each image group (never empty: a group exists only once appended to)
+        aggregated_results = [
+            self._aggregate_image_patches(image_name, patches)
+            for image_name, patches in image_groups.items()
+        ]
+
+        # Sort by aggregated score and return top-k
+        aggregated_results.sort(key=lambda x: x.doc_score, reverse=True)
+        return aggregated_results[:top_k]
+
+    def _aggregate_image_patches(self, image_name: str, patches: List[PatchResult]) -> AggregatedResult:
+        """Aggregate the (non-empty) patch list of a single image into one result."""
+        scores = [patch.score for patch in patches]
+        # Highest-scoring patch; every method except "weighted" reports it as the best one.
+        best_patch = max(patches, key=lambda p: p.score)
+
+        if self.aggregation_method == "maxsim":
+            doc_score = max(scores)
+
+        elif self.aggregation_method == "voting":
+            # Count patches at or above the 75th percentile of this image's scores
+            threshold = np.percentile(scores, 75)
+            doc_score = sum(1 for score in scores if score >= threshold)
+
+        elif self.aggregation_method == "weighted":
+            # Attention-weighted mean; the 1e-8 floor avoids dividing by zero
+            total_weighted_score = sum(p.score * p.attention_score for p in patches)
+            total_weights = sum(p.attention_score for p in patches)
+            doc_score = total_weighted_score / max(total_weights, 1e-8)
+            best_patch = max(patches, key=lambda p: p.score * p.attention_score)
+
+        elif self.aggregation_method == "mean":
+            doc_score = np.mean(scores)
+
+        else:
+            raise ValueError(f"Unknown aggregation method: {self.aggregation_method}")
+
+        # Spatial clustering if enabled
+        spatial_clusters = None
+        if self.spatial_clustering:
+            spatial_clusters = self._cluster_patches_spatially(patches)
+
+        return AggregatedResult(
+            image_name=image_name,
+            image_path=patches[0].image_path,
+            doc_score=float(doc_score),
+            patch_count=len(patches),
+            best_patch=best_patch,
+            all_patches=sorted(patches, key=lambda p: p.score, reverse=True),
+            aggregation_method=self.aggregation_method,
+            spatial_clusters=spatial_clusters
+        )
+
+    def _cluster_patches_spatially(self, patches: List[PatchResult]) -> List[List[PatchResult]]:
+        """Cluster patches that are spatially close to each other, best cluster first."""
+        # No special case for short inputs: the loop below already yields [] for an empty
+        # list and [[patch]] for a single patch, and never emits an empty cluster.
+
+        clusters = []
+        remaining_patches = list(patches)
+
+        while remaining_patches:
+            # Start new cluster with highest scoring remaining patch
+            seed_patch = max(remaining_patches, key=lambda p: p.score)
+            current_cluster = [seed_patch]
+            remaining_patches.remove(seed_patch)
+
+            # Add nearby patches to cluster; repeat until a full pass adds nothing
+            added_to_cluster = True
+            while added_to_cluster:
+                added_to_cluster = False
+                for patch in remaining_patches.copy():
+                    if self._is_patch_nearby(patch, current_cluster):
+                        current_cluster.append(patch)
+                        remaining_patches.remove(patch)
+                        added_to_cluster = True
+
+            clusters.append(current_cluster)
+
+        return sorted(clusters, key=lambda cluster: max(p.score for p in cluster), reverse=True)
+
+    def _is_patch_nearby(self, patch: PatchResult, cluster: List[PatchResult]) -> bool:
+        """Return True when ``patch``'s centre is within the threshold of any cluster member's."""
+        px, py = self._get_patch_center(patch.coordinates)
+
+        for cluster_patch in cluster:
+            cx, cy = self._get_patch_center(cluster_patch.coordinates)
+            # Euclidean centre-to-centre distance
+            distance = np.sqrt((px - cx)**2 + (py - cy)**2)
+
+            if distance <= self.cluster_distance_threshold:
+                return True
+
+        return False
+
+    def _get_patch_center(self, coordinates: Tuple[int, int, int, int]) -> Tuple[float, float]:
+        """Get center point of a patch given as (x1, y1, x2, y2)."""
+        x1, y1, x2, y2 = coordinates
+        return ((x1 + x2) / 2, (y1 + y2) / 2)
+
+    def print_aggregated_results(self, results: List[AggregatedResult], max_patches_per_doc: int = 3):
+        """Pretty print aggregated results, listing at most ``max_patches_per_doc`` patches each."""
+        print(f"\n🔍 Aggregated Results (method: {self.aggregation_method})")
+        print("=" * 80)
+
+        for i, result in enumerate(results):
+            print(f"\n{i+1}. {result.image_name}")
+            print(f"   Doc Score: {result.doc_score:.4f} | Patches: {result.patch_count}")
+            print(f"   Path: {result.image_path}")
+
+            # Show best patch
+            best = result.best_patch
+            print(f"   🌟 Best Patch: #{best.patch_id} at {best.coordinates} (score: {best.score:.4f})")
+
+            # Show top patches
+            print("   📍 Top Patches:")
+            for j, patch in enumerate(result.all_patches[:max_patches_per_doc]):
+                print(f"      {j+1}. Patch #{patch.patch_id}: {patch.score:.4f} at {patch.coordinates}")
+
+            # Show spatial clusters if available (only when there is more than one)
+            if result.spatial_clusters and len(result.spatial_clusters) > 1:
+                print(f"   🗂️ Spatial Clusters: {len(result.spatial_clusters)}")
+                for j, cluster in enumerate(result.spatial_clusters[:2]):  # Show top 2 clusters
+                    cluster_score = max(p.score for p in cluster)
+                    print(f"      Cluster {j+1}: {len(cluster)} patches (best: {cluster_score:.4f})")
+
+def demo_aggregation():
+    """Run every aggregation method over a fixed set of mock patch results and print the output."""
+    print("=== Multi-Vector Aggregation Demo ===")
+
+    class MockResult:
+        """Stand-in for a search result: exposes ``score`` and ``metadata`` attributes."""
+        def __init__(self, score, metadata):
+            self.score = score
+            self.metadata = metadata
+
+    # Simulated patch-level hits; in real usage these would come from LeannSearcher.search()
+    # Image 1: cats_and_kitchen.jpg - 4 patches
+    kitchen_hits = [
+        MockResult(0.85, {
+            "image_name": "cats_and_kitchen.jpg",
+            "image_path": "/path/to/cats_and_kitchen.jpg",
+            "patch_id": 3,
+            "coordinates": [100, 50, 224, 174],  # Kitchen area
+            "attention_score": 0.92,
+            "scale": 1.0
+        }),
+        MockResult(0.78, {
+            "image_name": "cats_and_kitchen.jpg",
+            "image_path": "/path/to/cats_and_kitchen.jpg",
+            "patch_id": 7,
+            "coordinates": [200, 300, 324, 424],  # Cat area
+            "attention_score": 0.88,
+            "scale": 1.0
+        }),
+        MockResult(0.72, {
+            "image_name": "cats_and_kitchen.jpg",
+            "image_path": "/path/to/cats_and_kitchen.jpg",
+            "patch_id": 12,
+            "coordinates": [150, 100, 274, 224],  # Appliances
+            "attention_score": 0.75,
+            "scale": 1.0
+        }),
+        MockResult(0.65, {
+            "image_name": "cats_and_kitchen.jpg",
+            "image_path": "/path/to/cats_and_kitchen.jpg",
+            "patch_id": 15,
+            "coordinates": [50, 250, 174, 374],  # Furniture
+            "attention_score": 0.70,
+            "scale": 1.0
+        }),
+    ]
+
+    # Image 2: city_street.jpg - 3 patches
+    street_hits = [
+        MockResult(0.68, {
+            "image_name": "city_street.jpg",
+            "image_path": "/path/to/city_street.jpg",
+            "patch_id": 2,
+            "coordinates": [300, 100, 424, 224],  # Buildings
+            "attention_score": 0.80,
+            "scale": 1.0
+        }),
+        MockResult(0.62, {
+            "image_name": "city_street.jpg",
+            "image_path": "/path/to/city_street.jpg",
+            "patch_id": 8,
+            "coordinates": [100, 350, 224, 474],  # Street level
+            "attention_score": 0.75,
+            "scale": 1.0
+        }),
+        MockResult(0.55, {
+            "image_name": "city_street.jpg",
+            "image_path": "/path/to/city_street.jpg",
+            "patch_id": 11,
+            "coordinates": [400, 200, 524, 324],  # Sky area
+            "attention_score": 0.60,
+            "scale": 1.0
+        }),
+    ]
+    mock_results = kitchen_hits + street_hits
+
+    # Test different aggregation methods
+    methods = ("maxsim", "voting", "weighted", "mean")
+    for method in methods:
+        print(f"\n{'='*20} {method.upper()} AGGREGATION {'='*20}")
+
+        aggregator = MultiVectorAggregator(
+            aggregation_method=method,
+            spatial_clustering=True,
+            cluster_distance_threshold=100.0
+        )
+
+        aggregated = aggregator.aggregate_results(mock_results, top_k=5)
+        aggregator.print_aggregated_results(aggregated)
+
+if __name__ == "__main__":
+    demo_aggregation()
--- a/examples/openai_hnsw_example.py
+++ b/examples/openai_hnsw_example.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+OpenAI Embedding Example
+
+Complete example showing how to build and search with OpenAI embeddings using HNSW backend.
+"""
+
+import os
+import dotenv
+from pathlib import Path
+from leann.api import LeannBuilder, LeannSearcher
+
+# Load environment variables
+dotenv.load_dotenv()
+
+def main():
+    """Build a demo HNSW index from OpenAI embeddings and search it; returns True on success, False on any failure."""
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        print("ERROR: OPENAI_API_KEY environment variable not set")
+        return False
+    # Confirm presence only: echoing any part of the secret would leak it into terminal logs.
+    print("✅ OpenAI API key found")
+
+    # Sample texts
+    sample_texts = [
+        "Machine learning is a powerful technology that enables computers to learn from data.",
+        "Natural language processing helps computers understand and generate human language.",
+        "Deep learning uses neural networks with multiple layers to solve complex problems.",
+        "Computer vision allows machines to interpret and understand visual information.",
+        "Reinforcement learning trains agents to make decisions through trial and error.",
+        "Data science combines statistics, math, and programming to extract insights from data.",
+        "Artificial intelligence aims to create machines that can perform human-like tasks.",
+        "Python is a popular programming language used extensively in data science and AI.",
+        "Neural networks are inspired by the structure and function of the human brain.",
+        "Big data refers to extremely large datasets that require special tools to process."
+    ]
+
+    INDEX_DIR = Path("./simple_openai_test_index")
+    INDEX_PATH = str(INDEX_DIR / "simple_test.leann")
+
+    print("\n=== Building Index with OpenAI Embeddings ===")
+    print(f"Index path: {INDEX_PATH}")
+
+    try:
+        # Use proper configuration for OpenAI embeddings
+        builder = LeannBuilder(
+            backend_name="hnsw",
+            embedding_model="text-embedding-3-small",
+            embedding_mode="openai",
+            # HNSW settings for OpenAI embeddings
+            M=16,                    # Smaller graph degree
+            efConstruction=64,       # Smaller construction complexity
+            is_compact=True,         # Enable compact storage for recompute
+            is_recompute=True,       # MUST enable for OpenAI embeddings
+            num_threads=1,
+        )
+
+        print(f"Adding {len(sample_texts)} texts to the index...")
+        for i, text in enumerate(sample_texts):
+            metadata = {"id": f"doc_{i}", "topic": "AI"}
+            builder.add_text(text, metadata)
+
+        print("Building index...")
+        builder.build_index(INDEX_PATH)
+        print("✅ Index built successfully!")
+
+    except Exception as e:
+        print(f"❌ Error building index: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+    print("\n=== Testing Search ===")
+
+    try:
+        searcher = LeannSearcher(INDEX_PATH)
+
+        test_queries = [
+            "What is machine learning?",
+            "How do neural networks work?",
+            "Programming languages for data science"
+        ]
+
+        for query in test_queries:
+            print(f"\n🔍 Query: '{query}'")
+            results = searcher.search(query, top_k=3)
+
+            print(f"   Found {len(results)} results:")
+            for i, result in enumerate(results):
+                print(f"   {i+1}. Score: {result.score:.4f}")
+                print(f"      Text: {result.text[:80]}...")
+
+        print("\n✅ Search test completed successfully!")
+        return True
+
+    except Exception as e:
+        print(f"❌ Error during search: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    success = main()
+    # Exit non-zero on failure so shells and CI can detect it (previously the script always exited 0).
+    message = "🎉 Simple OpenAI index test completed successfully!" if success else "💥 Simple OpenAI index test failed!"
+    print(f"\n{message}")
+    raise SystemExit(0 if success else 1)
--- a/examples/resue_index.py
+++ b/examples/resue_index.py
@@ -0,0 +1,18 @@
+import asyncio
+from leann.api import LeannChat
+from pathlib import Path
+
+INDEX_DIR = Path("./test_pdf_index_huawei")
+INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
+
+async def main():
+    """Ask one question against the index at INDEX_PATH and print the response."""
+    print("\n[PHASE 2] Starting Leann chat session...")
+    # query = "What is the main idea of RL and give me 5 exapmle of classic RL algorithms?"  # alternative; was a dead assignment overwritten below
+    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
+    # query = "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面，任务令一般在什么城市颁发"  # alternative Chinese-language sample query
+    response = LeannChat(index_path=INDEX_PATH).ask(query, top_k=20, recompute_beighbor_embeddings=True, complexity=32, beam_width=1)
+    print(f"\n[PHASE 2] Response: {response}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/examples/simple_demo.py
+++ b/examples/simple_demo.py
@@ -0,0 +1,81 @@
+"""
+Simple demo showing basic leann usage
+Run: uv run python examples/simple_demo.py
+"""
+
+import argparse
+from leann import LeannBuilder, LeannSearcher, LeannChat
+
+
+def main():
+    """Run the demo end to end: build a small index, search it, then ask chat questions.
+
+    The embedding model comes from --embedding_model; the index is written to
+    "demo_knowledge.leann" and the same path is handed to the searcher and the chat.
+    """
+    cli = argparse.ArgumentParser(description="Simple demo of Leann with selectable embedding models.")
+    cli.add_argument("--embedding_model", type=str, default="sentence-transformers/all-mpnet-base-v2",
+                     help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.")
+    model_name = cli.parse_args().embedding_model
+    index_file = "demo_knowledge.leann"
+
+    print(f"=== Leann Simple Demo with {model_name} ===")
+    print()
+
+    # Passages that make up the demo corpus
+    knowledge_base = [
+        "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
+        "Deep learning uses neural networks with multiple layers to process data and make decisions.",
+        "Natural language processing helps computers understand and generate human language.",
+        "Computer vision enables machines to interpret and understand visual information from images and videos.",
+        "Reinforcement learning teaches agents to make decisions by receiving rewards or penalties for their actions.",
+        "Data science combines statistics, programming, and domain expertise to extract insights from data.",
+        "Big data refers to extremely large datasets that require special tools and techniques to process.",
+        "Cloud computing provides on-demand access to computing resources over the internet.",
+    ]
+
+    print("1. Building index (no embeddings stored)...")
+    index_builder = LeannBuilder(embedding_model=model_name, backend_name="hnsw")
+    for passage in knowledge_base:
+        index_builder.add_text(passage)
+    index_builder.build_index(index_file)
+    print()
+
+    print("2. Searching with real-time embeddings...")
+    searcher = LeannSearcher(index_file)
+    search_queries = (
+        "What is machine learning?",
+        "How does neural network work?",
+        "Tell me about data processing",
+    )
+
+    for text in search_queries:
+        print(f"Query: {text}")
+        # Hits are numbered from 1; each text is cut to its first 100 characters
+        for rank, hit in enumerate(searcher.search(text, top_k=2), start=1):
+            print(f"  {rank}. Score: {hit.score:.3f}")
+            print(f"     Text: {hit.text[:100]}...")
+        print()
+
+    print("3. Interactive chat demo:")
+    print("   (Note: Requires OpenAI API key for real responses)")
+
+    chat = LeannChat(index_file)
+
+    # Questions put to the chat interface, one at a time
+    demo_questions: list[str] = [
+        "What is the difference between machine learning and deep learning?",
+        "How is data science related to big data?",
+    ]
+
+    for question in demo_questions:
+        print(f"   Q: {question}")
+        print(f"   A: {chat.ask(question)}")
+        print()
+
+    print("Demo completed! Try running:")
+    print("   uv run python examples/document_search.py")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/wechat_history_reader_leann.py
+++ b/examples/wechat_history_reader_leann.py
@@ -74,11 +74,11 @@ def create_leann_index_from_multiple_wechat_exports(
            return None

        print(
-            f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports and starting to split them into chunks"
+            f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports"
        )

        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
+        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)

        # Convert Documents to text strings and chunk them
        all_texts = []
@@ -86,11 +86,10 @@ def create_leann_index_from_multiple_wechat_exports(
            # Split the document into chunks
            nodes = text_splitter.get_nodes_from_documents([doc])
            for node in nodes:
-                text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
-                all_texts.append(text)
+                all_texts.append(node.get_content())

        print(
-            f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks"
+            f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
        )

        # Create LEANN index directory
@@ -225,7 +224,7 @@ async def query_leann_index(index_path: str, query: str):
        query,
        top_k=20,
        recompute_beighbor_embeddings=True,
-        complexity=16,
+        complexity=64,
        beam_width=1,
        llm_config={
            "type": "openai",
@@ -253,13 +252,13 @@ async def main():
    parser.add_argument(
        "--index-dir",
        type=str,
-        default="./wechat_history_magic_test_11Debug_new",
+        default="./wechat_history_june19_test",
        help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
    )
    parser.add_argument(
        "--max-entries",
        type=int,
-        default=50,
+        default=5000,
        help="Maximum number of chat entries to process (default: 5000)",
    )
    parser.add_argument(
--- a/packages/leann-backend-diskann/CMakeLists.txt
+++ b/packages/leann-backend-diskann/CMakeLists.txt
@@ -1,8 +1,8 @@
-# packages/leann-backend-diskann/CMakeLists.txt (simplified version)
+# packages/leann-backend-diskann/CMakeLists.txt (最终简化版)

 cmake_minimum_required(VERSION 3.20)
 project(leann_backend_diskann_wrapper)

-# Tell CMake to directly enter the DiskANN submodule and execute its own CMakeLists.txt
-# DiskANN will handle everything itself, including compiling Python bindings
+# 告诉 CMake 直接进入 DiskANN 子模块并执行它自己的 CMakeLists.txt
+# DiskANN 会自己处理所有事情，包括编译 Python 绑定
 add_subdirectory(src/third_party/DiskANN)
--- a/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py
+++ b/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py
@@ -1,12 +1,10 @@
 import numpy as np
 import os
 import struct
-import sys
 from pathlib import Path
-from typing import Dict, Any, List, Literal, Optional
+from typing import Dict, Any, List, Literal
 import contextlib
-
-import logging
+import pickle

 from leann.searcher_base import BaseSearcher
 from leann.registry import register_backend
@@ -16,46 +14,6 @@ from leann.interface import (
    LeannBackendSearcherInterface,
 )

-logger = logging.getLogger(__name__)
-
-
-@contextlib.contextmanager
-def suppress_cpp_output_if_needed():
-    """Suppress C++ stdout/stderr based on LEANN_LOG_LEVEL"""
-    log_level = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
-
-    # Only suppress if log level is WARNING or higher (ERROR, CRITICAL)
-    should_suppress = log_level in ["WARNING", "ERROR", "CRITICAL"]
-
-    if not should_suppress:
-        # Don't suppress, just yield
-        yield
-        return
-
-    # Save original file descriptors
-    stdout_fd = sys.stdout.fileno()
-    stderr_fd = sys.stderr.fileno()
-
-    # Save original stdout/stderr
-    stdout_dup = os.dup(stdout_fd)
-    stderr_dup = os.dup(stderr_fd)
-
-    try:
-        # Redirect to /dev/null
-        devnull = os.open(os.devnull, os.O_WRONLY)
-        os.dup2(devnull, stdout_fd)
-        os.dup2(devnull, stderr_fd)
-        os.close(devnull)
-
-        yield
-
-    finally:
-        # Restore original file descriptors
-        os.dup2(stdout_dup, stdout_fd)
-        os.dup2(stderr_dup, stderr_fd)
-        os.close(stdout_dup)
-        os.close(stderr_dup)
-

 def _get_diskann_metrics():
    from . import _diskannpy as diskannpy  # type: ignore
@@ -107,20 +65,22 @@ class DiskannBuilder(LeannBackendBuilderInterface):
        index_dir.mkdir(parents=True, exist_ok=True)

        if data.dtype != np.float32:
-            logger.warning(f"Converting data to float32, shape: {data.shape}")
            data = data.astype(np.float32)

        data_filename = f"{index_prefix}_data.bin"
        _write_vectors_to_bin(data, index_dir / data_filename)

+        label_map = {i: str_id for i, str_id in enumerate(ids)}
+        label_map_file = index_dir / "leann.labels.map"
+        with open(label_map_file, "wb") as f:
+            pickle.dump(label_map, f)
+
        build_kwargs = {**self.build_params, **kwargs}
        metric_enum = _get_diskann_metrics().get(
            build_kwargs.get("distance_metric", "mips").lower()
        )
        if metric_enum is None:
-            raise ValueError(
-                f"Unsupported distance_metric '{build_kwargs.get('distance_metric', 'unknown')}'."
-            )
+            raise ValueError("Unsupported distance_metric.")

        try:
            from . import _diskannpy as diskannpy  # type: ignore
@@ -142,40 +102,36 @@ class DiskannBuilder(LeannBackendBuilderInterface):
            temp_data_file = index_dir / data_filename
            if temp_data_file.exists():
                os.remove(temp_data_file)
-                logger.debug(f"Cleaned up temporary data file: {temp_data_file}")


 class DiskannSearcher(BaseSearcher):
    def __init__(self, index_path: str, **kwargs):
        super().__init__(
            index_path,
-            backend_module_name="leann_backend_diskann.diskann_embedding_server",
+            backend_module_name="leann_backend_diskann.embedding_server",
            **kwargs,
        )
+        from . import _diskannpy as diskannpy  # type: ignore

-        # Initialize DiskANN index with suppressed C++ output based on log level
-        with suppress_cpp_output_if_needed():
-            from . import _diskannpy as diskannpy  # type: ignore
+        distance_metric = kwargs.get("distance_metric", "mips").lower()
+        metric_enum = _get_diskann_metrics().get(distance_metric)
+        if metric_enum is None:
+            raise ValueError(f"Unsupported distance_metric '{distance_metric}'.")

-            distance_metric = kwargs.get("distance_metric", "mips").lower()
-            metric_enum = _get_diskann_metrics().get(distance_metric)
-            if metric_enum is None:
-                raise ValueError(f"Unsupported distance_metric '{distance_metric}'.")
+        self.num_threads = kwargs.get("num_threads", 8)
+        self.zmq_port = kwargs.get("zmq_port", 6666)

-            self.num_threads = kwargs.get("num_threads", 8)
-
-            fake_zmq_port = 6666
-            full_index_prefix = str(self.index_dir / self.index_path.stem)
-            self._index = diskannpy.StaticDiskFloatIndex(
-                metric_enum,
-                full_index_prefix,
-                self.num_threads,
-                kwargs.get("num_nodes_to_cache", 0),
-                1,
-                fake_zmq_port,  # Initial port, can be updated at runtime
-                "",
-                "",
-            )
+        full_index_prefix = str(self.index_dir / self.index_path.stem)
+        self._index = diskannpy.StaticDiskFloatIndex(
+            metric_enum,
+            full_index_prefix,
+            self.num_threads,
+            kwargs.get("num_nodes_to_cache", 0),
+            1,
+            self.zmq_port,
+            "",
+            "",
+        )

    def search(
        self,
@@ -186,7 +142,7 @@ class DiskannSearcher(BaseSearcher):
        prune_ratio: float = 0.0,
        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-        zmq_port: Optional[int] = None,
+        zmq_port: int = 5557,
        batch_recompute: bool = False,
        dedup_node_dis: bool = False,
        **kwargs,
@@ -205,7 +161,7 @@ class DiskannSearcher(BaseSearcher):
                - "global": Use global pruning strategy (default)
                - "local": Use local pruning strategy
                - "proportional": Not supported in DiskANN, falls back to global
-            zmq_port: ZMQ port for embedding server communication. Must be provided if recompute_embeddings is True.
+            zmq_port: ZMQ port for embedding server
            batch_recompute: Whether to batch neighbor recomputation (DiskANN-specific)
            dedup_node_dis: Whether to cache and reuse distance computations (DiskANN-specific)
            **kwargs: Additional DiskANN-specific parameters (for legacy compatibility)
@@ -213,25 +169,22 @@ class DiskannSearcher(BaseSearcher):
        Returns:
            Dict with 'labels' (list of lists) and 'distances' (ndarray)
        """
-        # Handle zmq_port compatibility: DiskANN can now update port at runtime
-        if recompute_embeddings:
-            if zmq_port is None:
-                raise ValueError(
-                    "zmq_port must be provided if recompute_embeddings is True"
-                )
-            current_port = self._index.get_zmq_port()
-            if zmq_port != current_port:
-                logger.debug(
-                    f"Updating DiskANN zmq_port from {current_port} to {zmq_port}"
-                )
-                self._index.set_zmq_port(zmq_port)
-
        # DiskANN doesn't support "proportional" strategy
        if pruning_strategy == "proportional":
            raise NotImplementedError(
                "DiskANN backend does not support 'proportional' pruning strategy. Use 'global' or 'local' instead."
            )

+        # Use recompute_embeddings parameter
+        use_recompute = recompute_embeddings
+        if use_recompute:
+            meta_file_path = self.index_dir / f"{self.index_path.name}.meta.json"
+            if not meta_file_path.exists():
+                raise RuntimeError(
+                    f"FATAL: Recompute enabled but metadata file not found: {meta_file_path}"
+                )
+            self._ensure_server_running(str(meta_file_path), port=zmq_port, **kwargs)
+
        if query.dtype != np.float32:
            query = query.astype(np.float32)

@@ -241,26 +194,28 @@ class DiskannSearcher(BaseSearcher):
        else:  # "global"
            use_global_pruning = True

-        # Perform search with suppressed C++ output based on log level
-        with suppress_cpp_output_if_needed():
-            labels, distances = self._index.batch_search(
-                query,
-                query.shape[0],
-                top_k,
-                complexity,
-                beam_width,
-                self.num_threads,
-                kwargs.get("USE_DEFERRED_FETCH", False),
-                kwargs.get("skip_search_reorder", False),
-                recompute_embeddings,
-                dedup_node_dis,
-                prune_ratio,
-                batch_recompute,
-                use_global_pruning,
-            )
+        labels, distances = self._index.batch_search(
+            query,
+            query.shape[0],
+            top_k,
+            complexity,
+            beam_width,
+            self.num_threads,
+            kwargs.get("USE_DEFERRED_FETCH", False),
+            kwargs.get("skip_search_reorder", False),
+            use_recompute,
+            dedup_node_dis,
+            prune_ratio,
+            batch_recompute,
+            use_global_pruning,
+        )

        string_labels = [
-            [str(int_label) for int_label in batch_labels] for batch_labels in labels
+            [
+                self.label_map.get(int_label, f"unknown_{int_label}")
+                for int_label in batch_labels
+            ]
+            for batch_labels in labels
        ]

        return {"labels": string_labels, "distances": distances}
--- a/packages/leann-backend-diskann/leann_backend_diskann/diskann_embedding_server.py
+++ b/packages/leann-backend-diskann/leann_backend_diskann/diskann_embedding_server.py
@@ -1,283 +0,0 @@
-"""
-DiskANN-specific embedding server
-"""
-
-import argparse
-import threading
-import time
-import os
-import zmq
-import numpy as np
-import json
-from pathlib import Path
-from typing import Optional
-import sys
-import logging
-
-# Set up logging based on environment variable
-LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
-logger = logging.getLogger(__name__)
-
-# Force set logger level (don't rely on basicConfig in subprocess)
-log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
-logger.setLevel(log_level)
-
-# Ensure we have a handler if none exists
-if not logger.handlers:
-    handler = logging.StreamHandler()
-    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logger.propagate = False
-
-
-def create_diskann_embedding_server(
-    passages_file: Optional[str] = None,
-    zmq_port: int = 5555,
-    model_name: str = "sentence-transformers/all-mpnet-base-v2",
-    embedding_mode: str = "sentence-transformers",
-):
-    """
-    Create and start a ZMQ-based embedding server for DiskANN backend.
-    Uses ROUTER socket and protobuf communication as required by DiskANN C++ implementation.
-    """
-    logger.info(f"Starting DiskANN server on port {zmq_port} with model {model_name}")
-    logger.info(f"Using embedding mode: {embedding_mode}")
-
-    # Add leann-core to path for unified embedding computation
-    current_dir = Path(__file__).parent
-    leann_core_path = current_dir.parent.parent / "leann-core" / "src"
-    sys.path.insert(0, str(leann_core_path))
-
-    try:
-        from leann.embedding_compute import compute_embeddings
-        from leann.api import PassageManager
-
-        logger.info("Successfully imported unified embedding computation module")
-    except ImportError as e:
-        logger.error(f"Failed to import embedding computation module: {e}")
-        return
-    finally:
-        sys.path.pop(0)
-
-    # Check port availability
-    import socket
-
-    def check_port(port):
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-            return s.connect_ex(("localhost", port)) == 0
-
-    if check_port(zmq_port):
-        logger.error(f"Port {zmq_port} is already in use")
-        return
-
-    # Only support metadata file, fail fast for everything else
-    if not passages_file or not passages_file.endswith(".meta.json"):
-        raise ValueError("Only metadata files (.meta.json) are supported")
-
-    # Load metadata to get passage sources
-    with open(passages_file, "r") as f:
-        meta = json.load(f)
-
-    passages = PassageManager(meta["passage_sources"])
-    logger.info(
-        f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
-    )
-
-    # Import protobuf after ensuring the path is correct
-    try:
-        from . import embedding_pb2
-    except ImportError as e:
-        logger.error(f"Failed to import protobuf module: {e}")
-        return
-
-    def zmq_server_thread():
-        """ZMQ server thread using REP socket for universal compatibility"""
-        context = zmq.Context()
-        socket = context.socket(
-            zmq.REP
-        )  # REP socket for both BaseSearcher and DiskANN C++ REQ clients
-        socket.bind(f"tcp://*:{zmq_port}")
-        logger.info(f"DiskANN ZMQ REP server listening on port {zmq_port}")
-
-        socket.setsockopt(zmq.RCVTIMEO, 300000)
-        socket.setsockopt(zmq.SNDTIMEO, 300000)
-
-        while True:
-            try:
-                # REP socket receives single-part messages
-                message = socket.recv()
-
-                # Check for empty messages - REP socket requires response to every request
-                if len(message) == 0:
-                    logger.debug("Received empty message, sending empty response")
-                    socket.send(b"")  # REP socket must respond to every request
-                    continue
-
-                logger.debug(f"Received ZMQ request of size {len(message)} bytes")
-                logger.debug(f"Message preview: {message[:50]}")  # Show first 50 bytes
-
-                e2e_start = time.time()
-
-                # Try protobuf first (for DiskANN C++ node_ids requests - primary use case)
-                texts = []
-                node_ids = []
-                is_text_request = False
-
-                try:
-                    req_proto = embedding_pb2.NodeEmbeddingRequest()
-                    req_proto.ParseFromString(message)
-                    node_ids = list(req_proto.node_ids)
-
-                    if not node_ids:
-                        raise RuntimeError(
-                            f"PROTOBUF: Received empty node_ids! Message size: {len(message)}"
-                        )
-
-                    logger.info(
-                        f"✅ PROTOBUF: Node ID request for {len(node_ids)} node embeddings: {node_ids[:10]}"
-                    )
-                except Exception as protobuf_error:
-                    logger.debug(f"Protobuf parsing failed: {protobuf_error}")
-                    # Fallback to msgpack (for BaseSearcher direct text requests)
-                    try:
-                        import msgpack
-
-                        request = msgpack.unpackb(message)
-                        # For BaseSearcher compatibility, request is a list of texts directly
-                        if isinstance(request, list) and all(
-                            isinstance(item, str) for item in request
-                        ):
-                            texts = request
-                            is_text_request = True
-                            logger.info(
-                                f"✅ MSGPACK: Direct text request for {len(texts)} texts"
-                            )
-                        else:
-                            raise ValueError("Not a valid msgpack text request")
-                    except Exception as msgpack_error:
-                        raise RuntimeError(
-                            f"Both protobuf and msgpack parsing failed! Protobuf: {protobuf_error}, Msgpack: {msgpack_error}"
-                        )
-
-                # Look up texts by node IDs (only if not direct text request)
-                if not is_text_request:
-                    for nid in node_ids:
-                        try:
-                            passage_data = passages.get_passage(str(nid))
-                            txt = passage_data["text"]
-                            if not txt:
-                                raise RuntimeError(
-                                    f"FATAL: Empty text for passage ID {nid}"
-                                )
-                            texts.append(txt)
-                        except KeyError as e:
-                            logger.error(f"Passage ID {nid} not found: {e}")
-                            raise e
-                        except Exception as e:
-                            logger.error(f"Exception looking up passage ID {nid}: {e}")
-                            raise
-
-                    # Debug logging
-                    logger.debug(f"Processing {len(texts)} texts")
-                    logger.debug(
-                        f"Text lengths: {[len(t) for t in texts[:5]]}"
-                    )  # Show first 5
-
-                # Process embeddings using unified computation
-                embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
-                logger.info(
-                    f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
-                )
-
-                # Prepare response based on request type
-                if is_text_request:
-                    # For BaseSearcher compatibility: return msgpack format
-                    import msgpack
-
-                    response_data = msgpack.packb(embeddings.tolist())
-                else:
-                    # For DiskANN C++ compatibility: return protobuf format
-                    resp_proto = embedding_pb2.NodeEmbeddingResponse()
-                    hidden_contiguous = np.ascontiguousarray(
-                        embeddings, dtype=np.float32
-                    )
-
-                    # Serialize embeddings data
-                    resp_proto.embeddings_data = hidden_contiguous.tobytes()
-                    resp_proto.dimensions.append(hidden_contiguous.shape[0])
-                    resp_proto.dimensions.append(hidden_contiguous.shape[1])
-
-                    response_data = resp_proto.SerializeToString()
-
-                # Send response back to the client
-                socket.send(response_data)
-
-                e2e_end = time.time()
-                logger.info(f"⏱️  ZMQ E2E time: {e2e_end - e2e_start:.6f}s")
-
-            except zmq.Again:
-                logger.debug("ZMQ socket timeout, continuing to listen")
-                continue
-            except Exception as e:
-                logger.error(f"Error in ZMQ server loop: {e}")
-                import traceback
-
-                traceback.print_exc()
-                raise
-
-    zmq_thread = threading.Thread(target=zmq_server_thread, daemon=True)
-    zmq_thread.start()
-    logger.info(f"Started DiskANN ZMQ server thread on port {zmq_port}")
-
-    # Keep the main thread alive
-    try:
-        while True:
-            time.sleep(1)
-    except KeyboardInterrupt:
-        logger.info("DiskANN Server shutting down...")
-        return
-
-
-if __name__ == "__main__":
-    import signal
-    import sys
-
-    def signal_handler(sig, frame):
-        logger.info(f"Received signal {sig}, shutting down gracefully...")
-        sys.exit(0)
-
-    # Register signal handlers for graceful shutdown
-    signal.signal(signal.SIGTERM, signal_handler)
-    signal.signal(signal.SIGINT, signal_handler)
-
-    parser = argparse.ArgumentParser(description="DiskANN Embedding service")
-    parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
-    parser.add_argument(
-        "--passages-file",
-        type=str,
-        help="Metadata JSON file containing passage sources",
-    )
-    parser.add_argument(
-        "--model-name",
-        type=str,
-        default="sentence-transformers/all-mpnet-base-v2",
-        help="Embedding model name",
-    )
-    parser.add_argument(
-        "--embedding-mode",
-        type=str,
-        default="sentence-transformers",
-        choices=["sentence-transformers", "openai", "mlx"],
-        help="Embedding backend mode",
-    )
-
-    args = parser.parse_args()
-
-    # Create and start the DiskANN embedding server
-    create_diskann_embedding_server(
-        passages_file=args.passages_file,
-        zmq_port=args.zmq_port,
-        model_name=args.model_name,
-        embedding_mode=args.embedding_mode,
-    )
--- a/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py
+++ b/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py
@@ -0,0 +1,741 @@
+#!/usr/bin/env python3
+"""
+Embedding server for leann-backend-diskann - Fixed ZMQ REQ-REP pattern
+"""
+
+import pickle
+import argparse
+import time
+import json
+from typing import Dict, Any, Optional, Union
+
+from transformers import AutoTokenizer, AutoModel
+import os
+from contextlib import contextmanager
+import zmq
+import numpy as np
+import msgpack
+from pathlib import Path
+import logging
+
+# ANSI colour codes for highlighted console output
+RED, RESET = "\033[91m", "\033[0m"
+
+# Logging verbosity comes from LEANN_LOG_LEVEL (case-insensitive); unknown names fall back to INFO
+LOG_LEVEL = os.environ.get('LEANN_LOG_LEVEL', 'INFO').upper()
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    level=getattr(logging, LOG_LEVEL, logging.INFO),
+)
+logger = logging.getLogger(__name__)
+
+# --- New Passage Loader from HNSW backend ---
+class SimplePassageLoader:
+    """In-memory mapping from passage ID to passage text.
+
+    Lookups accept int or str IDs; the ID is stringified before it is looked up.
+    """
+
+    def __init__(self, passages_data: Optional[Dict[str, Any]] = None):
+        self._meta_path = ''
+        self.passages_data = passages_data if passages_data else {}
+
+    def __getitem__(self, passage_id: Union[str, int]) -> Dict[str, str]:
+        """Return {"text": ...} for the ID, with empty text when the ID is unknown"""
+        key = str(passage_id)
+        found = key in self.passages_data
+        return {"text": self.passages_data[key] if found else ""}
+
+    def __len__(self) -> int:
+        return len(self.passages_data)
+
+    def keys(self):
+        """Expose the keys of the underlying passage mapping"""
+        return self.passages_data.keys()
+
+def load_passages_from_metadata(meta_file: str) -> SimplePassageLoader:
+    """
+    Build a lazy passage loader from an index metadata file.
+
+    Uses meta['passage_sources'] with leann's PassageManager and the pickled label map (int ID ->
+    string ID) "leann.labels.map" stored next to meta_file; raises FileNotFoundError if it is missing.
+    """
+    import sys
+    # Load metadata to get passage sources (read as UTF-8, like the JSONL passage files)
+    with open(meta_file, 'r', encoding='utf-8') as f:
+        meta = json.load(f)
+
+    # Make the sibling leann-core sources importable only while PassageManager is imported
+    current_dir = Path(__file__).parent
+    leann_core_path = str(current_dir.parent.parent / "leann-core" / "src")
+    sys.path.insert(0, leann_core_path)
+    try:
+        from leann.api import PassageManager
+        passage_manager = PassageManager(meta['passage_sources'])
+    finally:
+        # Remove by value: after the import, index 0 is not guaranteed to still be our entry
+        if leann_core_path in sys.path:
+            sys.path.remove(leann_core_path)
+
+    # Load label map
+    label_map_file = Path(meta_file).parent / "leann.labels.map"
+    if not label_map_file.exists():
+        raise FileNotFoundError(f"Label map file not found: {label_map_file}")
+    # NOTE(security): pickle.load can execute arbitrary code; only open trusted index directories
+    with open(label_map_file, 'rb') as f:
+        label_map = pickle.load(f)
+    print(f"Loaded label map with {len(label_map)} entries")
+
+    print(f"Initialized lazy passage loading for {len(label_map)} passages")
+
+    class LazyPassageLoader(SimplePassageLoader):
+        """Resolves integer labels to passage text on demand via the PassageManager."""
+        def __init__(self, passage_manager, label_map):
+            self.passage_manager = passage_manager
+            self.label_map = label_map
+            # Initialize parent with empty data
+            super().__init__({})
+
+        def __getitem__(self, passage_id: Union[str, int]) -> Dict[str, str]:
+            """Get passage by ID with lazy loading; every failure surfaces as RuntimeError"""
+            try:
+                int_id = int(passage_id)
+                if int_id not in self.label_map:
+                    raise RuntimeError(f"FATAL: ID {int_id} not found in label_map")
+                string_id = self.label_map[int_id]
+                passage_data = self.passage_manager.get_passage(string_id)
+                if passage_data and passage_data.get("text"):
+                    return {"text": passage_data["text"]}
+                raise RuntimeError(f"FATAL: Empty text for ID {int_id} -> {string_id}")
+            except RuntimeError:
+                # Already descriptive; wrapping again would only duplicate the "FATAL" prefix
+                raise
+            except Exception as e:
+                raise RuntimeError(f"FATAL: Exception getting passage {passage_id}: {e}") from e
+
+        def __len__(self) -> int:
+            return len(self.label_map)
+
+        def keys(self):
+            return self.label_map.keys()
+
+    loader = LazyPassageLoader(passage_manager, label_map)
+    loader._meta_path = meta_file
+    return loader
+
+def load_passages_from_file(passages_file: str) -> SimplePassageLoader:
+    """
+    Eagerly read a JSONL passages file and index its texts by integer label.
+
+    Expected line format: {"id": "passage_id", "text": "passage_text", "metadata": {...}}.
+    The pickled label map "leann.labels.map" beside the file translates integer labels to
+    those string IDs; label-map entries without a matching passage are skipped with a warning.
+    """
+
+    if not os.path.exists(passages_file):
+        raise FileNotFoundError(f"Passages file {passages_file} not found.")
+
+    if not passages_file.endswith('.jsonl'):
+        raise ValueError(f"Expected .jsonl file format, got: {passages_file}")
+
+    # The label map (int -> string_id) must sit in the same directory as the passages
+    label_map_file = Path(passages_file).parent / "leann.labels.map"
+    if not label_map_file.exists():
+        raise FileNotFoundError(f"Label map file not found: {label_map_file}")
+    with open(label_map_file, 'rb') as map_handle:
+        label_map = pickle.load(map_handle)
+    print(f"Loaded label map with {len(label_map)} entries")
+
+    # Collect passage text keyed by string ID, skipping blank lines
+    text_by_string_id = {}
+    with open(passages_file, 'r', encoding='utf-8') as jsonl:
+        for raw_line in jsonl:
+            if not raw_line.strip():
+                continue
+            record = json.loads(raw_line)
+            text_by_string_id[record['id']] = record['text']
+
+    # Re-key by stringified int ID using the label map
+    passages_data = {}
+    for int_id, string_id in label_map.items():
+        if string_id not in text_by_string_id:
+            print(f"WARNING: String ID {string_id} from label map not found in passages")
+            continue
+        passages_data[str(int_id)] = text_by_string_id[string_id]
+
+    print(f"Loaded {len(passages_data)} passages from JSONL file {passages_file} using label map")
+    return SimplePassageLoader(passages_data)
+
+def create_embedding_server_thread(
+    zmq_port=5555,
+    model_name="sentence-transformers/all-mpnet-base-v2",
+    max_batch_size=128,
+    passages_file: Optional[str] = None,
+    embedding_mode: str = "sentence-transformers",
+    enable_warmup: bool = False,
+):
+    """
+    Run the embedding server loop in the current thread; designed as a thread target.
+
+    Binds a ZMQ ROUTER socket on 127.0.0.1:zmq_port and serves MessagePack control
+    messages (__QUERY_META_PATH__, __UPDATE_META_PATH__, __QUERY_MODEL__, __UPDATE_MODEL__)
+    as well as protobuf NodeEmbeddingRequest messages. Returns early if the port is in use.
+
+    Args:
+        zmq_port: TCP port to bind on 127.0.0.1.
+        model_name: Model to use; a "text-embedding-" prefix switches the default mode to openai.
+        max_batch_size: Larger requests are embedded in chunks of at most this many texts.
+        passages_file: Passages or .meta.json file; an empty loader is used when it is missing.
+        embedding_mode: "sentence-transformers", "mlx" or "openai"; other values raise ValueError.
+        enable_warmup: Start a background thread that sends a few warmup requests.
+    """
+    logger.info(f"Initializing embedding server thread on port {zmq_port}")
+
+    try:
+        def check_port(port):
+            """Return True if something already accepts TCP connections on localhost:port"""
+            import socket  # local import: the name `socket` is reused for the ZMQ socket below
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                return s.connect_ex(('localhost', port)) == 0
+
+        if check_port(zmq_port):
+            print(f"{RED}Port {zmq_port} is already in use{RESET}")
+            return
+
+        # Auto-detect mode based on model name if not explicitly set
+        if embedding_mode == "sentence-transformers" and model_name.startswith("text-embedding-"):
+            embedding_mode = "openai"
+
+        if embedding_mode == "mlx":
+            from leann.api import compute_embeddings_mlx
+            import torch
+            logger.info("Using MLX for embeddings")
+            # Set device to CPU for compatibility with DeviceTimer class
+            device = torch.device("cpu")
+            cuda_available = False
+            mps_available = False
+        elif embedding_mode == "openai":
+            from leann.api import compute_embeddings_openai
+            import torch
+            logger.info("Using OpenAI API for embeddings")
+            # Set device to CPU for compatibility with DeviceTimer class
+            device = torch.device("cpu")
+            cuda_available = False
+            mps_available = False
+        elif embedding_mode == "sentence-transformers":
+            # Initialize model
+            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+            import torch
+
+            # Select device
+            mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
+            cuda_available = torch.cuda.is_available()
+
+            if cuda_available:
+                device = torch.device("cuda")
+                logger.info("Using CUDA device")
+            elif mps_available:
+                device = torch.device("mps")
+                logger.info("Using MPS device (Apple Silicon)")
+            else:
+                device = torch.device("cpu")
+                logger.info("Using CPU device")
+
+            # Load model
+            logger.info(f"Loading model {model_name}")
+            model = AutoModel.from_pretrained(model_name).to(device).eval()
+
+            # Optimize model
+            if cuda_available or mps_available:
+                try:
+                    model = model.half()
+                    model = torch.compile(model)
+                    logger.info(f"Using FP16 precision with model: {model_name}")
+                except Exception as e:
+                    print(f"WARNING: Model optimization failed: {e}")
+        else:
+            raise ValueError(f"Unsupported embedding mode: {embedding_mode}. Supported modes: sentence-transformers, mlx, openai")
+
+        # Load passages from file if provided
+        if passages_file and os.path.exists(passages_file):
+            # Check if it's a metadata file or a single passages file
+            if passages_file.endswith('.meta.json'):
+                passages = load_passages_from_metadata(passages_file)
+            else:
+                # Try to find metadata file in same directory
+                passages_dir = Path(passages_file).parent
+                meta_files = list(passages_dir.glob("*.meta.json"))
+                if meta_files:
+                    print(f"Found metadata file: {meta_files[0]}, using lazy loading")
+                    passages = load_passages_from_metadata(str(meta_files[0]))
+                else:
+                    # Fallback to original single file loading (will cause warnings)
+                    print("WARNING: No metadata file found, using single file loading (may cause missing passage warnings)")
+                    passages = load_passages_from_file(passages_file)
+        else:
+            print("WARNING: No passages file provided or file not found. Using an empty passage loader.")
+            passages = SimplePassageLoader()
+
+        logger.info(f"Loaded {len(passages)} passages.")
+
+        def client_warmup(zmq_port):
+            """Perform client-side warmup for DiskANN server"""
+            time.sleep(2)
+            print(f"Performing client-side warmup with model {model_name}...")
+
+            # Get actual passage IDs from the loaded passages
+            sample_ids = []
+            if hasattr(passages, 'keys') and len(passages) > 0:
+                available_ids = list(passages.keys())
+                # Take up to 5 actual IDs, but at least 1
+                sample_ids = available_ids[:min(5, len(available_ids))]
+                print(f"Using actual passage IDs for warmup: {sample_ids}")
+            else:
+                print("No passages available for warmup, skipping warmup...")
+                return
+
+            # Validate the IDs before any ZMQ resource is created, so the early exits leak nothing
+            try:
+                ids_to_send = [int(x) for x in sample_ids]
+            except (TypeError, ValueError):
+                print("Warning: Could not convert sample IDs to integers, skipping warmup")
+                return
+
+            if not ids_to_send:
+                print("Skipping warmup send.")
+                return
+
+            context = zmq.Context()
+            socket = context.socket(zmq.REQ)
+            try:
+                socket.connect(f"tcp://localhost:{zmq_port}")
+                socket.setsockopt(zmq.RCVTIMEO, 30000)
+                socket.setsockopt(zmq.SNDTIMEO, 30000)
+
+                # Use protobuf format for warmup
+                from . import embedding_pb2
+                req_proto = embedding_pb2.NodeEmbeddingRequest()
+                req_proto.node_ids.extend(ids_to_send)
+                request_bytes = req_proto.SerializeToString()
+
+                for i in range(3):
+                    print(f"Sending warmup request {i + 1}/3 via ZMQ (Protobuf)...")
+                    socket.send(request_bytes)
+                    response_bytes = socket.recv()
+
+                    resp_proto = embedding_pb2.NodeEmbeddingResponse()
+                    resp_proto.ParseFromString(response_bytes)
+                    embeddings_count = resp_proto.dimensions[0] if resp_proto.dimensions else 0
+                    print(f"Warmup request {i + 1}/3 successful, received {embeddings_count} embeddings")
+                    time.sleep(0.1)
+
+                print("Client-side Protobuf ZMQ warmup complete")
+            except Exception as e:
+                print(f"Error during Protobuf ZMQ warmup: {e}")
+            finally:
+                # linger=0 so term() cannot block on an unanswered warmup request
+                socket.close(linger=0)
+                context.term()
+
+        class DeviceTimer:
+            """Stage timer: CUDA events in sentence-transformers mode with CUDA, wall-clock time otherwise"""
+            def __init__(self, name="", device=device):
+                self.name = name
+                self.device = device
+                self.start_time = 0
+                self.end_time = 0
+
+                if embedding_mode == "sentence-transformers" and torch.cuda.is_available():
+                    self.start_event = torch.cuda.Event(enable_timing=True)
+                    self.end_event = torch.cuda.Event(enable_timing=True)
+                else:
+                    self.start_event = None
+                    self.end_event = None
+
+            @contextmanager
+            def timing(self):
+                self.start()
+                yield
+                self.end()
+
+            def start(self):
+                if embedding_mode == "sentence-transformers" and torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                    self.start_event.record()
+                else:
+                    if embedding_mode == "sentence-transformers" and self.device.type == "mps":
+                        torch.mps.synchronize()
+                    self.start_time = time.time()
+
+            def end(self):
+                if embedding_mode == "sentence-transformers" and torch.cuda.is_available():
+                    self.end_event.record()
+                    torch.cuda.synchronize()
+                else:
+                    if embedding_mode == "sentence-transformers" and self.device.type == "mps":
+                        torch.mps.synchronize()
+                    self.end_time = time.time()
+
+            def elapsed_time(self):
+                if embedding_mode == "sentence-transformers" and torch.cuda.is_available():
+                    return self.start_event.elapsed_time(self.end_event) / 1000.0
+                else:
+                    return self.end_time - self.start_time
+
+            def print_elapsed(self):
+                elapsed = self.elapsed_time()
+                print(f"[{self.name}] Elapsed time: {elapsed:.3f}s")
+
+        def process_batch_pytorch(texts_batch, ids_batch, missing_ids):
+            """Embed texts with the local model using masked mean pooling; empty texts are skipped"""
+            if not texts_batch:
+                return np.array([])
+
+            # Only non-empty texts are embedded (ids_batch and missing_ids are accepted but unused)
+            valid_texts = [text for text in texts_batch if text.strip()]
+
+            if not valid_texts:
+                print("WARNING: No valid texts in batch")
+                return np.array([])
+
+            # Tokenize
+            token_timer = DeviceTimer("tokenization")
+            with token_timer.timing():
+                inputs = tokenizer(
+                    valid_texts,
+                    padding=True,
+                    truncation=True,
+                    max_length=512,
+                    return_tensors="pt"
+                ).to(device)
+
+            # Compute embeddings
+            embed_timer = DeviceTimer("embedding computation")
+            with embed_timer.timing():
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    hidden_states = outputs.last_hidden_state
+
+                    # Mean pooling
+                    attention_mask = inputs['attention_mask']
+                    mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
+                    sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
+                    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
+                    batch_embeddings = sum_embeddings / sum_mask
+            embed_timer.print_elapsed()  # after the with-block, once end() has recorded the stop time
+
+            return batch_embeddings.cpu().numpy()
+
+        # ZMQ server main loop - ROUTER socket, so every reply is addressed by client identity
+        context = zmq.Context()
+        socket = context.socket(zmq.ROUTER)
+        socket.bind(f"tcp://127.0.0.1:{zmq_port}")
+        print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}")
+
+        # Set timeouts
+        socket.setsockopt(zmq.RCVTIMEO, 5000)  # 5 second receive timeout
+        socket.setsockopt(zmq.SNDTIMEO, 300000)  # 300 second send timeout
+
+        from . import embedding_pb2
+        print(f"INFO: Embedding server ready to serve requests")
+
+        # Start warmup thread if enabled
+        if enable_warmup and len(passages) > 0:
+            import threading
+            print(f"Warmup enabled: starting warmup thread")
+            warmup_thread = threading.Thread(target=client_warmup, args=(zmq_port,))
+            warmup_thread.daemon = True
+            warmup_thread.start()
+        else:
+            print(f"Warmup disabled or no passages available (enable_warmup={enable_warmup}, passages={len(passages)})")
+
+        while True:
+            # Reset per iteration so the error path never replies to a client from an earlier request
+            identity = None
+            try:
+                parts = socket.recv_multipart()
+
+                # ROUTER frames are [identity, empty delimiter, payload]; check length to avoid IndexError
+                if len(parts) >= 3:
+                    identity = parts[0]
+                    message = parts[2]
+                elif len(parts) == 2:
+                    # Can also handle cases without empty frame
+                    identity = parts[0]
+                    message = parts[1]
+                else:
+                    # If received message format is wrong, print warning and ignore it instead of crashing
+                    print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.")
+                    continue
+                print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes")
+
+                # Handle control messages (MessagePack format)
+                try:
+                    request_payload = msgpack.unpackb(message)
+                    if isinstance(request_payload, list) and len(request_payload) >= 1:
+                        if request_payload[0] == "__QUERY_META_PATH__":
+                            # Return the current meta path being used by the server
+                            current_meta_path = getattr(passages, '_meta_path', '')
+                            response = [current_meta_path]
+                            socket.send_multipart([identity, b'', msgpack.packb(response)])
+                            continue
+
+                        elif request_payload[0] == "__UPDATE_META_PATH__" and len(request_payload) >= 2:
+                            # Update the server's meta path and reload passages
+                            new_meta_path = request_payload[1]
+                            try:
+                                print(f"INFO: Updating server meta path to: {new_meta_path}")
+                                # Reload passages from the new meta file
+                                passages = load_passages_from_metadata(new_meta_path)
+                                # Store the meta path for future queries
+                                passages._meta_path = new_meta_path
+                                response = ["SUCCESS"]
+                                print(f"INFO: Successfully updated meta path and reloaded {len(passages)} passages")
+                            except Exception as e:
+                                print(f"ERROR: Failed to update meta path: {e}")
+                                response = ["FAILED", str(e)]
+                            socket.send_multipart([identity, b'', msgpack.packb(response)])
+                            continue
+
+                        elif request_payload[0] == "__QUERY_MODEL__":
+                            # Return the current model being used by the server
+                            response = [model_name]
+                            socket.send_multipart([identity, b'', msgpack.packb(response)])
+                            continue
+
+                        elif request_payload[0] == "__UPDATE_MODEL__" and len(request_payload) >= 2:
+                            # Update the server's embedding model
+                            new_model_name = request_payload[1]
+                            try:
+                                print(f"INFO: Updating server model from {model_name} to: {new_model_name}")
+
+                                # Only the sentence-transformers mode holds a local model to swap
+                                if embedding_mode == "sentence-transformers":
+                                    # Load into temporaries so a failed load keeps the current
+                                    # tokenizer/model pair intact and consistent
+                                    print(f"Loading new tokenizer for {new_model_name}...")
+                                    new_tokenizer = AutoTokenizer.from_pretrained(new_model_name, use_fast=True)
+
+                                    print(f"Loading new model {new_model_name}...")
+                                    new_model = AutoModel.from_pretrained(new_model_name).to(device).eval()
+
+                                    # Optimize new model
+                                    if cuda_available or mps_available:
+                                        try:
+                                            new_model = new_model.half()
+                                            new_model = torch.compile(new_model)
+                                            print(f"INFO: Using FP16 precision with model: {new_model_name}")
+                                        except Exception as e:
+                                            print(f"WARNING: Model optimization failed: {e}")
+
+                                    # Swap in the new pair; rebinding releases this function's references to the old one
+                                    print("INFO: Releasing old model from memory...")
+                                    tokenizer, model = new_tokenizer, new_model
+
+                                    # Clear GPU cache if available
+                                    if device.type == "cuda":
+                                        torch.cuda.empty_cache()
+                                        print("INFO: Cleared CUDA cache")
+                                    elif device.type == "mps":
+                                        torch.mps.empty_cache()
+                                        print("INFO: Cleared MPS cache")
+
+                                    # Force garbage collection
+                                    import gc
+                                    gc.collect()
+                                    print("INFO: Memory cleanup completed")
+
+                                # Update model name
+                                model_name = new_model_name
+
+                                response = ["SUCCESS"]
+                                print(f"INFO: Successfully updated model to: {new_model_name}")
+                            except Exception as e:
+                                print(f"ERROR: Failed to update model: {e}")
+                                response = ["FAILED", str(e)]
+                            socket.send_multipart([identity, b'', msgpack.packb(response)])
+                            continue
+                except Exception:
+                    # Not a control message, continue with normal protobuf processing
+                    pass
+
+                e2e_start = time.time()
+                lookup_timer = DeviceTimer("text lookup")
+
+                # Parse request
+                req_proto = embedding_pb2.NodeEmbeddingRequest()
+                req_proto.ParseFromString(message)
+                node_ids = req_proto.node_ids
+                print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}")
+
+                if len(node_ids) > 0:
+                    print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}")
+
+                # Look up texts
+                texts = []
+                missing_ids = []
+                with lookup_timer.timing():
+                    for nid in node_ids:
+                        txtinfo = passages[nid]
+                        txt = txtinfo["text"]
+                        if txt:
+                            texts.append(txt)
+                        else:
+                            # Empty text: keep a placeholder for batch processing and record the ID as missing
+                            texts.append("")
+                            missing_ids.append(nid)
+                lookup_timer.print_elapsed()
+
+                if missing_ids:
+                    print(f"WARNING: Missing passages for IDs: {missing_ids}")
+
+                # Process batch
+                total_size = len(texts)
+                print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}")
+                all_embeddings = []
+
+                if total_size > max_batch_size:
+                    print(f"INFO: Splitting batch of size {total_size} into chunks of {max_batch_size}")
+                    for i in range(0, total_size, max_batch_size):
+                        end_idx = min(i + max_batch_size, total_size)
+                        print(f"INFO: Processing chunk {i//max_batch_size + 1}/{(total_size + max_batch_size - 1)//max_batch_size}: items {i} to {end_idx-1}")
+
+                        chunk_texts = texts[i:end_idx]
+                        chunk_ids = node_ids[i:end_idx]
+
+                        if embedding_mode == "mlx":
+                            embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name, batch_size=16)
+                        elif embedding_mode == "openai":
+                            embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name)
+                        else:  # sentence-transformers
+                            embeddings_chunk = process_batch_pytorch(chunk_texts, chunk_ids, missing_ids)
+                        all_embeddings.append(embeddings_chunk)
+
+                        if embedding_mode == "sentence-transformers":
+                            if cuda_available:
+                                torch.cuda.empty_cache()
+                            elif device.type == "mps":
+                                torch.mps.empty_cache()
+
+                    hidden = np.vstack(all_embeddings)
+                    print(f"INFO: Combined embeddings shape: {hidden.shape}")
+                else:
+                    if embedding_mode == "mlx":
+                        hidden = compute_embeddings_mlx(texts, model_name, batch_size=16)
+                    elif embedding_mode == "openai":
+                        hidden = compute_embeddings_openai(texts, model_name)
+                    else:  # sentence-transformers
+                        hidden = process_batch_pytorch(texts, node_ids, missing_ids)
+
+                # Serialize response
+                ser_start = time.time()
+
+                resp_proto = embedding_pb2.NodeEmbeddingResponse()
+                hidden_contiguous = np.ascontiguousarray(hidden, dtype=np.float32)
+                resp_proto.embeddings_data = hidden_contiguous.tobytes()
+                resp_proto.dimensions.append(hidden_contiguous.shape[0])
+                resp_proto.dimensions.append(hidden_contiguous.shape[1])
+                resp_proto.missing_ids.extend(missing_ids)
+
+                response_data = resp_proto.SerializeToString()
+                # ROUTER reply: identity frame, empty delimiter, then the payload
+                socket.send_multipart([identity, b'', response_data])
+                ser_end = time.time()
+                print(f"INFO: Serialize time: {ser_end - ser_start:.6f} seconds")
+
+                if embedding_mode == "sentence-transformers":
+                    if device.type == "cuda":
+                        torch.cuda.synchronize()
+                    elif device.type == "mps":
+                        torch.mps.synchronize()
+                e2e_end = time.time()
+                print(f"INFO: ZMQ E2E time: {e2e_end - e2e_start:.6f} seconds")
+
+            except zmq.Again:
+                print("INFO: ZMQ socket timeout, continuing to listen")
+                continue
+            except Exception as e:
+                print(f"ERROR: Error in ZMQ server: {e}")
+                try:
+                    # Reply with an empty response, addressed through the ROUTER envelope,
+                    # so the waiting REQ client is not left blocked
+                    if identity is not None:
+                        empty_resp = embedding_pb2.NodeEmbeddingResponse()
+                        socket.send_multipart([identity, b'', empty_resp.SerializeToString()])
+                except Exception:
+                    # If sending fails, recreate the socket with the same type and options
+                    socket.close()
+                    socket = context.socket(zmq.ROUTER)
+                    socket.bind(f"tcp://127.0.0.1:{zmq_port}")
+                    socket.setsockopt(zmq.RCVTIMEO, 5000)
+                    socket.setsockopt(zmq.SNDTIMEO, 300000)
+                    print("INFO: ZMQ socket recreated after error")
+
+    except Exception as e:
+        print(f"ERROR: Failed to start embedding server: {e}")
+        raise
+
+
+def create_embedding_server(
+    domain="demo",
+    load_passages=True,
+    load_embeddings=False,
+    use_fp16=True,
+    use_int8=False,
+    use_cuda_graphs=False,
+    zmq_port=5555,
+    max_batch_size=128,
+    lazy_load_passages=False,
+    model_name="sentence-transformers/all-mpnet-base-v2",
+    passages_file: Optional[str] = None,
+    embedding_mode: str = "sentence-transformers",
+    enable_warmup: bool = False,
+):
+    """Blocking entry point for direct runs; keeps the original create_embedding_server signature.
+    Only the arguments forwarded below are used; the remaining ones are accepted and ignored."""
+    create_embedding_server_thread(
+        zmq_port=zmq_port, model_name=model_name, max_batch_size=max_batch_size,
+        passages_file=passages_file, embedding_mode=embedding_mode, enable_warmup=enable_warmup)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, then run the blocking server
+    cli = argparse.ArgumentParser(description="Embedding service")
+    cli.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
+    cli.add_argument("--domain", type=str, default="demo", help="Domain name")
+    cli.add_argument("--passages-file", type=str, help="JSON file containing passage ID to text mapping")
+    cli.add_argument("--load-passages", action="store_true", default=True)
+    cli.add_argument("--load-embeddings", action="store_true", default=False)
+    cli.add_argument("--use-fp16", action="store_true", default=False)
+    cli.add_argument("--use-int8", action="store_true", default=False)
+    cli.add_argument("--use-cuda-graphs", action="store_true", default=False)
+    cli.add_argument("--max-batch-size", type=int, default=128, help="Maximum batch size before splitting")
+    cli.add_argument("--lazy-load-passages", action="store_true", default=True)
+    cli.add_argument("--model-name", type=str, default="sentence-transformers/all-mpnet-base-v2",
+                     help="Embedding model name")
+    cli.add_argument("--embedding-mode", type=str, default="sentence-transformers",
+                     choices=["sentence-transformers", "mlx", "openai"],
+                     help="Embedding backend mode")
+    cli.add_argument("--use-mlx", action="store_true", default=False, help="Use MLX backend for embeddings (deprecated: use --embedding-mode mlx)")
+    cli.add_argument("--disable-warmup", action="store_true", default=False, help="Disable warmup requests on server start")
+    opts = cli.parse_args()
+
+    # The deprecated --use-mlx flag takes precedence over --embedding-mode
+    selected_mode = "mlx" if opts.use_mlx else opts.embedding_mode
+
+    # --disable-warmup is inverted into the enable_warmup keyword
+    create_embedding_server(
+        domain=opts.domain,
+        load_passages=opts.load_passages,
+        load_embeddings=opts.load_embeddings,
+        use_fp16=opts.use_fp16,
+        use_int8=opts.use_int8,
+        use_cuda_graphs=opts.use_cuda_graphs,
+        zmq_port=opts.zmq_port,
+        max_batch_size=opts.max_batch_size,
+        lazy_load_passages=opts.lazy_load_passages,
+        model_name=opts.model_name,
+        passages_file=opts.passages_file,
+        embedding_mode=selected_mode,
+        enable_warmup=not opts.disable_warmup,
+    )
--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -8,12 +8,9 @@ version = "0.1.0"
 dependencies = ["leann-core==0.1.0", "numpy"]

 [tool.scikit-build]
-# Key: simplified CMake path
+# Key: simplified CMake path
 cmake.source-dir = "third_party/DiskANN"
-# Key: Python package in root directory, paths match exactly
+# Key: Python package in root directory, paths match exactly
 wheel.packages = ["leann_backend_diskann"]
-# Use default redirect mode
-editable.mode = "redirect"
-cmake.build-type = "Release"
-build.verbose = true
-build.tool-args = ["-j8"]
+# Use default redirect mode
+editable.mode = "redirect"
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -1,7 +1,6 @@
+# Final simplified version
 cmake_minimum_required(VERSION 3.24)
 project(leann_backend_hnsw_wrapper)
-set(CMAKE_C_COMPILER_WORKS 1)
-set(CMAKE_CXX_COMPILER_WORKS 1)

 # Set OpenMP path for macOS
 if(APPLE)
@@ -12,9 +11,15 @@ if(APPLE)
    set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
 endif()

-# Use system ZeroMQ instead of building from source
-find_package(PkgConfig REQUIRED)
-pkg_check_modules(ZMQ REQUIRED libzmq)
+# Build ZeroMQ from source
+set(ZMQ_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+set(ENABLE_DRAFTS OFF CACHE BOOL "" FORCE)
+set(ENABLE_PRECOMPILED OFF CACHE BOOL "" FORCE)
+set(WITH_PERF_TOOL OFF CACHE BOOL "" FORCE)
+set(WITH_DOCS OFF CACHE BOOL "" FORCE)
+set(BUILD_SHARED OFF CACHE BOOL "" FORCE)
+set(BUILD_STATIC ON CACHE BOOL "" FORCE)
+add_subdirectory(third_party/libzmq)

 # Add cppzmq headers
 include_directories(third_party/cppzmq)
@@ -24,7 +29,6 @@ set(MSGPACK_USE_BOOST OFF CACHE BOOL "" FORCE)
 add_compile_definitions(MSGPACK_NO_BOOST)
 include_directories(third_party/msgpack-c/include)

-# Faiss configuration - streamlined build
 set(FAISS_ENABLE_PYTHON ON CACHE BOOL "" FORCE)
 set(FAISS_ENABLE_GPU OFF CACHE BOOL "" FORCE)
 set(FAISS_ENABLE_EXTRAS OFF CACHE BOOL "" FORCE)
@@ -32,24 +36,4 @@ set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
 set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE)
 set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)

-# Disable additional SIMD versions to speed up compilation
-set(FAISS_ENABLE_AVX2 OFF CACHE BOOL "" FORCE)
-set(FAISS_ENABLE_AVX512 OFF CACHE BOOL "" FORCE)
-
-# Additional optimization options from INSTALL.md
-set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
-set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)  # Static library is faster to build
-
-# Avoid building demos and benchmarks
-set(BUILD_DEMOS OFF CACHE BOOL "" FORCE)
-set(BUILD_BENCHS OFF CACHE BOOL "" FORCE)
-
-# NEW: Tell Faiss to only build the generic version
-set(FAISS_BUILD_GENERIC ON CACHE BOOL "" FORCE)
-set(FAISS_BUILD_AVX2 OFF CACHE BOOL "" FORCE)
-set(FAISS_BUILD_AVX512 OFF CACHE BOOL "" FORCE)
-
-# IMPORTANT: Disable building AVX versions to speed up compilation
-set(FAISS_BUILD_AVX_VERSIONS OFF CACHE BOOL "" FORCE)
-
 add_subdirectory(third_party/faiss)
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
@@ -1,9 +1,10 @@
 import numpy as np
 import os
 from pathlib import Path
-from typing import Dict, Any, List, Literal, Optional
+from typing import Dict, Any, List, Literal
+import pickle
 import shutil
-import logging
+import time

 from leann.searcher_base import BaseSearcher
 from .convert_to_csr import convert_hnsw_graph_to_csr
@@ -15,8 +16,6 @@ from leann.interface import (
    LeannBackendSearcherInterface,
 )

-logger = logging.getLogger(__name__)
-

 def get_metric_map():
    from . import faiss  # type: ignore
@@ -58,9 +57,13 @@ class HNSWBuilder(LeannBackendBuilderInterface):
        index_dir.mkdir(parents=True, exist_ok=True)

        if data.dtype != np.float32:
-            logger.warning(f"Converting data to float32, shape: {data.shape}")
            data = data.astype(np.float32)

+        label_map = {i: str_id for i, str_id in enumerate(ids)}
+        label_map_file = index_dir / "leann.labels.map"
+        with open(label_map_file, "wb") as f:
+            pickle.dump(label_map, f)
+
        metric_enum = get_metric_map().get(self.distance_metric.lower())
        if metric_enum is None:
            raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.")
@@ -82,7 +85,7 @@ class HNSWBuilder(LeannBackendBuilderInterface):
    def _convert_to_csr(self, index_file: Path):
        """Convert built index to CSR format"""
        mode_str = "CSR-pruned" if self.is_recompute else "CSR-standard"
-        logger.info(f"INFO: Converting HNSW index to {mode_str} format...")
+        print(f"INFO: Converting HNSW index to {mode_str} format...")

        csr_temp_file = index_file.with_suffix(".csr.tmp")

@@ -91,11 +94,11 @@ class HNSWBuilder(LeannBackendBuilderInterface):
        )

        if success:
-            logger.info("✅ CSR conversion successful.")
+            print("✅ CSR conversion successful.")
            index_file_old = index_file.with_suffix(".old")
            shutil.move(str(index_file), str(index_file_old))
            shutil.move(str(csr_temp_file), str(index_file))
-            logger.info(
+            print(
                f"INFO: Replaced original index with {mode_str} version at '{index_file}'"
            )
        else:
@@ -132,22 +135,31 @@ class HNSWSearcher(BaseSearcher):

        hnsw_config = faiss.HNSWIndexConfig()
        hnsw_config.is_compact = self.is_compact
-        hnsw_config.is_recompute = (
-            self.is_pruned
-        )  # In C++ code, it's called is_recompute, but it's only for loading IIUC.
+        hnsw_config.is_recompute = self.is_pruned or kwargs.get("is_recompute", False)
+
+        if self.is_pruned and not hnsw_config.is_recompute:
+            raise RuntimeError("Index is pruned but recompute is disabled.")

        self._index = faiss.read_index(str(index_file), faiss.IO_FLAG_MMAP, hnsw_config)
+        
+        # Load label mapping
+        label_map_file = self.index_dir / "leann.labels.map"
+        if not label_map_file.exists():
+            raise FileNotFoundError(f"Label map file not found at {label_map_file}")
+        
+        with open(label_map_file, "rb") as f:
+            self.label_map = pickle.load(f)

    def search(
        self,
        query: np.ndarray,
        top_k: int,
-        zmq_port: Optional[int] = None,
        complexity: int = 64,
        beam_width: int = 1,
        prune_ratio: float = 0.0,
-        recompute_embeddings: bool = True,
+        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
+        zmq_port: int = 5557,
        batch_size: int = 0,
        **kwargs,
    ) -> Dict[str, Any]:
@@ -165,7 +177,7 @@ class HNSWSearcher(BaseSearcher):
                - "global": Use global PQ queue size for selection (default)
                - "local": Local pruning, sort and select best candidates
                - "proportional": Base selection on new neighbor count ratio
-            zmq_port: ZMQ port for embedding server communication. Must be provided if recompute_embeddings is True.
+            zmq_port: ZMQ port for embedding server
            batch_size: Neighbor processing batch size, 0=disabled (HNSW-specific)
            **kwargs: Additional HNSW-specific parameters (for legacy compatibility)

@@ -174,14 +186,15 @@ class HNSWSearcher(BaseSearcher):
        """
        from . import faiss  # type: ignore

-        if not recompute_embeddings:
-            if self.is_pruned:
-                raise RuntimeError("Recompute is required for pruned index.")
-        if recompute_embeddings:
-            if zmq_port is None:
-                raise ValueError(
-                    "zmq_port must be provided if recompute_embeddings is True"
+        # Use recompute_embeddings parameter
+        use_recompute = recompute_embeddings or self.is_pruned
+        if use_recompute:
+            meta_file_path = self.index_dir / f"{self.index_path.name}.meta.json"
+            if not meta_file_path.exists():
+                raise RuntimeError(
+                    f"FATAL: Recompute enabled but metadata file not found: {meta_file_path}"
                )
+            self._ensure_server_running(str(meta_file_path), port=zmq_port, **kwargs)

        if query.dtype != np.float32:
            query = query.astype(np.float32)
@@ -189,10 +202,7 @@ class HNSWSearcher(BaseSearcher):
            faiss.normalize_L2(query)

        params = faiss.SearchParametersHNSW()
-        if zmq_port is not None:
-            params.zmq_port = (
-                zmq_port  # C++ code won't use this if recompute_embeddings is False
-            )
+        params.zmq_port = zmq_port
        params.efSearch = complexity
        params.beam_size = beam_width

@@ -229,7 +239,11 @@ class HNSWSearcher(BaseSearcher):
        )

        string_labels = [
-            [str(int_label) for int_label in batch_labels] for batch_labels in labels
+            [
+                self.label_map.get(int_label, f"unknown_{int_label}")
+                for int_label in batch_labels
+            ]
+            for batch_labels in labels
        ]

        return {"labels": string_labels, "distances": distances}
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -13,10 +13,5 @@ dependencies = ["leann-core==0.1.0", "numpy"]
 [tool.scikit-build]
 wheel.packages = ["leann_backend_hnsw"]
 editable.mode = "redirect"
-cmake.build-type = "Release"
-build.verbose = true
-build.tool-args = ["-j8"]
-
-# CMake definitions to optimize compilation
-[tool.scikit-build.cmake.define]
-CMAKE_BUILD_PARALLEL_LEVEL = "8"
+cmake.build-type = "Debug"
+build.verbose = true
--- a/packages/leann-backend-hnsw/third_party/faiss
+++ b/packages/leann-backend-hnsw/third_party/faiss
--- a/packages/leann-backend-hnsw/third_party/msgpack-c
+++ b/packages/leann-backend-hnsw/third_party/msgpack-c
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -15,8 +15,5 @@ dependencies = [
    "tqdm>=4.60.0"
 ]

-[project.scripts]
-leann = "leann.cli:main"
-
 [tool.setuptools.packages.find]
 where = ["src"]
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -5,18 +5,16 @@ with the correct, original embedding logic from the user's reference code.

 import json
 import pickle
-from leann.interface import LeannBackendSearcherInterface
 import numpy as np
-import time
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Literal
 from dataclasses import dataclass, field
+import uuid
+import torch
+
 from .registry import BACKEND_REGISTRY
 from .interface import LeannBackendFactoryInterface
 from .chat import get_llm
-import logging
-
-logger = logging.getLogger(__name__)


 def compute_embeddings(
@@ -24,8 +22,7 @@ def compute_embeddings(
    model_name: str,
    mode: str = "sentence-transformers",
    use_server: bool = True,
-    port: Optional[int] = None,
-    is_build=False,
+    use_mlx: bool = False  # Backward compatibility: if True, override mode to 'mlx',
 ) -> np.ndarray:
    """
    Computes embeddings using different backends.
@@ -42,63 +39,251 @@ def compute_embeddings(
    Returns:
        numpy array of embeddings
    """
-    if use_server:
-        # Use embedding server (for search/query)
-        if port is None:
-            raise ValueError("port is required when use_server is True")
-        return compute_embeddings_via_server(chunks, model_name, port=port)
+    # Override mode for backward compatibility
+    if use_mlx:
+        mode = "mlx"
+
+    # Auto-detect mode based on model name if not explicitly set
+    if mode == "sentence-transformers" and model_name.startswith("text-embedding-"):
+        mode = "openai"
+
+    if mode == "mlx":
+        return compute_embeddings_mlx(chunks, model_name, batch_size=16)
+    elif mode == "openai":
+        return compute_embeddings_openai(chunks, model_name)
+    elif mode == "sentence-transformers":
+        return compute_embeddings_sentence_transformers(
+            chunks, model_name, use_server=use_server
+        )
    else:
-        # Use direct computation (for build_index)
-        from .embedding_compute import (
-            compute_embeddings as compute_embeddings_direct,
-        )
-
-        return compute_embeddings_direct(
-            chunks,
-            model_name,
-            mode=mode,
-            is_build=is_build,
+        raise ValueError(
+            f"Unsupported embedding mode: {mode}. Supported modes: sentence-transformers, mlx, openai"
        )


-def compute_embeddings_via_server(
-    chunks: List[str], model_name: str, port: int
+def compute_embeddings_sentence_transformers(
+    chunks: List[str], model_name: str, use_server: bool = True
 ) -> np.ndarray:
    """Computes embeddings using sentence-transformers.

    Args:
        chunks: List of text chunks to embed
        model_name: Name of the sentence transformer model
+        use_server: If True, use embedding server (good for search). If False, use direct computation (good for build).
    """
-    logger.info(
-        f"Computing embeddings for {len(chunks)} chunks using SentenceTransformer model '{model_name}' (via embedding server)..."
+    if not use_server:
+        print(
+            f"INFO: Computing embeddings for {len(chunks)} chunks using SentenceTransformer model '{model_name}' (direct)..."
+        )
+        return _compute_embeddings_sentence_transformers_direct(chunks, model_name)
+
+    print(
+        f"INFO: Computing embeddings for {len(chunks)} chunks using SentenceTransformer model '{model_name}' (via embedding server)..."
    )
-    import zmq
-    import msgpack
-    import numpy as np

-    # Connect to embedding server
-    context = zmq.Context()
-    socket = context.socket(zmq.REQ)
-    socket.connect(f"tcp://localhost:{port}")
+    # Use embedding server for sentence-transformers too
+    # This avoids loading the model twice (once in API, once in server)
+    try:
+        # Import ZMQ client functionality and server manager
+        import zmq
+        import msgpack
+        import numpy as np
+        from .embedding_server_manager import EmbeddingServerManager

-    # Send chunks to server for embedding computation
-    request = chunks
-    socket.send(msgpack.packb(request))
+        # Ensure embedding server is running
+        port = 5557
+        server_manager = EmbeddingServerManager(
+            backend_module_name="leann_backend_hnsw.hnsw_embedding_server"
+        )

-    # Receive embeddings from server
-    response = socket.recv()
-    embeddings_list = msgpack.unpackb(response)
+        server_started = server_manager.start_server(
+            port=port,
+            model_name=model_name,
+            embedding_mode="sentence-transformers",
+            enable_warmup=False,
+        )

-    # Convert back to numpy array
-    embeddings = np.array(embeddings_list, dtype=np.float32)
+        if not server_started:
+            raise RuntimeError(f"Failed to start embedding server on port {port}")

-    socket.close()
-    context.term()
+        # Connect to embedding server
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        socket.connect(f"tcp://localhost:{port}")
+
+        # Send chunks to server for embedding computation
+        request = chunks
+        socket.send(msgpack.packb(request))
+
+        # Receive embeddings from server
+        response = socket.recv()
+        embeddings_list = msgpack.unpackb(response)
+
+        # Convert back to numpy array
+        embeddings = np.array(embeddings_list, dtype=np.float32)
+
+        socket.close()
+        context.term()
+
+        return embeddings
+
+    except Exception as e:
+        # Fallback to direct sentence-transformers if server connection fails
+        print(
+            f"Warning: Failed to connect to embedding server, falling back to direct computation: {e}"
+        )
+        return _compute_embeddings_sentence_transformers_direct(chunks, model_name)
+
+
+def _compute_embeddings_sentence_transformers_direct(
+    chunks: List[str], model_name: str
+) -> np.ndarray:
+    """Compute embeddings in-process with sentence-transformers.
+
+    Called directly when the embedding server is not wanted, and as the
+    fallback when the server path fails.
+
+    Args:
+        chunks: Text chunks to embed.
+        model_name: Model identifier passed to ``SentenceTransformer``.
+
+    Returns:
+        The result of ``model.encode(..., convert_to_numpy=True)``.
+
+    Raises:
+        RuntimeError: If sentence-transformers is not installed.
+    """
+    try:
+        from sentence_transformers import SentenceTransformer
+    except ImportError as e:
+        raise RuntimeError(
+            "sentence-transformers not available. Install with: uv pip install sentence-transformers"
+        ) from e
+
+    # Load model using sentence-transformers
+    model = SentenceTransformer(model_name)
+
+    print(
+        f"INFO: Computing embeddings for {len(chunks)} chunks using SentenceTransformer model '{model_name}' (direct)..."
+    )
+
+    # Use an accelerator when one is available: a CUDA GPU or the Apple MPS GPU.
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = None
+
+    if device is not None:
+        # Half precision is only applied together with the move to an
+        # accelerator. With no accelerator the model keeps its default
+        # precision, because fp16 inference on CPU is slow and some
+        # operators are not implemented for it.
+        model = model.half().to(device)
+
+    # Generate embeddings.
+    # If this runs out of memory, the batch size below needs to be reduced.
+    embeddings = model.encode(
+        chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=16
+    )

     return embeddings


+def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
+    """Compute embeddings using the OpenAI embeddings API.
+
+    Chunks are sent in batches of at most 100 inputs per request.
+
+    Args:
+        chunks: Text chunks to embed.
+        model_name: OpenAI embedding model name passed to the API.
+
+    Returns:
+        A float32 array with one row per chunk. For empty ``chunks`` an
+        array of shape ``(0, 0)`` is returned and no request is sent.
+
+    Raises:
+        RuntimeError: If the ``openai`` package is not installed or the
+            ``OPENAI_API_KEY`` environment variable is not set.
+        Exception: Any error raised by the API call is printed and re-raised.
+    """
+    import os
+
+    try:
+        import openai
+    except ImportError as e:
+        raise RuntimeError(
+            "openai not available. Install with: uv pip install openai"
+        ) from e
+
+    # Get API key from environment
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        raise RuntimeError("OPENAI_API_KEY environment variable not set")
+
+    client = openai.OpenAI(api_key=api_key)
+
+    print(
+        f"INFO: Computing embeddings for {len(chunks)} chunks using OpenAI model '{model_name}'..."
+    )
+
+    # OpenAI has a limit on batch size and input length
+    max_batch_size = 100  # Conservative batch size
+    all_embeddings = []
+    batch_starts = range(0, len(chunks), max_batch_size)
+
+    try:
+        from tqdm import tqdm
+        batch_iterator = tqdm(
+            batch_starts, desc="Computing embeddings", unit="batch", total=len(batch_starts)
+        )
+    except ImportError:
+        # Fallback without progress bar
+        batch_iterator = batch_starts
+
+    for i in batch_iterator:
+        batch_chunks = chunks[i:i + max_batch_size]
+
+        try:
+            response = client.embeddings.create(model=model_name, input=batch_chunks)
+            all_embeddings.extend(item.embedding for item in response.data)
+        except Exception as e:
+            print(f"ERROR: Failed to get embeddings for batch starting at {i}: {e}")
+            raise
+
+    if all_embeddings:
+        embeddings = np.array(all_embeddings, dtype=np.float32)
+    else:
+        # np.array([]) is 1-D, which would make the shape[1] lookup below
+        # raise IndexError; keep the result 2-D for empty input.
+        embeddings = np.empty((0, 0), dtype=np.float32)
+    print(
+        f"INFO: Generated {len(embeddings)} embeddings with dimension {embeddings.shape[1]}"
+    )
+    return embeddings
+
+
+def compute_embeddings_mlx(chunks: List[str], model_name: str, batch_size: int = 16) -> np.ndarray:
+    """Compute embeddings using an MLX model.
+
+    Each chunk is tokenized, the batch is right-padded to a common length,
+    and the model output is mean-pooled over the real (non-padding) token
+    positions of each sequence.
+
+    Args:
+        chunks: Text chunks to embed.
+        model_name: Model identifier passed to ``mlx_lm.utils.load``.
+        batch_size: Number of chunks processed per model call.
+
+    Returns:
+        A float32 array with one row per chunk. For empty ``chunks`` an
+        array of shape ``(0, 0)`` is returned without loading the model.
+
+    Raises:
+        RuntimeError: If MLX or mlx-lm is not installed.
+    """
+    try:
+        import mlx.core as mx
+        from mlx_lm.utils import load
+    except ImportError as e:
+        raise RuntimeError(
+            "MLX or related libraries not available. Install with: uv pip install mlx mlx-lm"
+        ) from e
+
+    print(
+        f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}' with batch_size={batch_size}..."
+    )
+
+    if not chunks:
+        # np.stack([]) raises ValueError; return an empty result instead.
+        return np.empty((0, 0), dtype=np.float32)
+
+    # Load model and tokenizer
+    model, tokenizer = load(model_name)
+
+    # Process chunks in batches, with a progress bar when tqdm is installed
+    all_embeddings = []
+    batch_starts = range(0, len(chunks), batch_size)
+
+    try:
+        from tqdm import tqdm
+        batch_iterator = tqdm(batch_starts, desc="Computing embeddings", unit="batch")
+    except ImportError:
+        batch_iterator = batch_starts
+
+    for i in batch_iterator:
+        batch_chunks = chunks[i:i + batch_size]
+
+        # Tokenize all chunks in the batch
+        batch_token_ids = [tokenizer.encode(chunk) for chunk in batch_chunks]  # type: ignore
+
+        # Right-pad every sequence with token id 0 up to the longest one, and
+        # record which positions hold real tokens (1.0) versus padding (0.0).
+        max_length = max(len(ids) for ids in batch_token_ids)
+        padded_token_ids = []
+        mask_rows = []
+        for token_ids in batch_token_ids:
+            pad = max_length - len(token_ids)
+            padded_token_ids.append(token_ids + [0] * pad)
+            mask_rows.append([1.0] * len(token_ids) + [0.0] * pad)
+
+        # Convert to MLX array with batch dimension
+        input_ids = mx.array(padded_token_ids)
+
+        # Get embeddings for the batch
+        embeddings = model(input_ids)
+
+        # Masked mean pooling: average over real tokens only, so a chunk's
+        # embedding does not depend on how much padding its batch needed.
+        # NOTE(review): assumes the model output is (batch, seq, hidden) - confirm.
+        mask = mx.expand_dims(mx.array(mask_rows), -1)
+        token_counts = mx.maximum(mask.sum(axis=1), 1.0)  # avoid division by zero
+        pooled = (embeddings * mask).sum(axis=1) / token_counts
+
+        # Convert batch embeddings to numpy
+        for j in range(len(batch_chunks)):
+            all_embeddings.append(np.array(pooled[j].tolist(), dtype=np.float32))
+
+    # Stack numpy arrays
+    return np.stack(all_embeddings)
+
+
@dataclass
 class SearchResult:
    id: str
@@ -114,24 +299,25 @@ class PassageManager:
        self.global_offset_map = {}  # Combined map for fast lookup

        for source in passage_sources:
-            assert source["type"] == "jsonl", "only jsonl is supported"
-            passage_file = source["path"]
-            index_file = source["index_path"]  # .idx file
-            if not Path(index_file).exists():
-                raise FileNotFoundError(f"Passage index file not found: {index_file}")
-            with open(index_file, "rb") as f:
-                offset_map = pickle.load(f)
-                self.offset_maps[passage_file] = offset_map
-                self.passage_files[passage_file] = passage_file
+            if source["type"] == "jsonl":
+                passage_file = source["path"]
+                index_file = source["index_path"]
+                if not Path(index_file).exists():
+                    raise FileNotFoundError(
+                        f"Passage index file not found: {index_file}"
+                    )
+                with open(index_file, "rb") as f:
+                    offset_map = pickle.load(f)
+                    self.offset_maps[passage_file] = offset_map
+                    self.passage_files[passage_file] = passage_file

-                # Build global map for O(1) lookup
-                for passage_id, offset in offset_map.items():
-                    self.global_offset_map[passage_id] = (passage_file, offset)
+                    # Build global map for O(1) lookup
+                    for passage_id, offset in offset_map.items():
+                        self.global_offset_map[passage_id] = (passage_file, offset)

    def get_passage(self, passage_id: str) -> Dict[str, Any]:
        if passage_id in self.global_offset_map:
            passage_file, offset = self.global_offset_map[passage_id]
-            # Lazy file opening - only open when needed
            with open(passage_file, "r", encoding="utf-8") as f:
                f.seek(offset)
                return json.loads(f.readline())
@@ -158,12 +344,14 @@ class LeannBuilder:
        self.dimensions = dimensions
        self.embedding_mode = embedding_mode
        self.backend_kwargs = backend_kwargs
+        if 'mlx' in self.embedding_model:
+            self.embedding_mode = "mlx"
        self.chunks: List[Dict[str, Any]] = []

    def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
        if metadata is None:
            metadata = {}
-        passage_id = metadata.get("id", str(len(self.chunks)))
+        passage_id = metadata.get("id", str(uuid.uuid4()))
        chunk_data = {"id": passage_id, "text": text, "metadata": metadata}
        self.chunks.append(chunk_data)

@@ -189,13 +377,10 @@ class LeannBuilder:
        with open(passages_file, "w", encoding="utf-8") as f:
            try:
                from tqdm import tqdm
-
-                chunk_iterator = tqdm(
-                    self.chunks, desc="Writing passages", unit="chunk"
-                )
+                chunk_iterator = tqdm(self.chunks, desc="Writing passages", unit="chunk")
            except ImportError:
                chunk_iterator = self.chunks
-
+            
            for chunk in chunk_iterator:
                offset = f.tell()
                json.dump(
@@ -213,11 +398,7 @@ class LeannBuilder:
            pickle.dump(offset_map, f)
        texts_to_embed = [c["text"] for c in self.chunks]
        embeddings = compute_embeddings(
-            texts_to_embed,
-            self.embedding_model,
-            self.embedding_mode,
-            use_server=False,
-            is_build=True,
+            texts_to_embed, self.embedding_model, self.embedding_mode, use_server=False
        )
        string_ids = [chunk["id"] for chunk in self.chunks]
        current_backend_kwargs = {**self.backend_kwargs, "dimensions": self.dimensions}
@@ -291,7 +472,7 @@ class LeannBuilder:
                f"Dimension mismatch: expected {self.dimensions}, got {embedding_dim}"
            )

-        logger.info(
+        print(
            f"Building index from precomputed embeddings: {len(ids)} items, {embedding_dim} dimensions"
        )

@@ -299,7 +480,7 @@ class LeannBuilder:
        if len(self.chunks) != len(ids):
            # If no text chunks provided, create placeholder text entries
            if not self.chunks:
-                logger.info("No text chunks provided, creating placeholder entries...")
+                print("No text chunks provided, creating placeholder entries...")
                for id_val in ids:
                    self.add_text(
                        f"Document {id_val}",
@@ -374,19 +555,15 @@ class LeannBuilder:
        with open(leann_meta_path, "w", encoding="utf-8") as f:
            json.dump(meta_data, f, indent=2)

-        logger.info(
-            f"Index built successfully from precomputed embeddings: {index_path}"
-        )
+        print(f"Index built successfully from precomputed embeddings: {index_path}")


 class LeannSearcher:
    def __init__(self, index_path: str, enable_warmup: bool = False, **backend_kwargs):
-        self.meta_path_str = f"{index_path}.meta.json"
-        if not Path(self.meta_path_str).exists():
-            raise FileNotFoundError(
-                f"Leann metadata file not found at {self.meta_path_str}"
-            )
-        with open(self.meta_path_str, "r", encoding="utf-8") as f:
+        meta_path_str = f"{index_path}.meta.json"
+        if not Path(meta_path_str).exists():
+            raise FileNotFoundError(f"Leann metadata file not found at {meta_path_str}")
+        with open(meta_path_str, "r", encoding="utf-8") as f:
            self.meta_data = json.load(f)
        backend_name = self.meta_data["backend_name"]
        self.embedding_model = self.meta_data["embedding_model"]
@@ -394,15 +571,16 @@ class LeannSearcher:
        self.embedding_mode = self.meta_data.get(
            "embedding_mode", "sentence-transformers"
        )
+        # Backward compatibility with use_mlx
+        if self.meta_data.get("use_mlx", False):
+            self.embedding_mode = "mlx"
        self.passage_manager = PassageManager(self.meta_data.get("passage_sources", []))
        backend_factory = BACKEND_REGISTRY.get(backend_name)
        if backend_factory is None:
            raise ValueError(f"Backend '{backend_name}' not found.")
        final_kwargs = {**self.meta_data.get("backend_kwargs", {}), **backend_kwargs}
        final_kwargs["enable_warmup"] = enable_warmup
-        self.backend_impl: LeannBackendSearcherInterface = backend_factory.searcher(
-            index_path, **final_kwargs
-        )
+        self.backend_impl = backend_factory.searcher(index_path, **final_kwargs)

    def search(
        self,
@@ -411,39 +589,26 @@ class LeannSearcher:
        complexity: int = 64,
        beam_width: int = 1,
        prune_ratio: float = 0.0,
-        recompute_embeddings: bool = True,
+        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-        expected_zmq_port: int = 5557,
+        zmq_port: int = 5557,
        **kwargs,
    ) -> List[SearchResult]:
-        logger.info("🔍 LeannSearcher.search() called:")
-        logger.info(f"  Query: '{query}'")
-        logger.info(f"  Top_k: {top_k}")
-        logger.info(f"  Additional kwargs: {kwargs}")
+        print("🔍 DEBUG LeannSearcher.search() called:")
+        print(f"  Query: '{query}'")
+        print(f"  Top_k: {top_k}")
+        print(f"  Additional kwargs: {kwargs}")

-        zmq_port = None
-
-        start_time = time.time()
-        if recompute_embeddings:
-            zmq_port = self.backend_impl._ensure_server_running(
-                self.meta_path_str,
-                port=expected_zmq_port,
-                **kwargs,
-            )
-            del expected_zmq_port
-        zmq_time = time.time() - start_time
-        logger.info(f"  Launching server time: {zmq_time} seconds")
+        # Use backend's compute_query_embedding method
+        # This will automatically use embedding server if available and needed
+        import time

        start_time = time.time()

-        query_embedding = self.backend_impl.compute_query_embedding(
-            query,
-            use_server_if_available=recompute_embeddings,
-            zmq_port=zmq_port,
-        )
-        logger.info(f"  Generated embedding shape: {query_embedding.shape}")
+        query_embedding = self.backend_impl.compute_query_embedding(query, zmq_port)
+        print(f"  Generated embedding shape: {query_embedding.shape}")
        embedding_time = time.time() - start_time
-        logger.info(f"  Embedding time: {embedding_time} seconds")
+        print(f"  Embedding time: {embedding_time} seconds")

        start_time = time.time()
        results = self.backend_impl.search(
@@ -458,14 +623,14 @@ class LeannSearcher:
            **kwargs,
        )
        search_time = time.time() - start_time
-        logger.info(f"  Search time: {search_time} seconds")
-        logger.info(
+        print(f"  Search time: {search_time} seconds")
+        print(
            f"  Backend returned: labels={len(results.get('labels', [[]])[0])} results"
        )

        enriched_results = []
        if "labels" in results and "distances" in results:
-            logger.info(f"  Processing {len(results['labels'][0])} passage IDs:")
+            print(f"  Processing {len(results['labels'][0])} passage IDs:")
            for i, (string_id, dist) in enumerate(
                zip(results["labels"][0], results["distances"][0])
            ):
@@ -479,15 +644,15 @@ class LeannSearcher:
                            metadata=passage_data.get("metadata", {}),
                        )
                    )
-                    logger.info(
+                    print(
                        f"    {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text']}..."
                    )
                except KeyError:
-                    logger.error(
+                    print(
                        f"    {i + 1}. passage_id='{string_id}' -> ERROR: Passage not found in PassageManager!"
                    )

-        logger.info(f"  Final enriched results: {len(enriched_results)} passages")
+        print(f"  Final enriched results: {len(enriched_results)} passages")
        return enriched_results


@@ -509,10 +674,10 @@ class LeannChat:
        complexity: int = 64,
        beam_width: int = 1,
        prune_ratio: float = 0.0,
-        recompute_embeddings: bool = True,
+        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
+        zmq_port: int = 5557,
        llm_kwargs: Optional[Dict[str, Any]] = None,
-        expected_zmq_port: int = 5557,
        **search_kwargs,
    ):
        if llm_kwargs is None:
@@ -526,7 +691,7 @@ class LeannChat:
            prune_ratio=prune_ratio,
            recompute_embeddings=recompute_embeddings,
            pruning_strategy=pruning_strategy,
-            expected_zmq_port=expected_zmq_port,
+            zmq_port=zmq_port,
            **search_kwargs,
        )
        context = "\n\n".join([r.text for r in results])
--- a/packages/leann-core/src/leann/chat.py
+++ b/packages/leann-core/src/leann/chat.py
@@ -375,9 +375,8 @@ class OllamaChat(LLMInterface):
            "stream": False,  # Keep it simple for now
            "options": kwargs,
        }
-        logger.debug(f"Sending request to Ollama: {payload}")
+        logger.info(f"Sending request to Ollama: {payload}")
        try:
-            logger.info(f"Sending request to Ollama and waiting for response...")
            response = requests.post(full_url, data=json.dumps(payload))
            response.raise_for_status()

--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1,315 +0,0 @@
-import argparse
-import asyncio
-from pathlib import Path
-
-from llama_index.core import SimpleDirectoryReader
-from llama_index.core.node_parser import SentenceSplitter
-
-from .api import LeannBuilder, LeannSearcher, LeannChat
-
-
-class LeannCLI:
-    def __init__(self):
-        self.indexes_dir = Path.home() / ".leann" / "indexes"
-        self.indexes_dir.mkdir(parents=True, exist_ok=True)
-
-        self.node_parser = SentenceSplitter(
-            chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
-        )
-
-    def get_index_path(self, index_name: str) -> str:
-        index_dir = self.indexes_dir / index_name
-        return str(index_dir / "documents.leann")
-
-    def index_exists(self, index_name: str) -> bool:
-        index_dir = self.indexes_dir / index_name
-        meta_file = index_dir / "documents.leann.meta.json"
-        return meta_file.exists()
-
-    def create_parser(self) -> argparse.ArgumentParser:
-        parser = argparse.ArgumentParser(
-            prog="leann",
-            description="LEANN - Local Enhanced AI Navigation",
-            formatter_class=argparse.RawDescriptionHelpFormatter,
-            epilog="""
-Examples:
-  leann build my-docs --docs ./documents    # Build index named my-docs
-  leann search my-docs "query"             # Search in my-docs index
-  leann ask my-docs "question"             # Ask my-docs index
-  leann list                              # List all stored indexes
-            """,
-        )
-
-        subparsers = parser.add_subparsers(dest="command", help="Available commands")
-
-        # Build command
-        build_parser = subparsers.add_parser("build", help="Build document index")
-        build_parser.add_argument("index_name", help="Index name")
-        build_parser.add_argument(
-            "--docs", type=str, required=True, help="Documents directory"
-        )
-        build_parser.add_argument(
-            "--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
-        )
-        build_parser.add_argument(
-            "--embedding-model", type=str, default="facebook/contriever"
-        )
-        build_parser.add_argument(
-            "--force", "-f", action="store_true", help="Force rebuild"
-        )
-        build_parser.add_argument("--graph-degree", type=int, default=32)
-        build_parser.add_argument("--complexity", type=int, default=64)
-        build_parser.add_argument("--num-threads", type=int, default=1)
-        build_parser.add_argument("--compact", action="store_true", default=True)
-        build_parser.add_argument("--recompute", action="store_true", default=True)
-
-        # Search command
-        search_parser = subparsers.add_parser("search", help="Search documents")
-        search_parser.add_argument("index_name", help="Index name")
-        search_parser.add_argument("query", help="Search query")
-        search_parser.add_argument("--top-k", type=int, default=5)
-        search_parser.add_argument("--complexity", type=int, default=64)
-        search_parser.add_argument("--beam-width", type=int, default=1)
-        search_parser.add_argument("--prune-ratio", type=float, default=0.0)
-        search_parser.add_argument("--recompute-embeddings", action="store_true")
-        search_parser.add_argument(
-            "--pruning-strategy",
-            choices=["global", "local", "proportional"],
-            default="global",
-        )
-
-        # Ask command
-        ask_parser = subparsers.add_parser("ask", help="Ask questions")
-        ask_parser.add_argument("index_name", help="Index name")
-        ask_parser.add_argument(
-            "--llm",
-            type=str,
-            default="ollama",
-            choices=["simulated", "ollama", "hf", "openai"],
-        )
-        ask_parser.add_argument("--model", type=str, default="qwen3:8b")
-        ask_parser.add_argument("--host", type=str, default="http://localhost:11434")
-        ask_parser.add_argument("--interactive", "-i", action="store_true")
-        ask_parser.add_argument("--top-k", type=int, default=20)
-        ask_parser.add_argument("--complexity", type=int, default=32)
-        ask_parser.add_argument("--beam-width", type=int, default=1)
-        ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
-        ask_parser.add_argument("--recompute-embeddings", action="store_true")
-        ask_parser.add_argument(
-            "--pruning-strategy",
-            choices=["global", "local", "proportional"],
-            default="global",
-        )
-
-        # List command
-        list_parser = subparsers.add_parser("list", help="List all indexes")
-
-        return parser
-
-    def list_indexes(self):
-        print("Stored LEANN indexes:")
-
-        if not self.indexes_dir.exists():
-            print(
-                "No indexes found. Use 'leann build <name> --docs <dir>' to create one."
-            )
-            return
-
-        index_dirs = [d for d in self.indexes_dir.iterdir() if d.is_dir()]
-
-        if not index_dirs:
-            print(
-                "No indexes found. Use 'leann build <name> --docs <dir>' to create one."
-            )
-            return
-
-        print(f"Found {len(index_dirs)} indexes:")
-        for i, index_dir in enumerate(index_dirs, 1):
-            index_name = index_dir.name
-            status = "✓" if self.index_exists(index_name) else "✗"
-
-            print(f"  {i}. {index_name} [{status}]")
-            if self.index_exists(index_name):
-                meta_file = index_dir / "documents.leann.meta.json"
-                size_mb = sum(
-                    f.stat().st_size for f in index_dir.iterdir() if f.is_file()
-                ) / (1024 * 1024)
-                print(f"     Size: {size_mb:.1f} MB")
-
-        if index_dirs:
-            example_name = index_dirs[0].name
-            print(f"\nUsage:")
-            print(f'  leann search {example_name} "your query"')
-            print(f"  leann ask {example_name} --interactive")
-
-    def load_documents(self, docs_dir: str):
-        print(f"Loading documents from {docs_dir}...")
-
-        documents = SimpleDirectoryReader(
-            docs_dir,
-            recursive=True,
-            encoding="utf-8",
-            required_exts=[".pdf", ".txt", ".md", ".docx"],
-        ).load_data(show_progress=True)
-
-        all_texts = []
-        for doc in documents:
-            nodes = self.node_parser.get_nodes_from_documents([doc])
-            for node in nodes:
-                all_texts.append(node.get_content())
-
-        print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
-        return all_texts
-
-    async def build_index(self, args):
-        docs_dir = args.docs
-        index_name = args.index_name
-        index_dir = self.indexes_dir / index_name
-        index_path = self.get_index_path(index_name)
-
-        if index_dir.exists() and not args.force:
-            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
-            return
-
-        all_texts = self.load_documents(docs_dir)
-        if not all_texts:
-            print("No documents found")
-            return
-
-        index_dir.mkdir(parents=True, exist_ok=True)
-
-        print(f"Building index '{index_name}' with {args.backend} backend...")
-
-        builder = LeannBuilder(
-            backend_name=args.backend,
-            embedding_model=args.embedding_model,
-            graph_degree=args.graph_degree,
-            complexity=args.complexity,
-            is_compact=args.compact,
-            is_recompute=args.recompute,
-            num_threads=args.num_threads,
-        )
-
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
-
-        builder.build_index(index_path)
-        print(f"Index built at {index_path}")
-
-    async def search_documents(self, args):
-        index_name = args.index_name
-        query = args.query
-        index_path = self.get_index_path(index_name)
-
-        if not self.index_exists(index_name):
-            print(
-                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
-            )
-            return
-
-        searcher = LeannSearcher(index_path=index_path)
-        results = searcher.search(
-            query,
-            top_k=args.top_k,
-            complexity=args.complexity,
-            beam_width=args.beam_width,
-            prune_ratio=args.prune_ratio,
-            recompute_embeddings=args.recompute_embeddings,
-            pruning_strategy=args.pruning_strategy,
-        )
-
-        print(f"Search results for '{query}' (top {len(results)}):")
-        for i, result in enumerate(results, 1):
-            print(f"{i}. Score: {result.score:.3f}")
-            print(f"   {result.text[:200]}...")
-            print()
-
-    async def ask_questions(self, args):
-        index_name = args.index_name
-        index_path = self.get_index_path(index_name)
-
-        if not self.index_exists(index_name):
-            print(
-                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
-            )
-            return
-
-        print(f"Starting chat with index '{index_name}'...")
-        print(f"Using {args.model} ({args.llm})")
-
-        llm_config = {"type": args.llm, "model": args.model}
-        if args.llm == "ollama":
-            llm_config["host"] = args.host
-
-        chat = LeannChat(index_path=index_path, llm_config=llm_config)
-
-        if args.interactive:
-            print("LEANN Assistant ready! Type 'quit' to exit")
-            print("=" * 40)
-
-            while True:
-                user_input = input("\nYou: ").strip()
-                if user_input.lower() in ["quit", "exit", "q"]:
-                    print("Goodbye!")
-                    break
-
-                if not user_input:
-                    continue
-
-                response = chat.ask(
-                    user_input,
-                    top_k=args.top_k,
-                    complexity=args.complexity,
-                    beam_width=args.beam_width,
-                    prune_ratio=args.prune_ratio,
-                    recompute_embeddings=args.recompute_embeddings,
-                    pruning_strategy=args.pruning_strategy,
-                )
-                print(f"LEANN: {response}")
-        else:
-            query = input("Enter your question: ").strip()
-            if query:
-                response = chat.ask(
-                    query,
-                    top_k=args.top_k,
-                    complexity=args.complexity,
-                    beam_width=args.beam_width,
-                    prune_ratio=args.prune_ratio,
-                    recompute_embeddings=args.recompute_embeddings,
-                    pruning_strategy=args.pruning_strategy,
-                )
-                print(f"LEANN: {response}")
-
-    async def run(self, args=None):
-        parser = self.create_parser()
-
-        if args is None:
-            args = parser.parse_args()
-
-        if not args.command:
-            parser.print_help()
-            return
-
-        if args.command == "list":
-            self.list_indexes()
-        elif args.command == "build":
-            await self.build_index(args)
-        elif args.command == "search":
-            await self.search_documents(args)
-        elif args.command == "ask":
-            await self.ask_questions(args)
-        else:
-            parser.print_help()
-
-
-def main():
-    import dotenv
-
-    dotenv.load_dotenv()
-
-    cli = LeannCLI()
-    asyncio.run(cli.run())
-
-
-if __name__ == "__main__":
-    main()
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -1,375 +0,0 @@
-"""
-Unified embedding computation module
-Consolidates all embedding computation logic using SentenceTransformer
-Preserves all optimization parameters to ensure performance
-"""
-
-import numpy as np
-import torch
-from typing import List, Dict, Any
-import logging
-import os
-
-# Set up logger with proper level
-logger = logging.getLogger(__name__)
-LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
-log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
-logger.setLevel(log_level)
-
-# Global model cache to avoid repeated loading
-_model_cache: Dict[str, Any] = {}
-
-
-def compute_embeddings(
-    texts: List[str],
-    model_name: str,
-    mode: str = "sentence-transformers",
-    is_build: bool = False,
-    batch_size: int = 32,
-    adaptive_optimization: bool = True,
-) -> np.ndarray:
-    """
-    Unified embedding computation entry point
-
-    Args:
-        texts: List of texts to compute embeddings for
-        model_name: Model name
-        mode: Computation mode ('sentence-transformers', 'openai', 'mlx')
-        is_build: Whether this is a build operation (shows progress bar)
-        batch_size: Batch size for processing
-        adaptive_optimization: Whether to use adaptive optimization based on batch size
-
-    Returns:
-        Normalized embeddings array, shape: (len(texts), embedding_dim)
-    """
-    if mode == "sentence-transformers":
-        return compute_embeddings_sentence_transformers(
-            texts,
-            model_name,
-            is_build=is_build,
-            batch_size=batch_size,
-            adaptive_optimization=adaptive_optimization,
-        )
-    elif mode == "openai":
-        return compute_embeddings_openai(texts, model_name)
-    elif mode == "mlx":
-        return compute_embeddings_mlx(texts, model_name)
-    else:
-        raise ValueError(f"Unsupported embedding mode: {mode}")
-
-
-def compute_embeddings_sentence_transformers(
-    texts: List[str],
-    model_name: str,
-    use_fp16: bool = True,
-    device: str = "auto",
-    batch_size: int = 32,
-    is_build: bool = False,
-    adaptive_optimization: bool = True,
-) -> np.ndarray:
-    """
-    Compute embeddings using SentenceTransformer with model caching and adaptive optimization
-
-    Args:
-        texts: List of texts to compute embeddings for
-        model_name: Model name
-        use_fp16: Whether to use FP16 precision
-        device: Device to use ('auto', 'cuda', 'mps', 'cpu')
-        batch_size: Batch size for processing
-        is_build: Whether this is a build operation (shows progress bar)
-        adaptive_optimization: Whether to use adaptive optimization based on batch size
-    """
-    # Handle empty input
-    if not texts:
-        raise ValueError("Cannot compute embeddings for empty text list")
-    logger.info(
-        f"Computing embeddings for {len(texts)} texts using SentenceTransformer, model: '{model_name}'"
-    )
-
-    # Auto-detect device
-    if device == "auto":
-        if torch.cuda.is_available():
-            device = "cuda"
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            device = "mps"
-        else:
-            device = "cpu"
-
-    # Apply optimizations based on benchmark results
-    if adaptive_optimization:
-        # Use optimal batch_size constants for different devices based on benchmark results
-        if device == "mps":
-            batch_size = 128  # MPS optimal batch size from benchmark
-            if model_name == "Qwen/Qwen3-Embedding-0.6B":
-                batch_size = 64
-        elif device == "cuda":
-            batch_size = 256  # CUDA optimal batch size
-        # Keep original batch_size for CPU
-
-    # Create cache key
-    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}_optimized"
-
-    # Check if model is already cached
-    if cache_key in _model_cache:
-        logger.info(f"Using cached optimized model: {model_name}")
-        model = _model_cache[cache_key]
-    else:
-        logger.info(
-            f"Loading and caching optimized SentenceTransformer model: {model_name}"
-        )
-        from sentence_transformers import SentenceTransformer
-
-        logger.info(f"Using device: {device}")
-
-        # Apply hardware optimizations
-        if device == "cuda":
-            # TODO: Haven't tested this yet
-            torch.backends.cuda.matmul.allow_tf32 = True
-            torch.backends.cudnn.allow_tf32 = True
-            torch.backends.cudnn.benchmark = True
-            torch.backends.cudnn.deterministic = False
-            torch.cuda.set_per_process_memory_fraction(0.9)
-        elif device == "mps":
-            try:
-                if hasattr(torch.mps, "set_per_process_memory_fraction"):
-                    torch.mps.set_per_process_memory_fraction(0.9)
-            except AttributeError:
-                logger.warning(
-                    "Some MPS optimizations not available in this PyTorch version"
-                )
-        elif device == "cpu":
-            # TODO: Haven't tested this yet
-            torch.set_num_threads(min(8, os.cpu_count() or 4))
-            try:
-                torch.backends.mkldnn.enabled = True
-            except AttributeError:
-                pass
-
-        # Prepare optimized model and tokenizer parameters
-        model_kwargs = {
-            "torch_dtype": torch.float16 if use_fp16 else torch.float32,
-            "low_cpu_mem_usage": True,
-            "_fast_init": True,
-            "attn_implementation": "eager",  # Use eager attention for speed
-        }
-
-        tokenizer_kwargs = {
-            "use_fast": True,
-            "padding": True,
-            "truncation": True,
-        }
-
-        try:
-            # Try local loading first
-            model_kwargs["local_files_only"] = True
-            tokenizer_kwargs["local_files_only"] = True
-
-            model = SentenceTransformer(
-                model_name,
-                device=device,
-                model_kwargs=model_kwargs,
-                tokenizer_kwargs=tokenizer_kwargs,
-                local_files_only=True,
-            )
-            logger.info("Model loaded successfully! (local + optimized)")
-        except Exception as e:
-            logger.warning(f"Local loading failed ({e}), trying network download...")
-            # Fallback to network loading
-            model_kwargs["local_files_only"] = False
-            tokenizer_kwargs["local_files_only"] = False
-
-            model = SentenceTransformer(
-                model_name,
-                device=device,
-                model_kwargs=model_kwargs,
-                tokenizer_kwargs=tokenizer_kwargs,
-                local_files_only=False,
-            )
-            logger.info("Model loaded successfully! (network + optimized)")
-
-        # Apply additional optimizations based on mode
-        if use_fp16 and device in ["cuda", "mps"]:
-            try:
-                model = model.half()
-                logger.info(f"Applied FP16 precision: {model_name}")
-            except Exception as e:
-                logger.warning(f"FP16 optimization failed: {e}")
-
-        # Apply torch.compile optimization
-        if device in ["cuda", "mps"]:
-            try:
-                model = torch.compile(model, mode="reduce-overhead", dynamic=True)
-                logger.info(f"Applied torch.compile optimization: {model_name}")
-            except Exception as e:
-                logger.warning(f"torch.compile optimization failed: {e}")
-
-        # Set model to eval mode and disable gradients for inference
-        model.eval()
-        for param in model.parameters():
-            param.requires_grad_(False)
-
-        # Cache the model
-        _model_cache[cache_key] = model
-        logger.info(f"Model cached: {cache_key}")
-
-    # Compute embeddings with optimized inference mode
-    logger.info(f"Starting embedding computation... (batch_size: {batch_size})")
-
-    # Use torch.inference_mode for optimal performance
-    with torch.inference_mode():
-        embeddings = model.encode(
-            texts,
-            batch_size=batch_size,
-            show_progress_bar=is_build,  # Don't show progress bar in server environment
-            convert_to_numpy=True,
-            normalize_embeddings=False,
-            device=device,
-        )
-
-    logger.info(
-        f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
-    )
-
-    # Validate results
-    if np.isnan(embeddings).any() or np.isinf(embeddings).any():
-        raise RuntimeError(
-            f"Detected NaN or Inf values in embeddings, model: {model_name}"
-        )
-
-    return embeddings
-
-
-def compute_embeddings_openai(texts: List[str], model_name: str) -> np.ndarray:
-    # TODO: @yichuan-w add progress bar only in build mode
-    """Compute embeddings using OpenAI API"""
-    try:
-        import openai
-        import os
-    except ImportError as e:
-        raise ImportError(f"OpenAI package not installed: {e}")
-
-    api_key = os.getenv("OPENAI_API_KEY")
-    if not api_key:
-        raise RuntimeError("OPENAI_API_KEY environment variable not set")
-
-    # Cache OpenAI client
-    cache_key = "openai_client"
-    if cache_key in _model_cache:
-        client = _model_cache[cache_key]
-    else:
-        client = openai.OpenAI(api_key=api_key)
-        _model_cache[cache_key] = client
-        logger.info("OpenAI client cached")
-
-    logger.info(
-        f"Computing embeddings for {len(texts)} texts using OpenAI API, model: '{model_name}'"
-    )
-
-    # OpenAI has limits on batch size and input length
-    max_batch_size = 100  # Conservative batch size
-    all_embeddings = []
-
-    try:
-        from tqdm import tqdm
-
-        total_batches = (len(texts) + max_batch_size - 1) // max_batch_size
-        batch_range = range(0, len(texts), max_batch_size)
-        batch_iterator = tqdm(
-            batch_range, desc="Computing embeddings", unit="batch", total=total_batches
-        )
-    except ImportError:
-        # Fallback when tqdm is not available
-        batch_iterator = range(0, len(texts), max_batch_size)
-
-    for i in batch_iterator:
-        batch_texts = texts[i : i + max_batch_size]
-
-        try:
-            response = client.embeddings.create(model=model_name, input=batch_texts)
-            batch_embeddings = [embedding.embedding for embedding in response.data]
-            all_embeddings.extend(batch_embeddings)
-        except Exception as e:
-            logger.error(f"Batch {i} failed: {e}")
-            raise
-
-    embeddings = np.array(all_embeddings, dtype=np.float32)
-    logger.info(
-        f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
-    )
-    return embeddings
-
-
-def compute_embeddings_mlx(
-    chunks: List[str], model_name: str, batch_size: int = 16
-) -> np.ndarray:
-    # TODO: @yichuan-w add progress bar only in build mode
-    """Computes embeddings using an MLX model."""
-    try:
-        import mlx.core as mx
-        from mlx_lm.utils import load
-    except ImportError as e:
-        raise RuntimeError(
-            "MLX or related libraries not available. Install with: uv pip install mlx mlx-lm"
-        ) from e
-
-    logger.info(
-        f"Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}' with batch_size={batch_size}..."
-    )
-
-    # Cache MLX model and tokenizer
-    cache_key = f"mlx_{model_name}"
-    if cache_key in _model_cache:
-        logger.info(f"Using cached MLX model: {model_name}")
-        model, tokenizer = _model_cache[cache_key]
-    else:
-        logger.info(f"Loading and caching MLX model: {model_name}")
-        model, tokenizer = load(model_name)
-        _model_cache[cache_key] = (model, tokenizer)
-        logger.info(f"MLX model cached: {cache_key}")
-
-    # Process chunks in batches with progress bar
-    all_embeddings = []
-
-    try:
-        from tqdm import tqdm
-
-        batch_iterator = tqdm(
-            range(0, len(chunks), batch_size), desc="Computing embeddings", unit="batch"
-        )
-    except ImportError:
-        batch_iterator = range(0, len(chunks), batch_size)
-
-    for i in batch_iterator:
-        batch_chunks = chunks[i : i + batch_size]
-
-        # Tokenize all chunks in the batch
-        batch_token_ids = []
-        for chunk in batch_chunks:
-            token_ids = tokenizer.encode(chunk)  # type: ignore
-            batch_token_ids.append(token_ids)
-
-        # Pad sequences to the same length for batch processing
-        max_length = max(len(ids) for ids in batch_token_ids)
-        padded_token_ids = []
-        for token_ids in batch_token_ids:
-            # Pad with tokenizer.pad_token_id or 0
-            padded = token_ids + [0] * (max_length - len(token_ids))
-            padded_token_ids.append(padded)
-
-        # Convert to MLX array with batch dimension
-        input_ids = mx.array(padded_token_ids)
-
-        # Get embeddings for the batch
-        embeddings = model(input_ids)
-
-        # Mean pooling for each sequence in the batch
-        pooled = embeddings.mean(axis=1)  # Shape: (batch_size, hidden_size)
-
-        # Convert batch embeddings to numpy
-        for j in range(len(batch_chunks)):
-            pooled_list = pooled[j].tolist()  # Convert to list
-            pooled_numpy = np.array(pooled_list, dtype=np.float32)
-            all_embeddings.append(pooled_numpy)
-
-    # Stack numpy arrays
-    return np.stack(all_embeddings)
--- a/packages/leann-core/src/leann/embedding_server_manager.py
+++ b/packages/leann-core/src/leann/embedding_server_manager.py
@@ -1,21 +1,14 @@
+import threading
 import time
 import atexit
 import socket
 import subprocess
 import sys
-import os
-import logging
+import zmq
+import msgpack
 from pathlib import Path
 from typing import Optional
-import psutil
-
-# Set up logging based on environment variable
-LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
-logging.basicConfig(
-    level=getattr(logging, LOG_LEVEL, logging.INFO),
-    format="%(levelname)s - %(name)s - %(message)s",
-)
-logger = logging.getLogger(__name__)
+import select


 def _check_port(port: int) -> bool:
@@ -24,135 +17,151 @@ def _check_port(port: int) -> bool:
        return s.connect_ex(("localhost", port)) == 0


-def _check_process_matches_config(
-    port: int, expected_model: str, expected_passages_file: str
-) -> bool:
+def _check_server_meta_path(port: int, expected_meta_path: str) -> bool:
    """
-    Check if the process using the port matches our expected model and passages file.
-    Returns True if matches, False otherwise.
+    Check if the existing server on the port is using the correct meta file.
+    Returns True if the server has the right meta path, False otherwise.
    """
    try:
-        for proc in psutil.process_iter(["pid", "cmdline"]):
-            if not _is_process_listening_on_port(proc, port):
-                continue
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        socket.setsockopt(zmq.RCVTIMEO, 3000)  # 3 second timeout
+        socket.connect(f"tcp://localhost:{port}")

-            cmdline = proc.info["cmdline"]
-            if not cmdline:
-                continue
+        # Send a special control message to query the server's meta path
+        control_request = ["__QUERY_META_PATH__"]
+        request_bytes = msgpack.packb(control_request)
+        socket.send(request_bytes)

-            return _check_cmdline_matches_config(
-                cmdline, port, expected_model, expected_passages_file
-            )
+        # Wait for response
+        response_bytes = socket.recv()
+        response = msgpack.unpackb(response_bytes)
+
+        socket.close()
+        context.term()
+
+        # Check if the response contains the meta path and if it matches
+        if isinstance(response, list) and len(response) > 0:
+            server_meta_path = response[0]
+            # Normalize paths for comparison
+            expected_path = Path(expected_meta_path).resolve()
+            server_path = Path(server_meta_path).resolve() if server_meta_path else None
+            return server_path == expected_path

-        logger.debug(f"No process found listening on port {port}")
        return False

    except Exception as e:
-        logger.warning(f"Could not check process on port {port}: {e}")
+        print(f"WARNING: Could not query server meta path on port {port}: {e}")
        return False


-def _is_process_listening_on_port(proc, port: int) -> bool:
-    """Check if a process is listening on the given port."""
+def _update_server_meta_path(port: int, new_meta_path: str) -> bool:
+    """
+    Send a control message to update the server's meta path.
+    Returns True if successful, False otherwise.
+    """
    try:
-        connections = proc.net_connections()
-        for conn in connections:
-            if conn.laddr.port == port and conn.status == psutil.CONN_LISTEN:
-                return True
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        socket.setsockopt(zmq.RCVTIMEO, 5000)  # 5 second timeout
+        socket.connect(f"tcp://localhost:{port}")
+
+        # Send a control message to update the meta path
+        control_request = ["__UPDATE_META_PATH__", new_meta_path]
+        request_bytes = msgpack.packb(control_request)
+        socket.send(request_bytes)
+
+        # Wait for response
+        response_bytes = socket.recv()
+        response = msgpack.unpackb(response_bytes)
+
+        socket.close()
+        context.term()
+
+        # Check if the update was successful
+        if isinstance(response, list) and len(response) > 0:
+            return response[0] == "SUCCESS"
+
        return False
-    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+
+    except Exception as e:
+        print(f"ERROR: Could not update server meta path on port {port}: {e}")
        return False


-def _check_cmdline_matches_config(
-    cmdline: list, port: int, expected_model: str, expected_passages_file: str
-) -> bool:
-    """Check if command line matches our expected configuration."""
-    cmdline_str = " ".join(cmdline)
-    logger.debug(f"Found process on port {port}: {cmdline_str}")
-
-    # Check if it's our embedding server
-    is_embedding_server = any(
-        server_type in cmdline_str
-        for server_type in [
-            "embedding_server",
-            "leann_backend_diskann.embedding_server",
-            "leann_backend_hnsw.hnsw_embedding_server",
-        ]
-    )
-
-    if not is_embedding_server:
-        logger.debug(f"Process on port {port} is not our embedding server")
-        return False
-
-    # Check model name
-    model_matches = _check_model_in_cmdline(cmdline, expected_model)
-
-    # Check passages file if provided
-    passages_matches = _check_passages_in_cmdline(cmdline, expected_passages_file)
-
-    result = model_matches and passages_matches
-    logger.debug(
-        f"model_matches: {model_matches}, passages_matches: {passages_matches}, overall: {result}"
-    )
-    return result
-
-
-def _check_model_in_cmdline(cmdline: list, expected_model: str) -> bool:
-    """Check if the command line contains the expected model."""
-    if "--model-name" not in cmdline:
-        return False
-
-    model_idx = cmdline.index("--model-name")
-    if model_idx + 1 >= len(cmdline):
-        return False
-
-    actual_model = cmdline[model_idx + 1]
-    return actual_model == expected_model
-
-
-def _check_passages_in_cmdline(cmdline: list, expected_passages_file: str) -> bool:
-    """Check if the command line contains the expected passages file."""
-    if "--passages-file" not in cmdline:
-        return False  # Expected but not found
-
-    passages_idx = cmdline.index("--passages-file")
-    if passages_idx + 1 >= len(cmdline):
-        return False
-
-    actual_passages = cmdline[passages_idx + 1]
-    expected_path = Path(expected_passages_file).resolve()
-    actual_path = Path(actual_passages).resolve()
-    return actual_path == expected_path
-
-
-def _find_compatible_port_or_next_available(
-    start_port: int, model_name: str, passages_file: str, max_attempts: int = 100
-) -> tuple[int, bool]:
+def _check_server_model(port: int, expected_model: str) -> bool:
    """
-    Find a port that either has a compatible server or is available.
-    Returns (port, is_compatible) where is_compatible indicates if we found a matching server.
+    Check if the existing server on the port is using the correct embedding model.
+    Returns True if the server has the right model, False otherwise.
    """
-    for port in range(start_port, start_port + max_attempts):
-        if not _check_port(port):
-            # Port is available
-            return port, False
+    try:
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        socket.setsockopt(zmq.RCVTIMEO, 3000)  # 3 second timeout
+        socket.connect(f"tcp://localhost:{port}")

-        # Port is in use, check if it's compatible
-        if _check_process_matches_config(port, model_name, passages_file):
-            logger.info(f"Found compatible server on port {port}")
-            return port, True
-        else:
-            logger.info(f"Port {port} has incompatible server, trying next port...")
+        # Send a special control message to query the server's model
+        control_request = ["__QUERY_MODEL__"]
+        request_bytes = msgpack.packb(control_request)
+        socket.send(request_bytes)

-    raise RuntimeError(
-        f"Could not find compatible or available port in range {start_port}-{start_port + max_attempts}"
-    )
+        # Wait for response
+        response_bytes = socket.recv()
+        response = msgpack.unpackb(response_bytes)
+
+        socket.close()
+        context.term()
+
+        # Check if the response contains the model name and if it matches
+        if isinstance(response, list) and len(response) > 0:
+            server_model = response[0]
+            return server_model == expected_model
+
+        return False
+
+    except Exception as e:
+        print(f"WARNING: Could not query server model on port {port}: {e}")
+        return False
+
+
+def _update_server_model(port: int, new_model: str) -> bool:
+    """
+    Send a control message to update the server's embedding model.
+    Returns True if successful, False otherwise.
+    """
+    try:
+        context = zmq.Context()
+        socket = context.socket(zmq.REQ)
+        socket.setsockopt(zmq.RCVTIMEO, 30000)  # 30 second timeout for model loading
+        socket.setsockopt(zmq.SNDTIMEO, 5000)  # 5 second timeout for sending
+        socket.connect(f"tcp://localhost:{port}")
+
+        # Send a control message to update the model
+        control_request = ["__UPDATE_MODEL__", new_model]
+        request_bytes = msgpack.packb(control_request)
+        socket.send(request_bytes)
+
+        # Wait for response
+        response_bytes = socket.recv()
+        response = msgpack.unpackb(response_bytes)
+
+        socket.close()
+        context.term()
+
+        # Check if the update was successful
+        if isinstance(response, list) and len(response) > 0:
+            return response[0] == "SUCCESS"
+
+        return False
+
+    except Exception as e:
+        print(f"ERROR: Could not update server model on port {port}: {e}")
+        return False


 class EmbeddingServerManager:
    """
-    A simplified manager for embedding server processes that avoids complex update mechanisms.
+    A generic manager for handling the lifecycle of a backend-specific embedding server process.
    """

    def __init__(self, backend_module_name: str):
@@ -166,183 +175,246 @@ class EmbeddingServerManager:
        self.backend_module_name = backend_module_name
        self.server_process: Optional[subprocess.Popen] = None
        self.server_port: Optional[int] = None
-        self._atexit_registered = False
+        atexit.register(self.stop_server)

-    def start_server(
-        self,
-        port: int,
-        model_name: str,
-        embedding_mode: str = "sentence-transformers",
-        **kwargs,
-    ) -> tuple[bool, int]:
+    def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool:
        """
        Starts the embedding server process.

        Args:
-            port (int): The preferred ZMQ port for the server.
+            port (int): The ZMQ port for the server.
            model_name (str): The name of the embedding model to use.
-            **kwargs: Additional arguments for the server.
+            **kwargs: Additional arguments for the server (e.g., passages_file, distance_metric, enable_warmup).

        Returns:
-            tuple[bool, int]: (success, actual_port_used)
+            bool: True if the server is started successfully or already running, False otherwise.
        """
-        passages_file = kwargs.get("passages_file")
-        assert isinstance(passages_file, str), "passages_file must be a string"
+        if self.server_process and self.server_process.poll() is None:
+            # Even if we have a running process, check if model/meta path match
+            if self.server_port is not None:
+                port_in_use = _check_port(self.server_port)
+                if port_in_use:
+                    print(
+                        f"INFO: Checking compatibility of existing server process (PID {self.server_process.pid})"
+                    )

-        # Check if we have a compatible running server
-        if self._has_compatible_running_server(model_name, passages_file):
-            assert self.server_port is not None, (
-                "a compatible running server should set server_port"
-            )
-            return True, self.server_port
+                    # Check model compatibility
+                    model_matches = _check_server_model(self.server_port, model_name)
+                    if model_matches:
+                        print(
+                            f"✅ Existing server already using correct model: {model_name}"
+                        )
+                        
+                        # Still check meta path if provided
+                        passages_file = kwargs.get("passages_file")
+                        if passages_file and str(passages_file).endswith(
+                            ".meta.json"
+                        ):
+                            meta_matches = _check_server_meta_path(
+                                self.server_port, str(passages_file)
+                            )
+                            if not meta_matches:
+                                print(f"⚠️  Updating meta path to: {passages_file}")
+                                _update_server_meta_path(
+                                    self.server_port, str(passages_file)
+                                )
+                        
+                        return True
+                    else:
+                        print(
+                            f"⚠️  Existing server has different model. Attempting to update to: {model_name}"
+                        )
+                        if not _update_server_model(self.server_port, model_name):
+                            print(
+                                "❌ Failed to update existing server model. Restarting server..."
+                            )
+                            self.stop_server()
+                            # Continue to start new server below
+                        else:
+                            print(
+                                f"✅ Successfully updated existing server model to: {model_name}"
+                            )

-        # Find available port (compatible or free)
-        try:
-            actual_port, is_compatible = _find_compatible_port_or_next_available(
-                port, model_name, passages_file
-            )
-        except RuntimeError as e:
-            logger.error(str(e))
-            return False, port
+                            # Also check meta path if provided
+                            passages_file = kwargs.get("passages_file")
+                            if passages_file and str(passages_file).endswith(
+                                ".meta.json"
+                            ):
+                                meta_matches = _check_server_meta_path(
+                                    self.server_port, str(passages_file)
+                                )
+                                if not meta_matches:
+                                    print(f"⚠️  Updating meta path to: {passages_file}")
+                                    _update_server_meta_path(
+                                        self.server_port, str(passages_file)
+                                    )

-        if is_compatible:
-            logger.info(f"Using existing compatible server on port {actual_port}")
-            self.server_port = actual_port
-            self.server_process = None  # We don't own this process
-            return True, actual_port
+                            return True
+                else:
+                    # Server process exists but port not responding - restart
+                    print("⚠️  Server process exists but not responding. Restarting...")
+                    self.stop_server()
+                    # Continue to start new server below
+            else:
+                # No port stored - restart
+                print("⚠️  No port information stored. Restarting server...")
+                self.stop_server()
+                # Continue to start new server below

-        if actual_port != port:
-            logger.info(f"Using port {actual_port} instead of {port}")
+        if _check_port(port):
+            # Port is in use, check if it's using the correct meta file and model
+            passages_file = kwargs.get("passages_file")

-        # Start new server
-        return self._start_new_server(actual_port, model_name, embedding_mode, **kwargs)
+            print(f"INFO: Port {port} is in use. Checking server compatibility...")

-    def _has_compatible_running_server(
-        self, model_name: str, passages_file: str
-    ) -> bool:
-        """Check if we have a compatible running server."""
-        if not (
-            self.server_process
-            and self.server_process.poll() is None
-            and self.server_port
-        ):
-            return False
+            # Check model compatibility first
+            model_matches = _check_server_model(port, model_name)
+            if model_matches:
+                print(
+                    f"✅ Existing server on port {port} is using correct model: {model_name}"
+                )
+            else:
+                print(
+                    f"⚠️  Existing server on port {port} has different model. Attempting to update to: {model_name}"
+                )
+                if not _update_server_model(port, model_name):
+                    raise RuntimeError(
+                        f"❌ Failed to update server model to {model_name}. Consider using a different port."
+                    )
+                print(f"✅ Successfully updated server model to: {model_name}")

-        if _check_process_matches_config(self.server_port, model_name, passages_file):
-            logger.info(
-                f"Existing server process (PID {self.server_process.pid}) is compatible"
-            )
+            # Check meta path compatibility if provided
+            if passages_file and str(passages_file).endswith(".meta.json"):
+                meta_matches = _check_server_meta_path(port, str(passages_file))
+                if not meta_matches:
+                    print(
+                        f"⚠️  Existing server on port {port} has different meta path. Attempting to update..."
+                    )
+                    if not _update_server_meta_path(port, str(passages_file)):
+                        raise RuntimeError(
+                            "❌ Failed to update server meta path. This may cause data synchronization issues."
+                        )
+                    print(
+                        f"✅ Successfully updated server meta path to: {passages_file}"
+                    )
+                else:
+                    print(
+                        f"✅ Existing server on port {port} is using correct meta path: {passages_file}"
+                    )
+
+            print(f"✅ Server on port {port} is compatible and ready to use.")
            return True

-        logger.info(
-            "Existing server process is incompatible. Should start a new server."
+        print(
+            f"INFO: Starting session-level embedding server for '{self.backend_module_name}'..."
        )
-        return False
-
-    def _start_new_server(
-        self, port: int, model_name: str, embedding_mode: str, **kwargs
-    ) -> tuple[bool, int]:
-        """Start a new embedding server on the given port."""
-        logger.info(f"Starting embedding server on port {port}...")
-
-        command = self._build_server_command(port, model_name, embedding_mode, **kwargs)

        try:
-            self._launch_server_process(command, port)
-            return self._wait_for_server_ready(port)
+            command = [
+                sys.executable,
+                "-m",
+                self.backend_module_name,
+                "--zmq-port",
+                str(port),
+                "--model-name",
+                model_name,
+            ]
+
+            # Add extra arguments for specific backends
+            if "passages_file" in kwargs and kwargs["passages_file"]:
+                command.extend(["--passages-file", str(kwargs["passages_file"])])
+            # if "distance_metric" in kwargs and kwargs["distance_metric"]:
+            #     command.extend(["--distance-metric", kwargs["distance_metric"]])
+            if embedding_mode != "sentence-transformers":
+                command.extend(["--embedding-mode", embedding_mode])
+            if "enable_warmup" in kwargs and not kwargs["enable_warmup"]:
+                command.extend(["--disable-warmup"])
+
+            project_root = Path(__file__).parent.parent.parent.parent.parent
+            print(f"INFO: Running command from project root: {project_root}")
+            print(f"INFO: Command: {' '.join(command)}")  # Debug: show actual command
+
+            self.server_process = subprocess.Popen(
+                command,
+                cwd=project_root,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,  # Merge stderr into stdout for easier monitoring
+                text=True,
+                encoding="utf-8",
+                bufsize=1,  # Line buffered
+                universal_newlines=True,
+            )
+            self.server_port = port
+            print(f"INFO: Server process started with PID: {self.server_process.pid}")
+
+            max_wait, wait_interval = 120, 0.5
+            for _ in range(int(max_wait / wait_interval)):
+                if _check_port(port):
+                    print("✅ Embedding server is up and ready for this session.")
+                    log_thread = threading.Thread(target=self._log_monitor, daemon=True)
+                    log_thread.start()
+                    return True
+                if self.server_process.poll() is not None:
+                    print(
+                        "❌ ERROR: Server process terminated unexpectedly during startup."
+                    )
+                    self._print_recent_output()
+                    return False
+                time.sleep(wait_interval)
+
+            print(
+                f"❌ ERROR: Server process failed to start listening within {max_wait} seconds."
+            )
+            self.stop_server()
+            return False
+
        except Exception as e:
-            logger.error(f"Failed to start embedding server: {e}")
-            return False, port
+            print(f"❌ ERROR: Failed to start embedding server process: {e}")
+            return False

-    def _build_server_command(
-        self, port: int, model_name: str, embedding_mode: str, **kwargs
-    ) -> list:
-        """Build the command to start the embedding server."""
-        command = [
-            sys.executable,
-            "-m",
-            self.backend_module_name,
-            "--zmq-port",
-            str(port),
-            "--model-name",
-            model_name,
-        ]
+    def _print_recent_output(self):
+        """Print any recent output from the server process."""
+        if not self.server_process or not self.server_process.stdout:
+            return
+        try:
+            # Read any available output

-        if kwargs.get("passages_file"):
-            command.extend(["--passages-file", str(kwargs["passages_file"])])
-        if embedding_mode != "sentence-transformers":
-            command.extend(["--embedding-mode", embedding_mode])
+            if select.select([self.server_process.stdout], [], [], 0)[0]:
+                output = self.server_process.stdout.read()
+                if output:
+                    print(f"[{self.backend_module_name} OUTPUT]: {output}")
+        except Exception as e:
+            print(f"Error reading server output: {e}")

-        return command
-
-    def _launch_server_process(self, command: list, port: int) -> None:
-        """Launch the server process."""
-        project_root = Path(__file__).parent.parent.parent.parent.parent
-        logger.info(f"Command: {' '.join(command)}")
-
-        # Let server output go directly to console
-        # The server will respect LEANN_LOG_LEVEL environment variable
-        self.server_process = subprocess.Popen(
-            command,
-            cwd=project_root,
-            stdout=None,  # Direct to console
-            stderr=None,  # Direct to console
-        )
-        self.server_port = port
-        logger.info(f"Server process started with PID: {self.server_process.pid}")
-
-        # Register atexit callback only when we actually start a process
-        if not self._atexit_registered:
-            # Use a lambda to avoid issues with bound methods
-            atexit.register(lambda: self.stop_server() if self.server_process else None)
-            self._atexit_registered = True
-
-    def _wait_for_server_ready(self, port: int) -> tuple[bool, int]:
-        """Wait for the server to be ready."""
-        max_wait, wait_interval = 120, 0.5
-        for _ in range(int(max_wait / wait_interval)):
-            if _check_port(port):
-                logger.info("Embedding server is ready!")
-                return True, port
-
-            if self.server_process and self.server_process.poll() is not None:
-                logger.error("Server terminated during startup.")
-                return False, port
-
-            time.sleep(wait_interval)
-
-        logger.error(f"Server failed to start within {max_wait} seconds.")
-        self.stop_server()
-        return False, port
+    def _log_monitor(self):
+        """Monitors and prints the server's stdout and stderr."""
+        if not self.server_process:
+            return
+        try:
+            if self.server_process.stdout:
+                while True:
+                    line = self.server_process.stdout.readline()
+                    if not line:
+                        break
+                    print(
+                        f"[{self.backend_module_name} LOG]: {line.strip()}", flush=True
+                    )
+        except Exception as e:
+            print(f"Log monitor error: {e}")

    def stop_server(self):
        """Stops the embedding server process if it's running."""
-        if not self.server_process:
-            return
-
-        if self.server_process.poll() is not None:
-            # Process already terminated
-            self.server_process = None
-            return
-
-        logger.info(
-            f"Terminating server process (PID: {self.server_process.pid}) for backend {self.backend_module_name}..."
-        )
-        self.server_process.terminate()
-
-        try:
-            self.server_process.wait(timeout=5)
-            logger.info(f"Server process {self.server_process.pid} terminated.")
-        except subprocess.TimeoutExpired:
-            logger.warning(
-                f"Server process {self.server_process.pid} did not terminate gracefully, killing it."
+        if self.server_process and self.server_process.poll() is None:
+            print(
+                f"INFO: Terminating session server process (PID: {self.server_process.pid})..."
            )
-            self.server_process.kill()
-
-        # Clean up process resources to prevent resource tracker warnings
-        try:
-            self.server_process.wait()  # Ensure process is fully cleaned up
-        except Exception:
-            pass
-
+            self.server_process.terminate()
+            try:
+                self.server_process.wait(timeout=5)
+                print("INFO: Server process terminated.")
+            except subprocess.TimeoutExpired:
+                print(
+                    "WARNING: Server process did not terminate gracefully, killing it."
+                )
+                self.server_process.kill()
        self.server_process = None
--- a/packages/leann-core/src/leann/interface.py
+++ b/packages/leann-core/src/leann/interface.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 import numpy as np
-from typing import Dict, Any, List, Literal, Optional
+from typing import Dict, Any, List, Literal


 class LeannBackendBuilderInterface(ABC):
@@ -34,13 +34,6 @@ class LeannBackendSearcherInterface(ABC):
        """
        pass

-    @abstractmethod
-    def _ensure_server_running(
-        self, passages_source_file: str, port: Optional[int], **kwargs
-    ) -> int:
-        """Ensure server is running"""
-        pass
-
    @abstractmethod
    def search(
        self,
@@ -51,7 +44,7 @@ class LeannBackendSearcherInterface(ABC):
        prune_ratio: float = 0.0,
        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-        zmq_port: Optional[int] = None,
+        zmq_port: int = 5557,
        **kwargs,
    ) -> Dict[str, Any]:
        """Search for nearest neighbors
@@ -64,7 +57,7 @@ class LeannBackendSearcherInterface(ABC):
            prune_ratio: Ratio of neighbors to prune via approximate distance (0.0-1.0)
            recompute_embeddings: Whether to fetch fresh embeddings from server vs use stored PQ codes
            pruning_strategy: PQ candidate selection strategy - "global" (default), "local", or "proportional"
-            zmq_port: ZMQ port for embedding server communication. Must be provided if recompute_embeddings is True.
+            zmq_port: ZMQ port for embedding server communication
            **kwargs: Backend-specific parameters

        Returns:
@@ -74,10 +67,7 @@ class LeannBackendSearcherInterface(ABC):

    @abstractmethod
    def compute_query_embedding(
-        self,
-        query: str,
-        use_server_if_available: bool = True,
-        zmq_port: Optional[int] = None,
+        self, query: str, zmq_port: int = 5557, use_server_if_available: bool = True
    ) -> np.ndarray:
        """Compute embedding for a query string

--- a/packages/leann-core/src/leann/registry.py
+++ b/packages/leann-core/src/leann/registry.py
@@ -7,37 +7,30 @@ import importlib.metadata
 if TYPE_CHECKING:
    from leann.interface import LeannBackendFactoryInterface

-BACKEND_REGISTRY: Dict[str, "LeannBackendFactoryInterface"] = {}
-
+BACKEND_REGISTRY: Dict[str, 'LeannBackendFactoryInterface'] = {}

 def register_backend(name: str):
    """A decorator to register a new backend class."""
-
    def decorator(cls):
        print(f"INFO: Registering backend '{name}'")
        BACKEND_REGISTRY[name] = cls
        return cls
-
    return decorator

-
 def autodiscover_backends():
    """Automatically discovers and imports all 'leann-backend-*' packages."""
-    # print("INFO: Starting backend auto-discovery...")
+    print("INFO: Starting backend auto-discovery...")
    discovered_backends = []
    for dist in importlib.metadata.distributions():
-        dist_name = dist.metadata["name"]
-        if dist_name.startswith("leann-backend-"):
-            backend_module_name = dist_name.replace("-", "_")
+        dist_name = dist.metadata['name']
+        if dist_name.startswith('leann-backend-'):
+            backend_module_name = dist_name.replace('-', '_')
            discovered_backends.append(backend_module_name)
-
-    for backend_module_name in sorted(
-        discovered_backends
-    ):  # sort for deterministic loading
+            
+    for backend_module_name in sorted(discovered_backends): # sort for deterministic loading
        try:
            importlib.import_module(backend_module_name)
            # Registration message is printed by the decorator
        except ImportError as e:
-            # print(f"WARN: Could not import backend module '{backend_module_name}': {e}")
-            pass
-    # print("INFO: Backend auto-discovery finished.")
+            print(f"WARN: Could not import backend module '{backend_module_name}': {e}")
+    print("INFO: Backend auto-discovery finished.")
--- a/packages/leann-core/src/leann/searcher_base.py
+++ b/packages/leann-core/src/leann/searcher_base.py
@@ -1,7 +1,8 @@
 import json
+import pickle
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, Any, Literal, Optional
+from typing import Dict, Any, Literal

 import numpy as np

@@ -42,10 +43,10 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
                "WARNING: embedding_model not found in meta.json. Recompute will fail."
            )

-        self.embedding_mode = self.meta.get("embedding_mode", "sentence-transformers")
+        self.label_map = self._load_label_map()

        self.embedding_server_manager = EmbeddingServerManager(
-            backend_module_name=backend_module_name,
+            backend_module_name=backend_module_name
        )

    def _load_meta(self) -> Dict[str, Any]:
@@ -57,9 +58,17 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
        with open(meta_path, "r", encoding="utf-8") as f:
            return json.load(f)

+    def _load_label_map(self) -> Dict[int, str]:
+        """Loads the mapping from integer IDs to string IDs."""
+        label_map_file = self.index_dir / "leann.labels.map"
+        if not label_map_file.exists():
+            raise FileNotFoundError(f"Label map file not found: {label_map_file}")
+        with open(label_map_file, "rb") as f:
+            return pickle.load(f)
+
    def _ensure_server_running(
        self, passages_source_file: str, port: int, **kwargs
-    ) -> int:
+    ) -> None:
        """
        Ensures the embedding server is running if recompute is needed.
        This is a helper for subclasses.
@@ -69,26 +78,21 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
                "Cannot use recompute mode without 'embedding_model' in meta.json."
            )

-        server_started, actual_port = self.embedding_server_manager.start_server(
+        embedding_mode = self.meta.get("embedding_mode", "sentence-transformers")
+        
+        server_started = self.embedding_server_manager.start_server(
            port=port,
            model_name=self.embedding_model,
-            embedding_mode=self.embedding_mode,
            passages_file=passages_source_file,
            distance_metric=kwargs.get("distance_metric"),
+            embedding_mode=embedding_mode,
            enable_warmup=kwargs.get("enable_warmup", False),
        )
        if not server_started:
-            raise RuntimeError(
-                f"Failed to start embedding server on port {actual_port}"
-            )
-
-        return actual_port
+            raise RuntimeError(f"Failed to start embedding server on port {port}")

    def compute_query_embedding(
-        self,
-        query: str,
-        use_server_if_available: bool = True,
-        zmq_port: int = 5557,
+        self, query: str, zmq_port: int = 5557, use_server_if_available: bool = True
    ) -> np.ndarray:
        """
        Compute embedding for a query string.
@@ -102,20 +106,12 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
            Query embedding as numpy array
        """
        # Try to use embedding server if available and requested
-        if use_server_if_available:
+        if (
+            use_server_if_available
+            and self.embedding_server_manager
+            and self.embedding_server_manager.server_process
+        ):
            try:
-                # TODO: Maybe we can directly use this port here?
-                # For this internal method, it's ok to assume that the server is running
-                # on that port?
-
-                # Ensure we have a server with passages_file for compatibility
-                passages_source_file = (
-                    self.index_dir / f"{self.index_path.name}.meta.json"
-                )
-                zmq_port = self._ensure_server_running(
-                    str(passages_source_file), zmq_port
-                )
-
                return self._compute_embedding_via_server([query], zmq_port)[
                    0:1
                ]  # Return (1, D) shape
@@ -124,7 +120,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
                print("⏭️ Falling back to direct model loading...")

        # Fallback to direct computation
-        from .embedding_compute import compute_embeddings
+        from .api import compute_embeddings

        embedding_mode = self.meta.get("embedding_mode", "sentence-transformers")
        return compute_embeddings([query], self.embedding_model, embedding_mode)
@@ -171,7 +167,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
        prune_ratio: float = 0.0,
        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-        zmq_port: Optional[int] = None,
+        zmq_port: int = 5557,
        **kwargs,
    ) -> Dict[str, Any]:
        """
@@ -185,7 +181,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
            prune_ratio: Ratio of neighbors to prune via approximate distance (0.0-1.0)
            recompute_embeddings: Whether to fetch fresh embeddings from server vs use stored PQ codes
            pruning_strategy: PQ candidate selection strategy - "global" (default), "local", or "proportional"
-            zmq_port: ZMQ port for embedding server communication. Must be provided if recompute_embeddings is True.
+            zmq_port: ZMQ port for embedding server communication
            **kwargs: Backend-specific parameters (e.g., batch_size, dedup_node_dis, etc.)

        Returns:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,7 @@ requires-python = ">=3.10"

 dependencies = [
    "leann-core",
+    "leann-backend-diskann",
    "leann-backend-hnsw",
    "numpy>=1.26.0",
    "torch",
@@ -35,7 +36,6 @@ dependencies = [
    "llama-index-embeddings-huggingface>=0.5.5",
    "mlx>=0.26.3",
    "mlx-lm>=0.26.0",
-    "psutil>=5.8.0",
 ]

 [project.optional-dependencies]
@@ -48,10 +48,6 @@ dev = [
    "huggingface-hub>=0.20.0",
 ]

-diskann = [
-    "leann-backend-diskann",
-]
-
 [tool.setuptools]
 py-modules = []

--- a/test/build_mlx_index.py
+++ b/test/build_mlx_index.py
@@ -12,7 +12,7 @@ else:
    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ",
-        embedding_mode="mlx",
+        use_mlx=True,
    )

    # 2. Add documents
--- a/uv.lock
+++ b/uv.lock
@@ -1834,14 +1834,10 @@ source = { editable = "packages/leann-core" }
 dependencies = [
    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
    { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "tqdm" },
 ]

 [package.metadata]
-requires-dist = [
-    { name = "numpy", specifier = ">=1.20.0" },
-    { name = "tqdm", specifier = ">=4.60.0" },
-]
+requires-dist = [{ name = "numpy", specifier = ">=1.20.0" }]

 [[package]]
 name = "leann-workspace"
@@ -1855,6 +1851,7 @@ dependencies = [
    { name = "flask" },
    { name = "flask-compress" },
    { name = "ipykernel" },
+    { name = "leann-backend-diskann" },
    { name = "leann-backend-hnsw" },
    { name = "leann-core" },
    { name = "llama-index" },
@@ -1870,7 +1867,6 @@ dependencies = [
    { name = "ollama" },
    { name = "openai" },
    { name = "protobuf" },
-    { name = "psutil" },
    { name = "pypdf2" },
    { name = "requests" },
    { name = "sentence-transformers" },
@@ -1888,9 +1884,6 @@ dev = [
    { name = "pytest-cov" },
    { name = "ruff" },
 ]
-diskann = [
-    { name = "leann-backend-diskann" },
-]

 [package.metadata]
 requires-dist = [
@@ -1903,7 +1896,7 @@ requires-dist = [
    { name = "flask-compress" },
    { name = "huggingface-hub", marker = "extra == 'dev'", specifier = ">=0.20.0" },
    { name = "ipykernel", specifier = "==6.29.5" },
-    { name = "leann-backend-diskann", marker = "extra == 'diskann'", editable = "packages/leann-backend-diskann" },
+    { name = "leann-backend-diskann", editable = "packages/leann-backend-diskann" },
    { name = "leann-backend-hnsw", editable = "packages/leann-backend-hnsw" },
    { name = "leann-core", editable = "packages/leann-core" },
    { name = "llama-index", specifier = ">=0.12.44" },
@@ -1919,7 +1912,6 @@ requires-dist = [
    { name = "ollama" },
    { name = "openai", specifier = ">=1.0.0" },
    { name = "protobuf", specifier = "==4.25.3" },
-    { name = "psutil", specifier = ">=5.8.0" },
    { name = "pypdf2", specifier = ">=3.0.0" },
    { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
    { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },
@@ -1930,7 +1922,7 @@ requires-dist = [
    { name = "torch" },
    { name = "tqdm" },
 ]
-provides-extras = ["dev", "diskann"]
+provides-extras = ["dev"]

 [[package]]
 name = "llama-cloud"
Author	SHA1	Message	Date
Yichuan Wang	f83c97e6d1	Merge branch 'main' into readme-polish	2025-07-19 21:47:17 -07:00
Andy Lee	6e755f0402	docs: follow yichuan's suggestion	2025-07-19 21:44:31 -07:00
Andy Lee	cc6b904c44	docs: follow yichuan's suggestion	2025-07-19 21:21:41 -07:00
Andy Lee	bda028cc1b	docs: polish	2025-07-19 21:02:25 -07:00
Andy Lee	bed814e7e6	docs: polish	2025-07-19 20:45:50 -07:00
Andy Lee	96f74973b1	docs: how it works earlier	2025-07-19 20:42:52 -07:00
Andy Lee	1f90cdfafb	docs: polish	2025-07-19 20:35:15 -07:00
Andy Lee	8f4f66d871	docs: highlight applications	2025-07-19 20:23:29 -07:00
Andy Lee	43b52a8c0a	docs: polish	2025-07-19 20:21:25 -07:00
Andy Lee	1a3180bc0f	docs: readme effects	2025-07-19 19:54:21 -07:00
Andy Lee	fe4a748a69	docs: logo with text	2025-07-19 16:47:06 -07:00
Andy Lee	d296f372e0	docs: logo	2025-07-19 16:26:31 -07:00
Andy Lee	909835dd2d	docs: logo	2025-07-19 16:24:40 -07:00
Andy Lee	1eea69e8d7	docs: polish	2025-07-19 16:16:24 -07:00