fix larger file read and add faq

commit 368474d036
parent a627abe794
Author: yichuan520030910320
Date: 2025-07-03 23:25:36 +00:00

5 changed files with 8825 additions and 2526 deletions

.gitignore

@@ -29,6 +29,7 @@ build/
 nprobe_logs/
 micro/results
 micro/contriever-INT8
+examples/data/
 *.qdstrm
 benchmark_results/
 results/

@@ -241,6 +241,25 @@ uv run python tests/sanity_checks/test_distance_functions.py
 # Verify L2 implementation
 uv run python tests/sanity_checks/test_l2_verification.py
 ```
+
+## ❓ FAQ
+
+### Common Issues
+
+#### NCCL Topology Error
+
+**Problem**: You encounter an `ncclTopoComputePaths` error during document processing:
+
+```
+ncclTopoComputePaths (system=<optimized out>, comm=comm@entry=0x5555a82fa3c0) at graph/paths.cc:688
+```
+
+**Solution**: Set these environment variables before running your script:
+
+```bash
+export NCCL_TOPO_DUMP_FILE=/tmp/nccl_topo.xml
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_SUBSYS=INIT,GRAPH
+export NCCL_IB_DISABLE=1
+export NCCL_NET_PLUGIN=none
+export NCCL_SOCKET_IFNAME=ens5
+```
+
 ## 📈 Roadmap
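For cases where exporting variables in the shell is inconvenient, the same workaround can be applied at the top of the script, before any NCCL-backed library (e.g. `torch.distributed`) initializes. This is a minimal sketch of that idea, not part of the commit; the values mirror the FAQ entry above, and `ens5` is machine-specific.

```python
# Sketch (not part of this commit): apply the FAQ's NCCL workaround from Python.
# NCCL_SOCKET_IFNAME must name a real network interface on your host (see `ip link`).
import os

NCCL_WORKAROUND = {
    "NCCL_TOPO_DUMP_FILE": "/tmp/nccl_topo.xml",
    "NCCL_DEBUG": "INFO",
    "NCCL_DEBUG_SUBSYS": "INIT,GRAPH",
    "NCCL_IB_DISABLE": "1",
    "NCCL_NET_PLUGIN": "none",
    "NCCL_SOCKET_IFNAME": "ens5",  # machine-specific; adjust for your host
}

for key, value in NCCL_WORKAROUND.items():
    # setdefault keeps any values already exported in the shell
    os.environ.setdefault(key, value)

# Import NCCL-backed libraries only after this point: NCCL reads these
# variables when it initializes, so setting them later has no effect.
```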

@@ -7,7 +7,7 @@ import asyncio
 import os
 import dotenv
 from leann.api import LeannBuilder, LeannSearcher, LeannChat
-import leann_backend_hnsw  # Import to ensure backend registration
+import leann_backend_diskann  # Import to ensure backend registration
 import shutil
 from pathlib import Path
@@ -21,9 +21,9 @@ file_extractor: dict[str, BaseReader] = {
     ".xlsx": reader,
 }
 node_parser = DoclingNodeParser(
-    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64)
+    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=256)
 )
 print("Loading documents...")
 documents = SimpleDirectoryReader(
     "examples/data",
     recursive=True,
@@ -31,7 +31,7 @@ documents = SimpleDirectoryReader(
     encoding="utf-8",
     required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
 ).load_data(show_progress=True)
 print("Documents loaded.")
 # Extract text from documents and prepare for Leann
 all_texts = []
 for doc in documents:
@@ -50,7 +50,7 @@ if INDEX_DIR.exists():
 print(f"\n[PHASE 1] Building Leann index...")
 builder = LeannBuilder(
-    backend_name="hnsw",
+    backend_name="diskann",
     embedding_model="facebook/contriever",  # Using a common sentence transformer model
     graph_degree=32,
     complexity=64
@@ -67,9 +67,10 @@ async def main():
     print(f"\n[PHASE 2] Starting Leann chat session...")
     chat = LeannChat(index_path=INDEX_PATH)
-    query = "Based on the paper, what are the main techniques LEANN and DLPM explores to reduce the storage overhead?"
+    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explores to achieve the Fairness and Efficiency trade-off?"
+    # query = "What is the Off-policy training in RL?"
     print(f"You: {query}")
-    chat_response = chat.ask(query, top_k=10, recompute_beighbor_embeddings=True)
+    chat_response = chat.ask(query, top_k=20, recompute_beighbor_embeddings=True)
     print(f"Leann: {chat_response}")
 if __name__ == "__main__":
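Distilled, the example now registers the DiskANN backend, builds with `backend_name="diskann"` and larger 256-token chunks, and retrieves 20 neighbors per question. A minimal sketch of that pattern follows; only the names visible in this diff come from the example itself, while the index path is a placeholder and the ingestion/build steps are omitted because they are not shown here.

```python
# Sketch distilled from the example above. INDEX_PATH is hypothetical; the
# text ingestion and index-build steps are omitted (not shown in this diff).
from leann.api import LeannBuilder, LeannChat
import leann_backend_diskann  # noqa: F401  -- importing registers the "diskann" backend

INDEX_PATH = "examples/paper_index.leann"  # placeholder path for illustration

# Build configuration as changed by this commit: DiskANN backend, same
# embedding model and graph parameters as before.
builder = LeannBuilder(
    backend_name="diskann",
    embedding_model="facebook/contriever",
    graph_degree=32,
    complexity=64,
)

# Query with the larger top_k the commit introduces (10 -> 20).
chat = LeannChat(index_path=INDEX_PATH)
response = chat.ask(
    "Based on the paper, what techniques does LEANN explore to reduce storage overhead?",
    top_k=20,
    recompute_beighbor_embeddings=True,  # parameter name as spelled in the example
)
print(response)
```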

@@ -265,6 +265,7 @@ class HNSWSearcher(LeannBackendSearcherInterface):
     def search(self, query: np.ndarray, top_k: int, **kwargs) -> Dict[str, any]:
         """Search using HNSW index with optional recompute functionality"""
+        from . import faiss
         ef = kwargs.get("ef", 200)  # Size of the dynamic candidate list for search
         # Recompute parameters
@@ -293,15 +294,20 @@ class HNSWSearcher(LeannBackendSearcherInterface):
         # Set search parameter
         self._index.hnsw.efSearch = ef
+        # Prepare output arrays for the older FAISS SWIG API
+        batch_size = query.shape[0]
+        distances = np.empty((batch_size, top_k), dtype=np.float32)
+        labels = np.empty((batch_size, top_k), dtype=np.int64)
         if recompute_neighbor_embeddings:
             # Use custom search with recompute
             # This would require implementing custom HNSW search logic
             # For now, we'll fall back to standard search
             print("WARNING: Recompute functionality for HNSW not yet implemented, using standard search")
-            distances, labels = self._index.search(query, top_k)
+            self._index.search(query.shape[0], faiss.swig_ptr(query), top_k, faiss.swig_ptr(distances), faiss.swig_ptr(labels))
         else:
-            # Standard FAISS search
-            distances, labels = self._index.search(query, top_k)
+            # Standard FAISS search using SWIG API
+            self._index.search(query.shape[0], faiss.swig_ptr(query), top_k, faiss.swig_ptr(distances), faiss.swig_ptr(labels))
         return {"labels": labels, "distances": distances}
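The replacement calls drop the numpy-style `search(query, top_k)` in favor of the raw SWIG signature, which writes results into caller-allocated buffers instead of returning arrays. Below is a small standalone sketch of that pattern, assuming a faiss build whose `Index.search` exposes the C++ signature the way the module bundled with this backend apparently does; with stock faiss-python you would call `index.search(xq, k)` instead.

```python
# Sketch of the caller-allocated-buffer search pattern used above.
# Assumption: the local faiss module exposes the raw SWIG signature
#   search(n, x_ptr, k, distances_ptr, labels_ptr)
# rather than the numpy convenience wrapper.
import numpy as np
import faiss  # here: whichever faiss build the backend bundles

def swig_search(index, query: np.ndarray, top_k: int):
    # FAISS expects float32, C-contiguous input of shape (n, d).
    query = np.ascontiguousarray(query, dtype=np.float32)
    n = query.shape[0]

    # Output buffers must be allocated by the caller and match FAISS's
    # types: float32 distances and int64 labels, shape (n, top_k).
    distances = np.empty((n, top_k), dtype=np.float32)
    labels = np.empty((n, top_k), dtype=np.int64)

    # swig_ptr hands FAISS raw pointers into the numpy buffers, so the
    # results are written in place and the call returns nothing.
    index.search(n, faiss.swig_ptr(query), top_k,
                 faiss.swig_ptr(distances), faiss.swig_ptr(labels))
    return distances, labels
```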

(Fifth file: diff suppressed because it is too large.)