fix larger file read and add FAQ
.gitignore (vendored, 1 line added)
@@ -29,6 +29,7 @@ build/
 nprobe_logs/
 micro/results
 micro/contriever-INT8
+examples/data/
 *.qdstrm
 benchmark_results/
 results/

README.md (19 lines added)
@@ -241,6 +241,25 @@ uv run python tests/sanity_checks/test_distance_functions.py
 # Verify L2 implementation
 uv run python tests/sanity_checks/test_l2_verification.py
 ```
 
+## ❓ FAQ
+
+### Common Issues
+
+#### NCCL Topology Error
+**Problem**: You encounter an `ncclTopoComputePaths` error during document processing:
+```
+ncclTopoComputePaths (system=<optimized out>, comm=comm@entry=0x5555a82fa3c0) at graph/paths.cc:688
+```
+
+**Solution**: Set these environment variables before running your script:
+```bash
+export NCCL_TOPO_DUMP_FILE=/tmp/nccl_topo.xml
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_SUBSYS=INIT,GRAPH
+export NCCL_IB_DISABLE=1
+export NCCL_NET_PLUGIN=none
+export NCCL_SOCKET_IFNAME=ens5
+```
 ## 📈 Roadmap
 
@@ -7,7 +7,7 @@ import asyncio
 import os
 import dotenv
 from leann.api import LeannBuilder, LeannSearcher, LeannChat
-import leann_backend_hnsw  # Import to ensure backend registration
+import leann_backend_diskann  # Import to ensure backend registration
 import shutil
 from pathlib import Path
 
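The swapped import works by side effect: merely importing the backend package registers it under its name, which is why the line carries no other references. A generic sketch of that pattern (illustrative only; Leann's actual registry internals are not shown in this diff):

```python
# Illustrative registry-by-import pattern; not Leann's real internals.
BACKENDS: dict[str, type] = {}

def register_backend(name: str):
    """Class decorator that records a backend under `name` at import time."""
    def deco(cls: type) -> type:
        BACKENDS[name] = cls
        return cls
    return deco

@register_backend("diskann")
class DiskannBackend:
    pass

print(BACKENDS)  # {'diskann': <class '__main__.DiskannBackend'>}
```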
@@ -21,9 +21,9 @@ file_extractor: dict[str, BaseReader] = {
     ".xlsx": reader,
 }
 node_parser = DoclingNodeParser(
-    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64)
+    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=256)
 )
-
+print("Loading documents...")
 documents = SimpleDirectoryReader(
     "examples/data",
     recursive=True,
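The `max_tokens` bump is the likely fix behind the commit title: larger chunks mean far fewer nodes when reading big files. Rough arithmetic under an assumed document size (numbers are illustrative, not from the repo):

```python
# Assume a large file of ~4,096 tokens after tokenization (illustrative).
doc_tokens = 4096
print(doc_tokens // 64)   # max_tokens=64  -> about 64 chunks
print(doc_tokens // 256)  # max_tokens=256 -> about 16 chunks
```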
@@ -31,7 +31,7 @@ documents = SimpleDirectoryReader(
     encoding="utf-8",
     required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
 ).load_data(show_progress=True)
-
+print("Documents loaded.")
 # Extract text from documents and prepare for Leann
 all_texts = []
 for doc in documents:
@@ -50,7 +50,7 @@ if INDEX_DIR.exists():
 print(f"\n[PHASE 1] Building Leann index...")
 
 builder = LeannBuilder(
-    backend_name="hnsw",
+    backend_name="diskann",
     embedding_model="facebook/contriever",  # Using a common sentence transformer model
     graph_degree=32,
     complexity=64
@@ -67,9 +67,10 @@ async def main():
     print(f"\n[PHASE 2] Starting Leann chat session...")
     chat = LeannChat(index_path=INDEX_PATH)
 
-    query = "Based on the paper, what are the main techniques LEANN and DLPM explores to reduce the storage overhead?"
+    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explores to achieve the fairness and efficiency trade-off?"
+    # query = "What is the Off-policy training in RL?"
     print(f"You: {query}")
-    chat_response = chat.ask(query, top_k=10, recompute_neighbor_embeddings=True)
+    chat_response = chat.ask(query, top_k=20, recompute_neighbor_embeddings=True)
     print(f"Leann: {chat_response}")
 
 if __name__ == "__main__":
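Condensed usage of the chat step above, using only calls that appear in this diff; the index path is hypothetical and assumes the index has already been built:

```python
from leann.api import LeannChat
import leann_backend_diskann  # import registers the diskann backend, as above

# Hypothetical path; in the example it comes from INDEX_PATH.
chat = LeannChat(index_path="examples/data.leann")
response = chat.ask(
    "What is the Off-policy training in RL?",  # the commented-out query above
    top_k=20,
    recompute_neighbor_embeddings=True,
)
print(f"Leann: {response}")
```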
@@ -265,6 +265,7 @@ class HNSWSearcher(LeannBackendSearcherInterface):
 
     def search(self, query: np.ndarray, top_k: int, **kwargs) -> Dict[str, any]:
         """Search using HNSW index with optional recompute functionality"""
+        from . import faiss
         ef = kwargs.get("ef", 200)  # Size of the dynamic candidate list for search
 
         # Recompute parameters
@@ -293,15 +294,20 @@ class HNSWSearcher(LeannBackendSearcherInterface):
         # Set search parameter
         self._index.hnsw.efSearch = ef
 
+        # Prepare output arrays for the older FAISS SWIG API
+        batch_size = query.shape[0]
+        distances = np.empty((batch_size, top_k), dtype=np.float32)
+        labels = np.empty((batch_size, top_k), dtype=np.int64)
+
         if recompute_neighbor_embeddings:
             # Use custom search with recompute
             # This would require implementing custom HNSW search logic
             # For now, we'll fall back to standard search
             print("WARNING: Recompute functionality for HNSW not yet implemented, using standard search")
-            distances, labels = self._index.search(query, top_k)
+            self._index.search(query.shape[0], faiss.swig_ptr(query), top_k, faiss.swig_ptr(distances), faiss.swig_ptr(labels))
         else:
-            # Standard FAISS search
-            distances, labels = self._index.search(query, top_k)
+            # Standard FAISS search using SWIG API
+            self._index.search(query.shape[0], faiss.swig_ptr(query), top_k, faiss.swig_ptr(distances), faiss.swig_ptr(labels))
 
         return {"labels": labels, "distances": distances}
 
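The new call site uses the raw SWIG signature `search(n, x_ptr, k, D_ptr, I_ptr)`, which writes into preallocated buffers instead of returning arrays. A minimal sketch against stock `faiss-cpu` showing the same contract; the raw call is left as a comment because stock builds wrap `search` pythonically, while this backend's bundled `faiss` is assumed to expose it directly:

```python
import numpy as np
import faiss  # assumes the faiss-cpu package

d, n, k = 64, 1000, 10
xb = np.random.default_rng(0).random((n, d), dtype=np.float32)

index = faiss.IndexHNSWFlat(d, 32)  # 32 ~ graph degree, as in the builder above
index.add(xb)
index.hnsw.efSearch = 200           # dynamic candidate list, as in search()

queries = np.ascontiguousarray(xb[:4])          # float32, C-contiguous
distances = np.empty((4, k), dtype=np.float32)  # preallocated outputs,
labels = np.empty((4, k), dtype=np.int64)       # matching the diff's dtypes

# Raw SWIG form used by the diff (works only when search() is not re-wrapped):
# index.search(4, faiss.swig_ptr(queries), k,
#              faiss.swig_ptr(distances), faiss.swig_ptr(labels))
distances[:], labels[:] = index.search(queries, k)
print(labels[0])  # neighbors of the first query; should start with index 0
```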
File diff suppressed because it is too large