fix larger file read and add faq

2025-07-03 23:25:36 +00:00
parent a627abe794
commit 368474d036
5 changed files with 8825 additions and 2526 deletions
@@ -29,6 +29,7 @@ build/
 nprobe_logs/
 micro/results
 micro/contriever-INT8
+examples/data/
 *.qdstrm
 benchmark_results/
 results/
@@ -241,6 +241,25 @@ uv run python tests/sanity_checks/test_distance_functions.py
 # Verify L2 implementation
 uv run python tests/sanity_checks/test_l2_verification.py
 ```
+## ❓ FAQ
+
+### Common Issues
+
+#### NCCL Topology Error
+**Problem**: You encounter an `ncclTopoComputePaths` error during document processing:
+```
+ncclTopoComputePaths (system=<optimized out>, comm=comm@entry=0x5555a82fa3c0) at graph/paths.cc:688
+```
+
+**Solution**: Set these environment variables before running your script (replace `ens5` with the name of your machine's network interface):
+```bash
+export NCCL_TOPO_DUMP_FILE=/tmp/nccl_topo.xml
+export NCCL_DEBUG=INFO
+export NCCL_DEBUG_SUBSYS=INIT,GRAPH
+export NCCL_IB_DISABLE=1
+export NCCL_NET_PLUGIN=none
+export NCCL_SOCKET_IFNAME=ens5
+```

 ## 📈 Roadmap

@@ -7,7 +7,7 @@ import asyncio
 import os
 import dotenv
 from leann.api import LeannBuilder, LeannSearcher, LeannChat
-import leann_backend_hnsw # Import to ensure backend registration
+import leann_backend_diskann # Import to ensure backend registration
 import shutil
 from pathlib import Path

@@ -21,9 +21,9 @@ file_extractor: dict[str, BaseReader] = {
    ".xlsx": reader,
 }
 node_parser = DoclingNodeParser(
-    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64)
+    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=256)
 )
-
+print("Loading documents...")
 documents = SimpleDirectoryReader(
    "examples/data", 
    recursive=True, 
@@ -31,7 +31,7 @@ documents = SimpleDirectoryReader(
    encoding="utf-8",
    required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
 ).load_data(show_progress=True)
-
+print("Documents loaded.")
 # Extract text from documents and prepare for Leann
 all_texts = []
 for doc in documents:
@@ -50,7 +50,7 @@ if INDEX_DIR.exists():
 print(f"\n[PHASE 1] Building Leann index...")

 builder = LeannBuilder(
-    backend_name="hnsw",
+    backend_name="diskann",
    embedding_model="facebook/contriever", # Using a common sentence transformer model
    graph_degree=32, 
    complexity=64
@@ -67,9 +67,10 @@ async def main():
    print(f"\n[PHASE 2] Starting Leann chat session...")
    chat = LeannChat(index_path=INDEX_PATH)
    
-    query = "Based on the paper, what are the main techniques LEANN and DLPM explores to reduce the storage overhead?"
+    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
+    # query = "What is the Off-policy training in RL?"
    print(f"You: {query}")
-    chat_response = chat.ask(query, top_k=10, recompute_beighbor_embeddings=True)
+    chat_response = chat.ask(query, top_k=20, recompute_beighbor_embeddings=True)
    print(f"Leann: {chat_response}")

 if __name__ == "__main__":
@@ -265,6 +265,7 @@ class HNSWSearcher(LeannBackendSearcherInterface):

    def search(self, query: np.ndarray, top_k: int, **kwargs) -> Dict[str, any]:
        """Search using HNSW index with optional recompute functionality"""
+        from . import faiss
        ef = kwargs.get("ef", 200)  # Size of the dynamic candidate list for search
        
        # Recompute parameters
@@ -293,15 +294,20 @@ class HNSWSearcher(LeannBackendSearcherInterface):
            # Set search parameter
            self._index.hnsw.efSearch = ef
            
+            # Prepare output arrays for the older FAISS SWIG API
+            batch_size = query.shape[0]
+            distances = np.empty((batch_size, top_k), dtype=np.float32)
+            labels = np.empty((batch_size, top_k), dtype=np.int64)
+            
            if recompute_neighbor_embeddings:
                # Use custom search with recompute
                # This would require implementing custom HNSW search logic
                # For now, we'll fall back to standard search
                print("WARNING: Recompute functionality for HNSW not yet implemented, using standard search")
-                distances, labels = self._index.search(query, top_k)
+                self._index.search(query.shape[0], faiss.swig_ptr(query), top_k, faiss.swig_ptr(distances), faiss.swig_ptr(labels))
            else:
-                # Standard FAISS search
-                distances, labels = self._index.search(query, top_k)
+                # Standard FAISS search using SWIG API
+                self._index.search(query.shape[0], faiss.swig_ptr(query), top_k, faiss.swig_ptr(distances), faiss.swig_ptr(labels))
            
            return {"labels": labels, "distances": distances}