diff --git a/README.md b/README.md
index 95bc2bc..13aeef2 100755
--- a/README.md
+++ b/README.md
@@ -145,12 +145,11 @@ Above we showed the Python API, while this CLI script demonstrates the same conc
The following scripts use Ollama `qwen3:8b` by default, so you need `ollama pull qwen3:8b` first. For other models: `--llm openai --model gpt-4o` (requires `OPENAI_API_KEY` environment variable) or `--llm hf --model Qwen/Qwen3-4B`.
```bash
-# Drop your PDFs, .txt, .md files into examples/data/
-uv run ./examples/main_cli_example.py
+# Drop your PDFs, .txt, .md files into apps/documents/data/
+python -m apps.documents
-# Or use python directly
-source .venv/bin/activate
-python ./examples/main_cli_example.py
+# Or with uv
+uv run python -m apps.documents
```
@@ -159,7 +158,7 @@ python ./examples/main_cli_example.py
### Search Your Entire Life
```bash
-python examples/mail_reader_leann.py
+python -m apps.email
# "What's the number of class recommend to take per semester for incoming EECS students?"
```
**90K emails → 14MB.** Finally, search your email like you search Google.
@@ -169,19 +168,19 @@ python examples/mail_reader_leann.py
```bash
# Use default mail path (works for most macOS setups)
-python examples/mail_reader_leann.py
+python -m apps.email
# Run with custom index directory
-python examples/mail_reader_leann.py --index-dir "./my_mail_index"
+python -m apps.email --index-dir "./my_mail_index"
# Process all emails (may take time but indexes everything)
-python examples/mail_reader_leann.py --max-emails -1
+python -m apps.email --max-emails -1
# Limit number of emails processed (useful for testing)
-python examples/mail_reader_leann.py --max-emails 1000
+python -m apps.email --max-emails 1000
# Run a single query
-python examples/mail_reader_leann.py --query "What did my boss say about deadlines?"
+python -m apps.email --query "What did my boss say about deadlines?"
```
@@ -197,7 +196,7 @@ Once the index is built, you can ask questions like:
### Time Machine for the Web
```bash
-python examples/google_history_reader_leann.py
+python -m apps.browser
# "Tell me my browser history about machine learning system stuff?"
```
**38K browser entries → 6MB.** Your browser history becomes your personal search engine.
@@ -207,16 +206,16 @@ python examples/google_history_reader_leann.py
```bash
# Use default Chrome profile (auto-finds all profiles)
-python examples/google_history_reader_leann.py
+python -m apps.browser
# Run with custom index directory
-python examples/google_history_reader_leann.py --index-dir "./my_chrome_index"
+python -m apps.browser --index-dir "./my_chrome_index"
# Limit number of history entries processed (useful for testing)
-python examples/google_history_reader_leann.py --max-entries 500
+python -m apps.browser --max-entries 500
# Run a single query
-python examples/google_history_reader_leann.py --query "What websites did I visit about machine learning?"
+python -m apps.browser --query "What websites did I visit about machine learning?"
```
@@ -252,7 +251,7 @@ Once the index is built, you can ask questions like:
### WeChat Detective
```bash
-python examples/wechat_history_reader_leann.py
+python -m apps.wechat
# "Show me all group chats about weekend plans"
```
**400K messages → 64MB.** Search years of chat history in any language.
@@ -274,19 +273,19 @@ sudo packages/wechat-exporter/wechattweak-cli install
```bash
# Use default settings (recommended for first run)
-python examples/wechat_history_reader_leann.py
+python -m apps.wechat
# Run with a custom export directory; on the first run, LEANN will export all chat history automatically for you
-python examples/wechat_history_reader_leann.py --export-dir "./my_wechat_exports"
+python -m apps.wechat --export-dir "./my_wechat_exports"
# Run with custom index directory
-python examples/wechat_history_reader_leann.py --index-dir "./my_wechat_index"
+python -m apps.wechat --index-dir "./my_wechat_index"
# Limit number of chat entries processed (useful for testing)
-python examples/wechat_history_reader_leann.py --max-entries 1000
+python -m apps.wechat --max-entries 1000
# Run a single query
-python examples/wechat_history_reader_leann.py --query "Show me conversations about travel plans"
+python -m apps.wechat --query "Show me conversations about travel plans"
```
@@ -388,7 +387,7 @@ Options:
Run the comparison yourself:
```bash
-python examples/compare_faiss_vs_leann.py
+python -m apps.benchmarks
```
| System | Storage |
@@ -430,8 +429,8 @@ Same dataset, same hardware, same embedding model. LEANN just works better.
```bash
uv pip install -e ".[dev]" # Install dev dependencies
-python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR dataset
-python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index # Wikipedia
+python -m apps.evaluation data/indices/dpr/dpr_diskann # DPR dataset
+python -m apps.evaluation data/indices/rpj_wiki/rpj_wiki.index # Wikipedia
```
The evaluation script downloads data automatically on first run. The last three results were tested with partial personal data, and you can reproduce them with your own data!
diff --git a/apps/__init__.py b/apps/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/benchmarks/__init__.py b/apps/benchmarks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/benchmarks/__main__.py b/apps/benchmarks/__main__.py
new file mode 100644
index 0000000..f901255
--- /dev/null
+++ b/apps/benchmarks/__main__.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+"""
+Memory comparison between Faiss HNSW and LEANN HNSW backend
+"""
+
+import logging
+import os
+import sys
+import time
+import psutil
+import gc
+import subprocess
+from pathlib import Path
+from llama_index.core.node_parser import SentenceSplitter
+
+# Setup logging
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def get_memory_usage():
+ """Get current memory usage in MB"""
+ process = psutil.Process()
+ return process.memory_info().rss / 1024 / 1024
+
+
+def print_memory_stats(stage: str, start_mem: float):
+ """Print memory statistics"""
+ current_mem = get_memory_usage()
+ diff = current_mem - start_mem
+ print(f"[{stage}] Memory: {current_mem:.1f} MB (+{diff:.1f} MB)")
+ return current_mem
+
+
+class MemoryTracker:
+ def __init__(self, name: str):
+ self.name = name
+ self.start_mem = get_memory_usage()
+ self.stages = []
+
+ def checkpoint(self, stage: str):
+ current_mem = print_memory_stats(f"{self.name} - {stage}", self.start_mem)
+ self.stages.append((stage, current_mem))
+ return current_mem
+
+ def summary(self):
+ print(f"\n=== {self.name} Memory Summary ===")
+ for stage, mem in self.stages:
+ print(f"{stage}: {mem:.1f} MB")
+ peak_mem = max(mem for _, mem in self.stages)
+ print(f"Peak Memory: {peak_mem:.1f} MB")
+ print(f"Total Memory Increase: {peak_mem - self.start_mem:.1f} MB")
+ return peak_mem
+
+
+def test_faiss_hnsw():
+ """Test Faiss HNSW Vector Store in subprocess"""
+ print("\n" + "=" * 50)
+ print("TESTING FAISS HNSW VECTOR STORE")
+ print("=" * 50)
+
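+    # Faiss runs in a child process so its allocations don't inflate this process's
+    # RSS, which is later used to measure LEANN's search-time memory.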
+ try:
+ # Get the directory of this script
+ script_dir = Path(__file__).parent
+ faiss_script = script_dir / "faiss_only.py"
+ result = subprocess.run(
+ [sys.executable, str(faiss_script)],
+ capture_output=True,
+ text=True,
+ timeout=300,
+ )
+
+ print(result.stdout)
+ if result.stderr:
+ print("Stderr:", result.stderr)
+
+ if result.returncode != 0:
+ return {
+ "peak_memory": float("inf"),
+ "error": f"Process failed with code {result.returncode}",
+ }
+
+ # Parse peak memory from output
+ lines = result.stdout.split("\n")
+ peak_memory = 0.0
+
+ for line in lines:
+ if "Peak Memory:" in line:
+ peak_memory = float(
+ line.split("Peak Memory:")[1].split("MB")[0].strip()
+ )
+
+ return {"peak_memory": peak_memory}
+
+ except Exception as e:
+ return {
+ "peak_memory": float("inf"),
+ "error": str(e),
+ }
+
+
+def test_leann_hnsw():
+ """Test LEANN HNSW Search Memory (load existing index)"""
+ print("\n" + "=" * 50)
+ print("TESTING LEANN HNSW SEARCH MEMORY")
+ print("=" * 50)
+
+ tracker = MemoryTracker("LEANN HNSW Search")
+
+ # Import and setup
+ tracker.checkpoint("Initial")
+
+    from leann.api import LeannBuilder, LeannSearcher
+    from llama_index.core import SimpleDirectoryReader
+
+    tracker.checkpoint("After imports")
+
+    # Load and parse documents (the sample data lives in apps/documents/data;
+    # resolve the path relative to this file so the script works from any cwd)
+    data_dir = Path(__file__).parent.parent / "documents" / "data"
+    documents = SimpleDirectoryReader(
+        str(data_dir),
+ recursive=True,
+ encoding="utf-8",
+ required_exts=[".pdf", ".txt", ".md"],
+ ).load_data()
+
+ tracker.checkpoint("After document loading")
+
+ # Parse into chunks
+ node_parser = SentenceSplitter(
+ chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
+ )
+
+ all_texts = []
+ for doc in documents:
+ nodes = node_parser.get_nodes_from_documents([doc])
+ for node in nodes:
+ all_texts.append(node.get_content())
+
+ tracker.checkpoint("After text chunking")
+
+ # Build LEANN index
+ INDEX_DIR = Path("./test_leann_comparison")
+ INDEX_PATH = str(INDEX_DIR / "comparison.leann")
+
+ # Check if index already exists
+ if os.path.exists(INDEX_PATH + ".meta.json"):
+ print("Loading existing LEANN HNSW index...")
+ tracker.checkpoint("After loading existing index")
+ else:
+ print("Building new LEANN HNSW index...")
+ # Clean up previous index
+ import shutil
+
+ if INDEX_DIR.exists():
+ shutil.rmtree(INDEX_DIR)
+
+ builder = LeannBuilder(
+ backend_name="hnsw",
+ embedding_model="facebook/contriever",
+ graph_degree=32,
+ complexity=64,
+ is_compact=True,
+ is_recompute=True,
+ num_threads=1,
+ )
+
+ tracker.checkpoint("After builder setup")
+
+ print("Building LEANN HNSW index...")
+
+ for chunk_text in all_texts:
+ builder.add_text(chunk_text)
+
+ builder.build_index(INDEX_PATH)
+ del builder
+ gc.collect()
+
+ tracker.checkpoint("After index building")
+
+ # Find existing LEANN index
+ index_paths = [
+ "./test_leann_comparison/comparison.leann",
+ ]
+ index_path = None
+ for path in index_paths:
+ if os.path.exists(path + ".meta.json"):
+ index_path = path
+ break
+
+ if not index_path:
+ print("❌ LEANN index not found. Please build it first")
+ return {"peak_memory": float("inf"), "error": "Index not found"}
+
+ # Measure runtime memory overhead
+ print("\nMeasuring runtime memory overhead...")
+ runtime_start_mem = get_memory_usage()
+ print(f"Before load memory: {runtime_start_mem:.1f} MB")
+ tracker.checkpoint("Before load memory")
+
+ # Load searcher
+ searcher = LeannSearcher(index_path)
+ tracker.checkpoint("After searcher loading")
+
+ print("Running search queries...")
+ queries = [
+ "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
+ "What is LEANN and how does it work?",
+ "华为诺亚方舟实验室的主要研究内容",
+ ]
+
+ for i, query in enumerate(queries):
+ start_time = time.time()
+ # Use same parameters as Faiss: top_k=20, ef=120 (complexity parameter)
+ _ = searcher.search(query, top_k=20, ef=120)
+ query_time = time.time() - start_time
+ print(f"Query {i + 1} time: {query_time:.3f}s")
+ tracker.checkpoint(f"After query {i + 1}")
+
+ runtime_end_mem = get_memory_usage()
+ runtime_overhead = runtime_end_mem - runtime_start_mem
+
+ peak_memory = tracker.summary()
+ print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
+
+ # Get storage size before cleanup
+ storage_size = 0
+ INDEX_DIR = Path(index_path).parent
+ if INDEX_DIR.exists():
+ total_size = 0
+ for dirpath, _, filenames in os.walk(str(INDEX_DIR)):
+ for filename in filenames:
+ # Only count actual index files, skip text data and backups
+ if filename.endswith((".old", ".tmp", ".bak", ".jsonl", ".json")):
+ continue
+ # Count .index, .idx, .map files (actual index structures)
+ if filename.endswith((".index", ".idx", ".map")):
+ filepath = os.path.join(dirpath, filename)
+ total_size += os.path.getsize(filepath)
+ storage_size = total_size / (1024 * 1024) # Convert to MB
+
+ # Clean up
+ del searcher
+ gc.collect()
+
+ return {
+ "peak_memory": peak_memory,
+ "storage_size": storage_size,
+ }
+
+
+def main():
+ """Run comparison tests"""
+ print("Storage + Search Memory Comparison: Faiss HNSW vs LEANN HNSW")
+ print("=" * 60)
+
+ # Test Faiss HNSW
+ faiss_results = test_faiss_hnsw()
+
+ # Force garbage collection
+ gc.collect()
+ time.sleep(2)
+
+ # Test LEANN HNSW
+ leann_results = test_leann_hnsw()
+
+ # Final comparison
+ print("\n" + "=" * 60)
+ print("STORAGE + SEARCH MEMORY COMPARISON")
+ print("=" * 60)
+
+ # Get storage sizes
+ faiss_storage_size = 0
+ leann_storage_size = leann_results.get("storage_size", 0)
+
+ # Get Faiss storage size using Python
+ if os.path.exists("./storage_faiss"):
+ total_size = 0
+ for dirpath, _, filenames in os.walk("./storage_faiss"):
+ for filename in filenames:
+ filepath = os.path.join(dirpath, filename)
+ total_size += os.path.getsize(filepath)
+ faiss_storage_size = total_size / (1024 * 1024) # Convert to MB
+
+ print("Faiss HNSW:")
+ if "error" in faiss_results:
+ print(f" ❌ Failed: {faiss_results['error']}")
+ else:
+ print(f" Search Memory: {faiss_results['peak_memory']:.1f} MB")
+ print(f" Storage Size: {faiss_storage_size:.1f} MB")
+
+ print("\nLEANN HNSW:")
+ if "error" in leann_results:
+ print(f" ❌ Failed: {leann_results['error']}")
+ else:
+ print(f" Search Memory: {leann_results['peak_memory']:.1f} MB")
+ print(f" Storage Size: {leann_storage_size:.1f} MB")
+
+ # Calculate improvements only if both tests succeeded
+ if "error" not in faiss_results and "error" not in leann_results:
+ memory_ratio = faiss_results["peak_memory"] / leann_results["peak_memory"]
+
+ print("\nLEANN vs Faiss Performance:")
+ memory_saving = faiss_results["peak_memory"] - leann_results["peak_memory"]
+ print(
+ f" Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)"
+ )
+
+ # Storage comparison
+ if leann_storage_size > faiss_storage_size:
+ storage_ratio = leann_storage_size / faiss_storage_size
+ print(
+ f" Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)"
+ )
+ elif faiss_storage_size > leann_storage_size:
+ storage_ratio = faiss_storage_size / leann_storage_size
+ print(
+ f" Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)"
+ )
+ else:
+ print(" Storage Size: similar")
+ else:
+ if "error" not in leann_results:
+ print("\n✅ LEANN HNSW completed successfully!")
+ print(f"📊 Search Memory: {leann_results['peak_memory']:.1f} MB")
+ print(f"📊 Storage Size: {leann_storage_size:.1f} MB")
+ if "error" not in faiss_results:
+ print("\n✅ Faiss HNSW completed successfully!")
+ print(f"📊 Search Memory: {faiss_results['peak_memory']:.1f} MB")
+ print(f"📊 Storage Size: {faiss_storage_size:.1f} MB")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/benchmarks/faiss_only.py b/apps/benchmarks/faiss_only.py
new file mode 100644
index 0000000..2e6c2f8
--- /dev/null
+++ b/apps/benchmarks/faiss_only.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Test only Faiss HNSW"""
+
+import sys
+import time
+import psutil
+import gc
+import os
+
+
+def get_memory_usage():
+ process = psutil.Process()
+ return process.memory_info().rss / 1024 / 1024
+
+
+class MemoryTracker:
+ def __init__(self, name: str):
+ self.name = name
+ self.start_mem = get_memory_usage()
+ self.stages = []
+
+ def checkpoint(self, stage: str):
+ current_mem = get_memory_usage()
+ diff = current_mem - self.start_mem
+ print(f"[{self.name} - {stage}] Memory: {current_mem:.1f} MB (+{diff:.1f} MB)")
+ self.stages.append((stage, current_mem))
+ return current_mem
+
+ def summary(self):
+ peak_mem = max(mem for _, mem in self.stages)
+ print(f"Peak Memory: {peak_mem:.1f} MB")
+ return peak_mem
+
+
+def main():
+ try:
+ import faiss
+ except ImportError:
+ print("Faiss is not installed.")
+ print("Please install it with `uv pip install faiss-cpu`")
+ sys.exit(1)
+
+    from llama_index.core import (
+        SimpleDirectoryReader,
+        VectorStoreIndex,
+        StorageContext,
+        Settings,
+    )
+ from llama_index.core.node_parser import SentenceSplitter
+ from llama_index.vector_stores.faiss import FaissVectorStore
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+ tracker = MemoryTracker("Faiss HNSW")
+ tracker.checkpoint("Initial")
+
+ embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
+ Settings.embed_model = embed_model
+ tracker.checkpoint("After embedding model setup")
+
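+    # facebook/contriever is a BERT-base encoder, so its embeddings are 768-dimensional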
+ d = 768
+ faiss_index = faiss.IndexHNSWFlat(d, 32)
+ faiss_index.hnsw.efConstruction = 64
+ tracker.checkpoint("After Faiss index creation")
+
+    # The sample data lives in apps/documents/data; resolve it relative to this file
+    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "documents", "data")
+    documents = SimpleDirectoryReader(
+        data_dir,
+ recursive=True,
+ encoding="utf-8",
+ required_exts=[".pdf", ".txt", ".md"],
+ ).load_data()
+ tracker.checkpoint("After document loading")
+
+ # Parse into chunks using the same splitter as LEANN
+ node_parser = SentenceSplitter(
+ chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
+ )
+
+ tracker.checkpoint("After text splitter setup")
+
+ # Check if index already exists and try to load it
+ index_loaded = False
+ if os.path.exists("./storage_faiss"):
+ print("Loading existing Faiss HNSW index...")
+ try:
+ # Use the correct Faiss loading pattern from the example
+ vector_store = FaissVectorStore.from_persist_dir("./storage_faiss")
+ storage_context = StorageContext.from_defaults(
+ vector_store=vector_store, persist_dir="./storage_faiss"
+ )
+ from llama_index.core import load_index_from_storage
+ index = load_index_from_storage(storage_context=storage_context)
+ print(f"Index loaded from ./storage_faiss")
+ tracker.checkpoint("After loading existing index")
+ index_loaded = True
+ except Exception as e:
+ print(f"Failed to load existing index: {e}")
+ print("Cleaning up corrupted index and building new one...")
+ # Clean up corrupted index
+ import shutil
+ if os.path.exists("./storage_faiss"):
+ shutil.rmtree("./storage_faiss")
+
+ if not index_loaded:
+ print("Building new Faiss HNSW index...")
+
+ # Use the correct Faiss building pattern from the example
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
+ storage_context = StorageContext.from_defaults(vector_store=vector_store)
+ index = VectorStoreIndex.from_documents(
+ documents,
+ storage_context=storage_context,
+ transformations=[node_parser]
+ )
+ tracker.checkpoint("After index building")
+
+ # Save index to disk using the correct pattern
+ index.storage_context.persist(persist_dir="./storage_faiss")
+ tracker.checkpoint("After index saving")
+
+ # Measure runtime memory overhead
+ print("\nMeasuring runtime memory overhead...")
+ runtime_start_mem = get_memory_usage()
+ print(f"Before load memory: {runtime_start_mem:.1f} MB")
+ tracker.checkpoint("Before load memory")
+
+ query_engine = index.as_query_engine(similarity_top_k=20)
+ queries = [
+ "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
+ "What is LEANN and how does it work?",
+ "华为诺亚方舟实验室的主要研究内容",
+ ]
+
+ for i, query in enumerate(queries):
+ start_time = time.time()
+ _ = query_engine.query(query)
+ query_time = time.time() - start_time
+ print(f"Query {i + 1} time: {query_time:.3f}s")
+ tracker.checkpoint(f"After query {i + 1}")
+
+ runtime_end_mem = get_memory_usage()
+ runtime_overhead = runtime_end_mem - runtime_start_mem
+
+ peak_memory = tracker.summary()
+ print(f"Peak Memory: {peak_memory:.1f} MB")
+ print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/browser/__init__.py b/apps/browser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/browser/__main__.py b/apps/browser/__main__.py
new file mode 100644
index 0000000..8f9906c
--- /dev/null
+++ b/apps/browser/__main__.py
@@ -0,0 +1,201 @@
+import os
+import asyncio
+import argparse
+try:
+ import dotenv
+ dotenv.load_dotenv()
+except ModuleNotFoundError:
+ # python-dotenv is not installed; skip loading environment variables
+ dotenv = None
+from pathlib import Path
+from typing import List, Any
+from leann.api import LeannBuilder, LeannSearcher, LeannChat
+from llama_index.core.node_parser import SentenceSplitter
+
+# Default Chrome profile path
+DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
+
+def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1):
+ """
+ Create LEANN index from multiple Chrome profile data sources.
+
+ Args:
+ profile_dirs: List of Path objects pointing to Chrome profile directories
+ index_path: Path to save the LEANN index
+ max_count: Maximum number of history entries to process per profile
+ """
+ print("Creating LEANN index from multiple Chrome profile data sources...")
+
+ # Load documents using ChromeHistoryReader from local readers module
+ from .readers import ChromeHistoryReader
+ reader = ChromeHistoryReader()
+
+ INDEX_DIR = Path(index_path).parent
+
+ if not INDEX_DIR.exists():
+ print(f"--- Index directory not found, building new index ---")
+ all_documents = []
+ total_processed = 0
+
+ # Process each Chrome profile directory
+ for i, profile_dir in enumerate(profile_dirs):
+ print(f"\nProcessing Chrome profile {i+1}/{len(profile_dirs)}: {profile_dir}")
+
+ try:
+ documents = reader.load_data(
+ chrome_profile_path=str(profile_dir),
+ max_count=max_count
+ )
+ if documents:
+ print(f"Loaded {len(documents)} history documents from {profile_dir}")
+ all_documents.extend(documents)
+ total_processed += len(documents)
+
+ # Check if we've reached the max count
+ if max_count > 0 and total_processed >= max_count:
+ print(f"Reached max count of {max_count} documents")
+ break
+ else:
+ print(f"No documents loaded from {profile_dir}")
+ except Exception as e:
+ print(f"Error processing {profile_dir}: {e}")
+ continue
+
+ if not all_documents:
+ print("No documents loaded from any source. Exiting.")
+ return None
+
+ print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles")
+
+ # Create text splitter with 256 chunk size
+ text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
+
+ # Convert Documents to text strings and chunk them
+ all_texts = []
+ for doc in all_documents:
+ # Split the document into chunks
+ nodes = text_splitter.get_nodes_from_documents([doc])
+ for node in nodes:
+ all_texts.append(node.get_content())
+
+ print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
+
+        # Create LEANN index directory
+        INDEX_DIR.mkdir(exist_ok=True)
+
+        print("\n[PHASE 1] Building Leann index...")
+
+ # Use HNSW backend for better macOS compatibility
+ builder = LeannBuilder(
+ backend_name="hnsw",
+ embedding_model="facebook/contriever",
+ graph_degree=32,
+ complexity=64,
+ is_compact=True,
+ is_recompute=True,
+ num_threads=1 # Force single-threaded mode
+ )
+
+ print(f"Adding {len(all_texts)} history chunks to index...")
+ for chunk_text in all_texts:
+ builder.add_text(chunk_text)
+
+ builder.build_index(index_path)
+ print(f"\nLEANN index built at {index_path}!")
+ else:
+ print(f"--- Using existing index at {INDEX_DIR} ---")
+
+ return index_path
+
+async def query_leann_index(index_path: str, query: str):
+ """
+ Query the LEANN index.
+
+ Args:
+ index_path: Path to the LEANN index
+ query: The query string
+ """
+ print(f"\n[PHASE 2] Starting Leann chat session...")
+ chat = LeannChat(index_path=index_path)
+
+ print(f"You: {query}")
+ chat_response = chat.ask(
+ query,
+ top_k=10,
+ recompute_beighbor_embeddings=True,
+ complexity=32,
+ beam_width=1,
+ llm_config={
+ "type": "openai",
+ "model": "gpt-4o",
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ },
+ llm_kwargs={
+ "temperature": 0.0,
+ "max_tokens": 1000
+ }
+ )
+ print(f"Leann: {chat_response}")
+
+async def main():
+ # Parse command line arguments
+ parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
+    parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
+                       help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}); you usually do not need to change this')
+ parser.add_argument('--index-dir', type=str, default="./chrome_history_index_leann_test",
+ help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
+ parser.add_argument('--max-entries', type=int, default=1000,
+ help='Maximum number of history entries to process (default: 1000)')
+ parser.add_argument('--query', type=str, default=None,
+ help='Single query to run (default: runs example queries)')
+    parser.add_argument('--auto-find-profiles', action=argparse.BooleanOptionalAction, default=True,
+                       help='Automatically find all Chrome profiles; pass --no-auto-find-profiles to use only --chrome-profile (default: True)')
+
+ args = parser.parse_args()
+
+ INDEX_DIR = Path(args.index_dir)
+ INDEX_PATH = str(INDEX_DIR / "chrome_history.leann")
+
+ print(f"Using Chrome profile: {args.chrome_profile}")
+ print(f"Index directory: {INDEX_DIR}")
+ print(f"Max entries: {args.max_entries}")
+
+ # Find Chrome profile directories
+ from .readers import ChromeHistoryReader
+
+ if args.auto_find_profiles:
+ profile_dirs = ChromeHistoryReader.find_chrome_profiles()
+ if not profile_dirs:
+ print("No Chrome profiles found automatically. Exiting.")
+ return
+ else:
+ # Use single specified profile
+ profile_path = Path(args.chrome_profile)
+ if not profile_path.exists():
+ print(f"Chrome profile not found: {profile_path}")
+ return
+ profile_dirs = [profile_path]
+
+ # Create or load the LEANN index from all sources
+ index_path = create_leann_index_from_multiple_chrome_profiles(profile_dirs, INDEX_PATH, args.max_entries)
+
+ if index_path:
+ if args.query:
+ # Run single query
+ await query_leann_index(index_path, args.query)
+ else:
+ # Example queries
+ queries = [
+ "What websites did I visit about machine learning?",
+ "Find my search history about programming"
+ ]
+
+ for query in queries:
+ print("\n" + "="*60)
+ await query_leann_index(index_path, query)
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/apps/browser/readers.py b/apps/browser/readers.py
new file mode 100644
index 0000000..0258258
--- /dev/null
+++ b/apps/browser/readers.py
@@ -0,0 +1,176 @@
+import sqlite3
+import os
+from pathlib import Path
+from typing import List, Any
+from llama_index.core import Document
+from llama_index.core.readers.base import BaseReader
+
+class ChromeHistoryReader(BaseReader):
+ """
+ Chrome browser history reader that extracts browsing data from SQLite database.
+
+ Reads Chrome history from the default Chrome profile location and creates documents
+ with embedded metadata similar to the email reader structure.
+ """
+
+ def __init__(self) -> None:
+ """Initialize."""
+ pass
+
+ def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
+ """
+ Load Chrome history data from the default Chrome profile location.
+
+ Args:
+ input_dir: Not used for Chrome history (kept for compatibility)
+ **load_kwargs:
+ max_count (int): Maximum amount of history entries to read.
+ chrome_profile_path (str): Custom path to Chrome profile directory.
+ """
+ docs: List[Document] = []
+ max_count = load_kwargs.get('max_count', 1000)
+ chrome_profile_path = load_kwargs.get('chrome_profile_path', None)
+
+ # Default Chrome profile path on macOS
+ if chrome_profile_path is None:
+ chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
+
+ history_db_path = os.path.join(chrome_profile_path, "History")
+
+ if not os.path.exists(history_db_path):
+ print(f"Chrome history database not found at: {history_db_path}")
+ return docs
+
+ try:
+ # Connect to the Chrome history database
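+            # Note: Chrome keeps this SQLite file locked while the browser is running;
+            # if reads fail with "database is locked", close Chrome or copy the History file first.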
+ print(f"Connecting to database: {history_db_path}")
+ conn = sqlite3.connect(history_db_path)
+ cursor = conn.cursor()
+
+ # Query to get browsing history with metadata (removed created_time column)
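+            # Chrome stores last_visit_time as microseconds since 1601-01-01 (the Windows/WebKit
+            # epoch); dividing by 1e6 and subtracting 11644473600 seconds yields a Unix timestamp.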
+ query = """
+ SELECT
+ datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
+ url,
+ title,
+ visit_count,
+ typed_count,
+ hidden
+ FROM urls
+ ORDER BY last_visit_time DESC
+ """
+
+ print(f"Executing query on database: {history_db_path}")
+ cursor.execute(query)
+ rows = cursor.fetchall()
+ print(f"Query returned {len(rows)} rows")
+
+ count = 0
+ for row in rows:
+ if count >= max_count and max_count > 0:
+ break
+
+ last_visit, url, title, visit_count, typed_count, hidden = row
+
+ # Create document content with metadata embedded in text
+ doc_content = f"""
+[BROWSING HISTORY METADATA]
+URL: {url}
+Title: {title}
+Last Visit: {last_visit}
+Visit Count: {visit_count}
+Typed Count: {typed_count}
+Hidden: {hidden}
+[END METADATA]
+
+Title: {title}
+URL: {url}
+Last visited: {last_visit}
+"""
+
+ # Create document with embedded metadata
+ doc = Document(text=doc_content, metadata={})
+ docs.append(doc)
+ count += 1
+
+ conn.close()
+ print(f"Loaded {len(docs)} Chrome history documents")
+
+ except Exception as e:
+ print(f"Error reading Chrome history: {e}")
+ return docs
+
+ return docs
+
+ @staticmethod
+ def find_chrome_profiles() -> List[Path]:
+ """
+ Find all Chrome profile directories.
+
+ Returns:
+ List of Path objects pointing to Chrome profile directories
+ """
+ chrome_base_path = Path(os.path.expanduser("~/Library/Application Support/Google/Chrome"))
+ profile_dirs = []
+
+ if not chrome_base_path.exists():
+ print(f"Chrome directory not found at: {chrome_base_path}")
+ return profile_dirs
+
+ # Find all profile directories
+ for profile_dir in chrome_base_path.iterdir():
+ if profile_dir.is_dir() and profile_dir.name != "System Profile":
+ history_path = profile_dir / "History"
+ if history_path.exists():
+ profile_dirs.append(profile_dir)
+ print(f"Found Chrome profile: {profile_dir}")
+
+ print(f"Found {len(profile_dirs)} Chrome profiles")
+ return profile_dirs
+
+ @staticmethod
+ def export_history_to_file(output_file: str = "chrome_history_export.txt", max_count: int = 1000):
+ """
+ Export Chrome history to a text file using the same SQL query format.
+
+ Args:
+ output_file: Path to the output file
+ max_count: Maximum number of entries to export
+ """
+ chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
+ history_db_path = os.path.join(chrome_profile_path, "History")
+
+ if not os.path.exists(history_db_path):
+ print(f"Chrome history database not found at: {history_db_path}")
+ return
+
+ try:
+ conn = sqlite3.connect(history_db_path)
+ cursor = conn.cursor()
+
+ query = """
+ SELECT
+ datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
+ url,
+ title,
+ visit_count,
+ typed_count,
+ hidden
+ FROM urls
+ ORDER BY last_visit_time DESC
+ LIMIT ?
+ """
+
+ cursor.execute(query, (max_count,))
+ rows = cursor.fetchall()
+
+ with open(output_file, 'w', encoding='utf-8') as f:
+ for row in rows:
+ last_visit, url, title, visit_count, typed_count, hidden = row
+ f.write(f"{last_visit}\t{url}\t{title}\t{visit_count}\t{typed_count}\t{hidden}\n")
+
+ conn.close()
+ print(f"Exported {len(rows)} history entries to {output_file}")
+
+ except Exception as e:
+ print(f"Error exporting Chrome history: {e}")
\ No newline at end of file
diff --git a/apps/documents/__init__.py b/apps/documents/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/documents/__main__.py b/apps/documents/__main__.py
new file mode 100644
index 0000000..bd450ef
--- /dev/null
+++ b/apps/documents/__main__.py
@@ -0,0 +1,113 @@
+import argparse
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core.node_parser import SentenceSplitter
+import asyncio
+import dotenv
+from leann.api import LeannBuilder, LeannChat
+from pathlib import Path
+import os
+
+dotenv.load_dotenv()
+
+
+async def main(args):
+ INDEX_DIR = Path(args.index_dir)
+ INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
+
+ if not INDEX_DIR.exists():
+ node_parser = SentenceSplitter(
+ chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
+ )
+
+ print("Loading documents...")
+ # Get the data directory relative to this module
+ current_dir = Path(__file__).parent
+ data_dir = current_dir / "data"
+
+ documents = SimpleDirectoryReader(
+ str(data_dir),
+ recursive=True,
+ encoding="utf-8",
+ required_exts=[".pdf", ".txt", ".md"],
+ ).load_data(show_progress=True)
+ print("Documents loaded.")
+ all_texts = []
+ for doc in documents:
+ nodes = node_parser.get_nodes_from_documents([doc])
+ for node in nodes:
+ all_texts.append(node.get_content())
+
+ print("--- Index directory not found, building new index ---")
+
+ print("\n[PHASE 1] Building Leann index...")
+
+ # Use HNSW backend for better macOS compatibility
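+    # graph_degree and complexity are HNSW build parameters; is_recompute=True enables
+    # LEANN's recompute-at-query-time mode, which is what keeps the on-disk index small.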
+ builder = LeannBuilder(
+ backend_name="hnsw",
+ embedding_model="facebook/contriever",
+ graph_degree=32,
+ complexity=64,
+ is_compact=True,
+ is_recompute=True,
+ num_threads=1, # Force single-threaded mode
+ )
+
+ print(f"Loaded {len(all_texts)} text chunks from documents.")
+ for chunk_text in all_texts:
+ builder.add_text(chunk_text)
+
+ builder.build_index(INDEX_PATH)
+ print(f"\nLeann index built at {INDEX_PATH}!")
+ else:
+ print(f"--- Using existing index at {INDEX_DIR} ---")
+
+ print(f"\n[PHASE 2] Starting Leann chat session...")
+
+ # llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
+ llm_config = {"type": "ollama", "model": "qwen3:8b"}
+
+ chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
+
+ query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
+
+ # query = (
+ # "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
+ # )
+
+ print(f"You: {query}")
+ chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32)
+ print(f"Leann: {chat_response}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Run Leann Chat with various LLM backends."
+ )
+ parser.add_argument(
+ "--llm",
+ type=str,
+ default="hf",
+ choices=["simulated", "ollama", "hf", "openai"],
+ help="The LLM backend to use.",
+ )
+ parser.add_argument(
+ "--model",
+ type=str,
+ default="Qwen/Qwen3-0.6B",
+ help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
+ )
+ parser.add_argument(
+ "--host",
+ type=str,
+ default="http://localhost:11434",
+ help="The host for the Ollama API.",
+ )
+ parser.add_argument(
+ "--index-dir",
+ type=str,
+ default="./test_doc_files",
+ help="Directory where the Leann index will be stored.",
+ )
+ args = parser.parse_args()
+
+ asyncio.run(main(args))
\ No newline at end of file
diff --git a/examples/data/README.md b/apps/documents/data/README.md
similarity index 100%
rename from examples/data/README.md
rename to apps/documents/data/README.md
diff --git a/apps/documents/data/pangu.md b/apps/documents/data/pangu.md
new file mode 100644
index 0000000..0dd35da
--- /dev/null
+++ b/apps/documents/data/pangu.md
@@ -0,0 +1,82 @@
+# 盘古之殇:华为诺亚盘古大模型研发历程的心酸与黑暗
+
+各位好,
+
+我是一名盘古大模型团队,华为诺亚方舟实验室的员工。
+
+首先为自证身份,列举一些细节:
+
+1. 现诺亚主任,前算法应用部部长,后改名为小模型实验室的主任王云鹤。前诺亚主任:姚骏(大家称姚老师)。几个实验室主任:唐睿明(明哥,明队,已离职),尚利峰,张维(维哥),郝建业(郝老师),刘武龙(称呼为武龙所)等。其他骨干成员和专家陆续有很多人离职。
+2. 我们隶属于“四野”这个组织。四野下属有许多纵队,基础语言大模型是四纵。王云鹤的小模型是十六纵队。我们参加过苏州的集结,有各种月份的时间节点。在苏州攻关会颁发任务令,需要在节点前达成目标。苏州集结会把各地的人员都集中在苏州研究所,平常住宾馆,比如在甪直的酒店,与家人孩子天各一方。
+3. 在苏州集结的时候周六默认上班,非常辛苦,不过周六有下午茶,有一次还有小龙虾。在苏州研究所的工位搬迁过一次,从一栋楼换到了另一栋。苏州研究所楼栋都是欧式装修,门口有大坡,里面景色很不错。去苏州集结一般至少要去一周,甚至更久,多的人甚至一两个月都回不了家。
+4. 诺亚曾经传说是研究型的,但是来了之后因为在四野做大模型项目,项目成员完全变成了交付型的,且充满了例会,评审,汇报。很多时候做实验都要申请。团队需要对接终端小艺,华为云,ICT等诸多业务线,交付压力不小。
+5. 诺亚研发的盘古模型早期内部代号叫做“盘古智子”,一开始只有内部需要申请试用的网页版,到后续迫于压力在welink上接入和公测开放。
+
+这些天发生关于质疑盘古大模型抄袭千问的事情闹的沸沸扬扬。作为一个盘古团队的成员,我最近夜夜辗转反侧,难以入眠。盘古的品牌受到如此大的影响,一方面,我自私的为我的职业发展担忧,也为自己过去的努力工作感到不值。另一方面,由于有人开始揭露这些事情我内心又感到大快人心。在多少个日日夜夜,我们对内部某些人一次次靠着造假而又获得了无数利益的行为咬牙切齿而又无能为力。这种压抑和羞辱也逐渐消磨了我对华为的感情,让我在这里的时日逐渐浑浑噩噩,迷茫无措,时常怀疑自己的人生和自我价值。
+
+我承认我是一个懦弱的人,作为一个小小的打工人,我不仅不敢和王云鹤等内部手眼通天的人做对,更不敢和华为这样的庞然大物做对。我很怕失去我的工作,毕竟我也有家人和孩子,所以我打心眼里很佩服揭露者。但是,看到内部还在试图洗地掩盖事实,蒙蔽公众的时候,我实在不能容忍了。我也希望勇敢一次,顺从自己本心。就算自损八百,我也希望能伤敌一千。我决定把我在这里的所见所闻(部分来自于同事口述)公布出来,关于盘古大模型的“传奇故事”:
+
+华为确实主要在昇腾卡上训练大模型(小模型实验室有不少英伟达的卡,他们之前也会用来训练,后面转移到昇腾)。曾经我被华为“打造世界第二选择”的决心而折服,我本身也曾经对华为有深厚的感情。我们陪着昇腾一步步摸爬滚打,从充满bug到现在能训出模型,付出了巨大的心血和代价。
+
+最初我们的算力非常有限,在910A上训练模型。那会只支持fp16,训练的稳定性远不如bf16。盘古的moe开始很早,23年就主要是训练38Bmoe模型和后续的71B dense模型。71B的dense模型通过扩增变成了第一代的135Bdense模型,后面主力模型也逐渐在910B上训练。
+
+71B和135B模型都有一个巨大的硬伤就是tokenizer。当时使用的tokenizer编码效率极低,每个单个的符号,数字,空格,乃至汉字都会占用一个token。可想而知这会非常浪费算力,且使得模型的效果很差。这时候小模型实验室正好有个自己训的词表。姚老师当时怀疑是不是模型的tokenizer不好(虽然事后来看,他的怀疑是无疑正确的),于是就决定,让71B和135B换tokenizer,因为小模型实验室曾经尝试过。团队缝合了两个tokenizer,开始了tokenizer的更换。71B模型的更换失败了,而135B因为采用了更精细的embedding初始化策略,续训了至少1T的数据后词表总算更换成功,但可想而知,效果并不会变好。
+
+于此同期,阿里和智谱等国内其他公司在GPU上训练,且已经摸索出了正确的方法,盘古和竞品的差距越来越大。内部一个230B从头训练的dense模型又因为各种原因训练失败,导致项目的状况几乎陷入绝境。面临几个节点的压力以及内部对盘古的强烈质疑时,团队的士气低迷到了极点。团队在算力极其有限的时候,做出了很多努力和挣扎。比如,团队偶然发现当时的38B moe并没有预期moe的效果。于是去掉了moe参数,还原为了13B的dense模型。由于38B的moe源自很早的pangu alpha 13B,架构相对落后,团队进行了一系列的操作,比如切换绝对位置编码到rope,去掉bias,切换为rmsnorm。同时鉴于tokenizer的一些失败和换词表的经验,这个模型的词表也更换为了王云鹤的小模型实验室7B模型所使用的词表。后面这个13B模型进行了扩增续训,变成了第二代38B dense模型(在几个月内这个模型都是主要的盘古中档位模型),曾经具有一定的竞争力。但是,由于更大的135B模型架构落后,且更换词表模型损伤巨大(后续分析发现当时更换的缝合词表有更严重的bug),续训后也与千问等当时国内领先模型存在很大差距。这时由于内部的质疑声和领导的压力也越来越大。团队的状态几乎陷入了绝境。
+
+在这种情况下,王云鹤和他的小模型实验室出手了。他们声称是从旧的135B参数继承改造而来,通过训练短短的几百B数据,各项指标平均提升了十个点左右。实际上,这就是他们套壳应用到大模型的第一次杰作。华为的外行领导内行,使得领导完全对于这种扯淡的事情没有概念,他们只会觉得肯定是有什么算法创新。经过内部的分析,他们实际上是使用Qwen 1.5 110B续训而来,通过加层,扩增ffn维度,添加盘古pi论文的一些机制得来,凑够了大概135B的参数。实际上,旧的135B有107层,而这个模型只有82层,各种配置也都不一样。新的来路不明的135B训练完很多参数的分布也和Qwen 110B几乎一模一样。连模型代码的类名当时都是Qwen,甚至懒得改名。后续这个模型就是所谓的135B V2。而这个模型当时也提供给了很多下游,甚至包括外部客户。
+
+这件事对于我们这些认真诚实做事的同事们带来了巨大的冲击,内部很多人其实都知道这件事,甚至包括终端和华为云。我们都戏称以后别叫盘古模型了,叫千古吧。当时团队成员就想向bcg举报了,毕竟这已经是重大的业务造假了。但是后面据说被领导拦了下来,因为更高级别的领导(比如姚老师,以及可能熊总和查老)其实后面也知道了,但是并不管,因为通过套壳拿出好的结果,对他们也是有利的。这件事使得当时团队几位最强的同事开始心灰意冷,离职跑路也逐渐成为挂在嘴边的事。
+
+此时,盘古似乎迎来了转机。由于前面所述的这些盘古模型基本都是续训和改造而来,当时诺亚完全没有掌握从头训练的技术,何况还是在昇腾的NPU上进行训练。在当时团队的核心成员的极力争取下,盘古开始了第三代模型的训练,付出了巨大的努力后,在数据架构和训练算法方面都与业界逐渐接轨,而这其中的艰辛和小模型实验室的人一点关系都没有。
+
+一开始团队成员毫无信心,只从一个13B的模型开始训练,但是后面发现效果还不错,于是这个模型后续再次进行了一次参数扩增,变成了第三代的38B,代号38B V3。想必很多产品线的兄弟都对这个模型很熟悉。当时这个模型的tokenizer是基于llama的词表进行扩展的(也是业界常见的做法)。而当时王云鹤的实验室做出来了另一个词表(也就是后续pangu系列的词表)。当时两个词表还被迫进行了一次赛马,最终没有明显的好坏结论。于是,领导当即决定,应该统一词表,使用王云鹤他们的。于是,在后续从头训练的135B V3(也就是对外的Pangu Ultra),便是采用了这个tokenizer。这也解释了很多使用我们模型的兄弟的疑惑,为什么当时同为V3代的两个不同档位的模型,会使用不同的tokenizer。
+
+
+我们打心眼里觉得,135B V3是我们四纵团队当时的骄傲。这是第一个真正意义上的,华为全栈自研,正经从头训练的千亿级别的模型,且效果与24年同期竞品可比的。写到这里我已经热泪盈眶,太不容易了。当时为了稳定训练,团队做了大量实验对比,并且多次在模型梯度出现异常的时候进行及时回退重启。这个模型真正做到了后面技术报告所说的训练全程没有一个loss spike。我们克服了不知道多少困难,我们做到了,我们愿用生命和荣誉保证这个模型训练的真实性。多少个凌晨,我们为了它的训练而不眠。在被内部心声骂的一文不值的时候,我们有多么不甘,有多少的委屈,我们挺住了。
+
+我们这帮人是真的在为打磨国产算力底座燃烧自己的青春啊……客居他乡,我们放弃了家庭,放弃了假期,放弃了健康,放弃了娱乐,抛头颅洒热血,其中的艰辛与困苦,寥寥数笔不足以概括其万一。在各种动员大会上,当时口号中喊出的盘古必胜,华为必胜,我们心里是真的深深被感动。
+
+然而,我们的所有辛苦的成果,经常被小模型实验室轻飘飘的拿走了。数据,直接要走。代码,直接要走,还要求我们配合适配到能一键运行。我们当时戏称小模型实验室为点鼠标实验室。我们付出辛苦,他们取得荣耀。果然应了那句话,你在负重前行是因为有人替你岁月静好。在这种情况下,越来越多的战友再也坚持不下去了,选择了离开。看到身边那些优秀的同事一个个离职,我的内心又感叹又难过。在这种作战一样的环境下,我们比起同事来说更像是战友。他们在技术上也有无数值得我学习的地方,堪称良师。看到他们去了诸如字节Seed,Deepseek,月之暗面,腾讯和快手等等很多出色的团队,我打心眼里为他们高兴和祝福,脱离了这个辛苦却肮脏的地方。我至今还对一位离职同事的话记忆犹新,ta说:“来这里是我技术生涯中的耻辱,在这里再呆每一天都是浪费生命”。话虽难听却让我无言以对。我担心我自己技术方面的积累不足,以及没法适应互联网公司高淘汰的环境,让我多次想离职的心始终没有迈出这一步。
+
+盘古除了dense模型,后续也启动了moe的探索。一开始训练的是一个224B的moe模型。而与之平行的,小模型实验室也开启了第二次主要的套壳行动(次要的插曲可能还包括一些别的模型,比如math模型),即这次流传甚广的pangu pro moe 72B。这个模型内部自称是从小模型实验室的7B扩增上来的(就算如此,这也与技术报告不符,何况是套壳qwen 2.5的14b续训)。还记得他们训了没几天,内部的评测就立刻追上了当时的38B V3。AI系统实验室很多兄弟因为需要适配模型,都知道他们的套壳行动,只是迫于各种原因,无法伸张正义。实际上,对于后续训了很久很久的这个模型,Honestagi能够分析出这个量级的相似性我已经很诧异了,因为这个模型为了续训洗参数,所付出的算力甚至早就足够从头训一个同档位的模型了。听同事说他们为了洗掉千问的水印,采取了不少办法,甚至包括故意训了脏数据。这也为学术界研究模型血缘提供了一个前所未有的特殊模范吧。以后新的血缘方法提出可以拿出来溜溜。
+
+24年底和25年初,在Deepseek v3和r1发布之后,由于其惊艳的技术水平,团队受到了巨大的冲击,也受到了更大的质疑。于是为了紧跟潮流,盘古模仿Deepseek的模型尺寸,开启了718B moe的训练。这个时候,小模型实验室再次出手了。他们选择了套壳Deepseekv3续训。他们通过冻住Deepseek加载的参数,进行训练。连任务加载ckpt的目录都是deepseekv3,改都不改,何其嚣张?与之相反,一些有真正技术信仰的同事,在从头训练另一个718B的moe。但其中出现了各种各样的问题。但是很显然,这个模型怎么可能比直接套壳的好呢?如果不是团队leader坚持,早就被叫停了。
+
+华为的流程管理之繁重,严重拖累了大模型的研发节奏,例如版本管理,模型血缘,各种流程化,各种可追溯。讽刺的是,小模型实验室的模型似乎从来不受这些流程的约束,想套壳就套壳,想续训就续训,算力源源不断的伸手拿走。这种强烈到近乎魔幻的对比,说明了当前流程管理的情况:只许州官放火,不许百姓点灯。何其可笑?何其可悲?何其可恶?何其可耻!
+
+HonestAGI的事情出来后,内部让大家不停的研讨分析,如何公关和“回应”。诚然,这个原文的分析也许不够有力,给了王云鹤与小模型实验室他们狡辩和颠倒黑白的机会。为此,这两天我内心感到作呕,时时怀疑自己的人生意义以及苍天无眼。我不奉陪了,我要离职了,同时我也在申请从盘古部分技术报告的作者名单中移除。曾经在这些技术报告上署名是我一生都无法抹除的污点。当时我没想到,他们竟然猖狂到敢开源。我没想到,他们敢如此愚弄世人,大肆宣发。当时,我也许是存了侥幸心理,没有拒绝署名。我相信很多扎实做事的战友,也只是被迫上了贼船,或者不知情。但这件事已经无法挽回,我希望我的余生能够坚持扎实做真正有意义的事,为我当时的软弱和不坚定赎罪。
+
+深夜写到这里,我已经泪流满面,泣不成声。还记得一些出色的同事离职时,我苦笑问他们要不要发个长长的心声惯例帖,揭露一下现状。对方说:不了,浪费时间,而且我也怕揭露出来你们过的更糟。我当时一下黯然神伤,因为曾经共同为了理想奋斗过的战友已经彻底对华为彻底灰心了。当时大家调侃,我们用着当年共产党的小米加步枪,组织却有着堪比当年国民党的作风。
+
+曾几何时,我为我们用着小米加步枪打败洋枪洋炮而自豪。
+
+现在,我累了,我想投降。
+
+其实时至今日,我还是真心希望华为能认真吸取教训,能做好盘古,把盘古做到世界一流,把昇腾变成英伟达的水平。内部的劣币驱逐良币,使得诺亚乃至华为在短时间内急剧流失了大量出色的大模型人才。相信他们也正在如Deepseek等各个团队闪耀着,施展着他们的抱负才华,为中美在AI的激烈竞赛中奉献力量。我时常感叹,华为不是没有人才,而是根本不知道怎么留住人才。如果给这些人合适的环境,合适的资源,更少的枷锁,更少的政治斗争,盘古何愁不成?
+
+最后:我以生命,人格和荣誉发誓,我写的以上所有内容均为真实(至少在我有限的认知范围内)。我没有那么高的技术水平以及机会去做详尽扎实的分析,也不敢直接用内部记录举证,怕因为信息安全抓到。但是我相信我很多曾经的战友,会为我作证。在华为内部的兄弟,包括我们曾经服务过的产品线兄弟们,相信本文的无数细节能和你们的印象对照,印证我的说法。你们可能也曾经被蒙骗,但这些残酷的真相不会被尘封。我们奋战过的痕迹,也不应该被扭曲和埋葬。
+
+写了这么多,某些人肯定想把我找出来,抹杀掉。公司搞不好也想让我噤声乃至追责。如果真的这样,我,乃至我的家人的人身乃至生命安全可能都会受到威胁。为了自我保护,我近期每天会跟大家报平安。
+
+如果我消失了,就当是我为了真理和理想,为了华为乃至中国能够更好地发展算力和AI而牺牲了吧,我愿埋葬于那片曾经奋斗过的地方。
+
+诺亚,再见
+
+2025年7月6日凌晨 写于深圳
+
+---
+
+各位好,
+
+感谢大家的关心与祝福。我目前暂时安全,但公司应该在进行排查与某些名单收集,后续情况未知。
+
+我补充一些细节,以免某些人继续颠倒黑白。
+
+关于135B V2,小模型实验室在迅速地完成套壳并拿完所有套壳带来的好处后(比如任务令表彰和及时激励),因为不想继续支撑下游应用和模型迭代,又把这个烫手山芋甩给了四纵。确实技高一筹,直接把四纵的兄弟们拉下水。同事提供过去一个老旧的模型,最终拿回了一个当时一个魔改的先进的千问。做大模型的人,自己做的模型就像自己孩子一样熟悉,不要把别人都当傻子。就像自家儿子出门一趟,回来个别人家孩子。
+
+盘古report的署名是不符合学术规范的。例如,135B V3有不少有技术贡献的人,因为作者名额数量限制,劳动成果没有得到应有的回报,团队内曾经有不小的意见。这个模型当时是大家智慧和汗水的结晶,甚至是团队当时的精神支柱,支撑着不少兄弟们继续留在诺亚。所谓的名额限制,以及挂名了一些毫无技术贡献的人(如一些小模型实验室的人),让兄弟们何其心寒。
+
+---
+
+暂时平安。另外,支持我勇于说出真相的战友们 https://github.com/HW-whistleblower/True-Story-of-Pangu/issues/317
diff --git a/apps/email/__init__.py b/apps/email/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/email/__main__.py b/apps/email/__main__.py
new file mode 100644
index 0000000..e001b45
--- /dev/null
+++ b/apps/email/__main__.py
@@ -0,0 +1,193 @@
+import os
+import sys
+import asyncio
+import dotenv
+import argparse
+from pathlib import Path
+from typing import List, Any
+
+from leann.api import LeannBuilder, LeannSearcher, LeannChat
+from llama_index.core.node_parser import SentenceSplitter
+
+dotenv.load_dotenv()
+
+# Auto-detect user's mail path
+def get_mail_path():
+ """Get the mail path for the current user"""
+ home_dir = os.path.expanduser("~")
+ return os.path.join(home_dir, "Library", "Mail")
+
+def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False, embedding_model: str = "facebook/contriever"):
+ """
+ Create LEANN index from multiple mail data sources.
+
+ Args:
+ messages_dirs: List of Path objects pointing to Messages directories
+ index_path: Path to save the LEANN index
+ max_count: Maximum number of emails to process per directory
+ include_html: Whether to include HTML content in email processing
+ """
+ print("Creating LEANN index from multiple mail data sources...")
+
+ # Load documents using EmlxReader from local readers module
+ from .readers import EmlxReader, find_all_messages_directories
+ reader = EmlxReader(include_html=include_html)
+ INDEX_DIR = Path(index_path).parent
+
+ if not INDEX_DIR.exists():
+ print(f"--- Index directory not found, building new index ---")
+ all_documents = []
+ total_processed = 0
+
+ # Process each Messages directory
+ for i, messages_dir in enumerate(messages_dirs):
+ print(f"\nProcessing Messages directory {i+1}/{len(messages_dirs)}: {messages_dir}")
+
+ try:
+ documents = reader.load_data(messages_dir)
+ if documents:
+ print(f"Loaded {len(documents)} email documents from {messages_dir}")
+ all_documents.extend(documents)
+ total_processed += len(documents)
+
+ # Check if we've reached the max count
+ if max_count > 0 and total_processed >= max_count:
+ print(f"Reached max count of {max_count} documents")
+ break
+ else:
+ print(f"No documents loaded from {messages_dir}")
+ except Exception as e:
+ print(f"Error processing {messages_dir}: {e}")
+ continue
+
+ if not all_documents:
+ print("No documents loaded from any source. Exiting.")
+ return None
+
+ print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories")
+
+ # Create text splitter with 256 chunk size
+ text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
+
+ # Convert Documents to text strings and chunk them
+ all_texts = []
+ for doc in all_documents:
+ # Split the document into chunks
+ nodes = text_splitter.get_nodes_from_documents([doc])
+ for node in nodes:
+ all_texts.append(node.get_content())
+
+ print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
+
+        # Create LEANN index directory
+        INDEX_DIR.mkdir(exist_ok=True)
+
+        print("\n[PHASE 1] Building Leann index...")
+
+ # Use HNSW backend for better macOS compatibility
+ builder = LeannBuilder(
+ backend_name="hnsw",
+ embedding_model=embedding_model,
+ graph_degree=32,
+ complexity=64,
+ is_compact=True,
+ is_recompute=True,
+ num_threads=1 # Force single-threaded mode
+ )
+
+ print(f"Adding {len(all_texts)} email chunks to index...")
+ for chunk_text in all_texts:
+ builder.add_text(chunk_text)
+
+ builder.build_index(index_path)
+ print(f"\nLEANN index built at {index_path}!")
+ else:
+ print(f"--- Using existing index at {INDEX_DIR} ---")
+
+ return index_path
+
+async def query_leann_index(index_path: str, query: str):
+ """
+ Query the LEANN index.
+
+ Args:
+ index_path: Path to the LEANN index
+ query: The query string
+ """
+ print(f"\n[PHASE 2] Starting Leann chat session...")
+ chat = LeannChat(index_path=index_path,
+ llm_config={"type": "openai", "model": "gpt-4o"})
+
+ print(f"You: {query}")
+ import time
+ start_time = time.time()
+ chat_response = chat.ask(
+ query,
+ top_k=10,
+ recompute_beighbor_embeddings=True,
+ complexity=12,
+        beam_width=1,
+    )
+ end_time = time.time()
+ print(f"Time taken: {end_time - start_time} seconds")
+ print(f"Leann: {chat_response}")
+
+async def main():
+ # Parse command line arguments
+ parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
+ parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
+ help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
+ parser.add_argument('--max-emails', type=int, default=1000,
+ help='Maximum number of emails to process (-1 means all)')
+ parser.add_argument('--query', type=str, default="Give me some funny advertisement about apple or other companies",
+ help='Single query to run (default: runs example queries)')
+ parser.add_argument('--include-html', action='store_true', default=False,
+ help='Include HTML content in email processing (default: False)')
+ parser.add_argument('--embedding-model', type=str, default="facebook/contriever",
+ help='Embedding model to use (default: facebook/contriever)')
+
+ args = parser.parse_args()
+
+ print(f"args: {args}")
+
+ # Automatically find all Messages directories under the current user's Mail directory
+ from .readers import find_all_messages_directories
+ mail_path = get_mail_path()
+ print(f"Searching for email data in: {mail_path}")
+ messages_dirs = find_all_messages_directories(mail_path)
+
+ print('len(messages_dirs): ', len(messages_dirs))
+
+ if not messages_dirs:
+ print("No Messages directories found. Exiting.")
+ return
+
+ INDEX_DIR = Path(args.index_dir)
+ INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
+ print(f"Index directory: {INDEX_DIR}")
+ print(f"Found {len(messages_dirs)} Messages directories.")
+
+ # Create or load the LEANN index from all sources
+ index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model)
+
+ if index_path:
+ if args.query:
+ # Run single query
+ await query_leann_index(index_path, args.query)
+ else:
+ # Example queries
+ queries = [
+ "Hows Berkeley Graduate Student Instructor",
+ "how's the icloud related advertisement saying",
+ "Whats the number of class recommend to take per semester for incoming EECS students"
+ ]
+ for query in queries:
+ print("\n" + "="*60)
+ await query_leann_index(index_path, query)
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/apps/email/email.py b/apps/email/email.py
new file mode 100644
index 0000000..689618b
--- /dev/null
+++ b/apps/email/email.py
@@ -0,0 +1,192 @@
+"""
+Mbox parser.
+
+Contains simple parser for mbox files.
+
+"""
+
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from fsspec import AbstractFileSystem
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class MboxReader(BaseReader):
+ """
+ Mbox parser.
+
+ Extract messages from mailbox files.
+ Returns string including date, subject, sender, receiver and
+ content for each message.
+
+ """
+
+ DEFAULT_MESSAGE_FORMAT: str = (
+ "Date: {_date}\n"
+ "From: {_from}\n"
+ "To: {_to}\n"
+ "Subject: {_subject}\n"
+ "Content: {_content}"
+ )
+
+ def __init__(
+ self,
+ *args: Any,
+ max_count: int = 0,
+ message_format: str = DEFAULT_MESSAGE_FORMAT,
+ **kwargs: Any,
+ ) -> None:
+ """Init params."""
+ try:
+ from bs4 import BeautifulSoup # noqa
+ except ImportError:
+ raise ImportError(
+ "`beautifulsoup4` package not found: `pip install beautifulsoup4`"
+ )
+
+ super().__init__(*args, **kwargs)
+ self.max_count = max_count
+ self.message_format = message_format
+
+ def load_data(
+ self,
+ file: Path,
+ extra_info: Optional[Dict] = None,
+ fs: Optional[AbstractFileSystem] = None,
+ ) -> List[Document]:
+ """Parse file into string."""
+ # Import required libraries
+ import mailbox
+ from email.parser import BytesParser
+ from email.policy import default
+
+ from bs4 import BeautifulSoup
+
+ if fs:
+ logger.warning(
+ "fs was specified but MboxReader doesn't support loading "
+ "from fsspec filesystems. Will load from local filesystem instead."
+ )
+
+ i = 0
+ results: List[str] = []
+ # Load file using mailbox
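+        # policy=default makes the parser return modern EmailMessage objects with
+        # proper header decoding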
+ bytes_parser = BytesParser(policy=default).parse
+ mbox = mailbox.mbox(file, factory=bytes_parser) # type: ignore
+
+ # Iterate through all messages
+ for _, _msg in enumerate(mbox):
+ try:
+ msg: mailbox.mboxMessage = _msg
+ # Parse multipart messages
+ if msg.is_multipart():
+ for part in msg.walk():
+ ctype = part.get_content_type()
+ cdispo = str(part.get("Content-Disposition"))
+ if "attachment" in cdispo:
+ print(f"Attachment found: {part.get_filename()}")
+ if ctype == "text/plain" and "attachment" not in cdispo:
+ content = part.get_payload(decode=True) # decode
+ break
+ # Get plain message payload for non-multipart messages
+ else:
+ content = msg.get_payload(decode=True)
+
+ # Parse message HTML content and remove unneeded whitespace
+ soup = BeautifulSoup(content)
+ stripped_content = " ".join(soup.get_text().split())
+ # Format message to include date, sender, receiver and subject
+ msg_string = self.message_format.format(
+ _date=msg["date"],
+ _from=msg["from"],
+ _to=msg["to"],
+ _subject=msg["subject"],
+ _content=stripped_content,
+ )
+ # Add message string to results
+ results.append(msg_string)
+ except Exception as e:
+ logger.warning(f"Failed to parse message:\n{_msg}\n with exception {e}")
+
+ # Increment counter and return if max count is met
+ i += 1
+ if self.max_count > 0 and i >= self.max_count:
+ break
+
+ return [Document(text=result, metadata=extra_info or {}) for result in results]
+
+
+class EmlxMboxReader(MboxReader):
+ """
+ EmlxMboxReader - Modified MboxReader that handles directories of .emlx files.
+
+ Extends MboxReader to work with Apple Mail's .emlx format by:
+ 1. Reading .emlx files from a directory
+ 2. Converting them to mbox format in memory
+ 3. Using the parent MboxReader's parsing logic
+ """
+
+ def load_data(
+ self,
+ directory: Path,
+ extra_info: Optional[Dict] = None,
+ fs: Optional[AbstractFileSystem] = None,
+ ) -> List[Document]:
+ """Parse .emlx files from directory into strings using MboxReader logic."""
+ import tempfile
+ import os
+
+ if fs:
+ logger.warning(
+ "fs was specified but EmlxMboxReader doesn't support loading "
+ "from fsspec filesystems. Will load from local filesystem instead."
+ )
+
+ # Find all .emlx files in the directory
+ emlx_files = list(directory.glob("*.emlx"))
+ logger.info(f"Found {len(emlx_files)} .emlx files in {directory}")
+
+ if not emlx_files:
+ logger.warning(f"No .emlx files found in {directory}")
+ return []
+
+ # Create a temporary mbox file
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.mbox', delete=False) as temp_mbox:
+ temp_mbox_path = temp_mbox.name
+
+ # Convert .emlx files to mbox format
+ for emlx_file in emlx_files:
+ try:
+ # Read the .emlx file
+ with open(emlx_file, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+
+ # .emlx format: first line is length, rest is email content
+ lines = content.split('\n', 1)
+ if len(lines) >= 2:
+ email_content = lines[1] # Skip the length line
+
+                        # Write in mbox format: each message starts with a "From " separator
+                        # line of its own, followed by the message and a trailing blank line
+                        temp_mbox.write(f"From {emlx_file.name}\n{email_content}\n\n")
+
+ except Exception as e:
+ logger.warning(f"Failed to process {emlx_file}: {e}")
+ continue
+
+ # Close the temporary file so MboxReader can read it
+ temp_mbox.close()
+
+ try:
+ # Use the parent MboxReader's logic to parse the mbox file
+ return super().load_data(Path(temp_mbox_path), extra_info, fs)
+ finally:
+ # Clean up temporary file
+ try:
+ os.unlink(temp_mbox_path)
+ except:
+ pass
\ No newline at end of file
diff --git a/apps/email/readers.py b/apps/email/readers.py
new file mode 100644
index 0000000..2c79108
--- /dev/null
+++ b/apps/email/readers.py
@@ -0,0 +1,124 @@
+import os
+import email
+from pathlib import Path
+from typing import List, Any
+from llama_index.core import Document
+from llama_index.core.readers.base import BaseReader
+
+def find_all_messages_directories(root: str = None) -> List[Path]:
+ """
+ Recursively find all 'Messages' directories under the given root.
+ Returns a list of Path objects.
+ """
+ if root is None:
+ # Auto-detect user's mail path
+ home_dir = os.path.expanduser("~")
+ root = os.path.join(home_dir, "Library", "Mail")
+
+ messages_dirs = []
+ for dirpath, dirnames, filenames in os.walk(root):
+ if os.path.basename(dirpath) == "Messages":
+ messages_dirs.append(Path(dirpath))
+ return messages_dirs
+
+class EmlxReader(BaseReader):
+ """
+ Apple Mail .emlx file reader with embedded metadata.
+
+ Reads individual .emlx files from Apple Mail's storage format.
+ """
+
+ def __init__(self, include_html: bool = False) -> None:
+ """
+ Initialize.
+
+ Args:
+ include_html: Whether to include HTML content in the email body (default: False)
+ """
+ self.include_html = include_html
+
+ def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
+ """
+ Load data from the input directory containing .emlx files.
+
+ Args:
+ input_dir: Directory containing .emlx files
+ **load_kwargs:
+ max_count (int): Maximum amount of messages to read.
+ """
+ docs: List[Document] = []
+ max_count = load_kwargs.get('max_count', 1000)
+ count = 0
+
+ # Walk through the directory recursively
+ for dirpath, dirnames, filenames in os.walk(input_dir):
+ # Skip hidden directories
+ dirnames[:] = [d for d in dirnames if not d.startswith(".")]
+
+ for filename in filenames:
+ if count >= max_count:
+ break
+
+ if filename.endswith(".emlx"):
+ filepath = os.path.join(dirpath, filename)
+ try:
+ # Read the .emlx file
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+ content = f.read()
+
+ # .emlx files have a length prefix followed by the email content
+ # The first line contains the length, followed by the email
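+                    # (Apple Mail also appends an XML plist of message flags after the body;
+                    # it is not stripped here, so it may end up in the document text.)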
+ lines = content.split('\n', 1)
+ if len(lines) >= 2:
+ email_content = lines[1]
+
+ # Parse the email using Python's email module
+ try:
+ msg = email.message_from_string(email_content)
+
+ # Extract email metadata
+ subject = msg.get('Subject', 'No Subject')
+ from_addr = msg.get('From', 'Unknown')
+ to_addr = msg.get('To', 'Unknown')
+ date = msg.get('Date', 'Unknown')
+
+ # Extract email body
+ body = ""
+                                if msg.is_multipart():
+                                    for part in msg.walk():
+                                        content_type = part.get_content_type()
+                                        if content_type not in ("text/plain", "text/html"):
+                                            continue
+                                        if content_type == "text/html" and not self.include_html:
+                                            continue
+                                        payload = part.get_payload(decode=True)
+                                        if payload:
+                                            body += payload.decode('utf-8', errors='ignore')
+                                else:
+                                    payload = msg.get_payload(decode=True)
+                                    if payload:
+                                        body = payload.decode('utf-8', errors='ignore')
+
+ # Create document content with metadata embedded in text
+ doc_content = f"""
+[EMAIL METADATA]
+File: {filename}
+From: {from_addr}
+To: {to_addr}
+Subject: {subject}
+Date: {date}
+[END METADATA]
+
+{body}
+"""
+
+ # No separate metadata - everything is in the text
+ doc = Document(text=doc_content, metadata={})
+ docs.append(doc)
+ count += 1
+
+ except Exception as e:
+ print(f"Error parsing email from {filepath}: {e}")
+ continue
+
+ except Exception as e:
+ print(f"Error reading file {filepath}: {e}")
+ continue
+
+ print(f"Loaded {len(docs)} email documents")
+ return docs
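+
+
+# Minimal usage sketch (illustrative; adjust the directory and limits to your setup):
+#
+#     reader = EmlxReader(include_html=False)
+#     for messages_dir in find_all_messages_directories():
+#         docs = reader.load_data(str(messages_dir), max_count=100)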
\ No newline at end of file
diff --git a/apps/evaluation/__init__.py b/apps/evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/evaluation/__main__.py b/apps/evaluation/__main__.py
new file mode 100644
index 0000000..6b07f09
--- /dev/null
+++ b/apps/evaluation/__main__.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+"""
+This script runs a recall evaluation on a given LEANN index.
+It correctly compares results by fetching the text content for both the new search
+results and the golden standard results, making the comparison robust to ID changes.
+"""
+
+import json
+import argparse
+import time
+from pathlib import Path
+import sys
+import numpy as np
+from typing import List
+
+from leann.api import LeannSearcher, LeannBuilder
+
+
+def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
+ """Checks if the data directory exists, and if not, downloads it from HF Hub."""
+ if not data_root.exists():
+ print(f"Data directory '{data_root}' not found.")
+ print(
+ "Downloading evaluation data from Hugging Face Hub... (this may take a moment)"
+ )
+ try:
+ from huggingface_hub import snapshot_download
+
+ if download_embeddings:
+ # Download everything including embeddings (large files)
+ snapshot_download(
+ repo_id="LEANN-RAG/leann-rag-evaluation-data",
+ repo_type="dataset",
+ local_dir=data_root,
+ local_dir_use_symlinks=False,
+ )
+ print("Data download complete (including embeddings)!")
+ else:
+ # Download only specific folders, excluding embeddings
+ allow_patterns = [
+ "ground_truth/**",
+ "indices/**",
+ "queries/**",
+ "*.md",
+ "*.txt",
+ ]
+ snapshot_download(
+ repo_id="LEANN-RAG/leann-rag-evaluation-data",
+ repo_type="dataset",
+ local_dir=data_root,
+ local_dir_use_symlinks=False,
+ allow_patterns=allow_patterns,
+ )
+ print("Data download complete (excluding embeddings)!")
+ except ImportError:
+ print(
+ "Error: huggingface_hub is not installed. Please install it to download the data:"
+ )
+ print("uv pip install -e '.[dev]'")
+ sys.exit(1)
+ except Exception as e:
+ print(f"An error occurred during data download: {e}")
+ sys.exit(1)
+
+
+def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):
+ """Download embeddings files specifically."""
+ embeddings_dir = data_root / "embeddings"
+
+ if dataset_type:
+ # Check if specific dataset embeddings exist
+ target_file = embeddings_dir / dataset_type / "passages_00.pkl"
+ if target_file.exists():
+ print(f"Embeddings for {dataset_type} already exist")
+ return str(target_file)
+
+ print("Downloading embeddings from HuggingFace Hub...")
+ try:
+ from huggingface_hub import snapshot_download
+
+ # Download only embeddings folder
+ snapshot_download(
+ repo_id="LEANN-RAG/leann-rag-evaluation-data",
+ repo_type="dataset",
+ local_dir=data_root,
+ local_dir_use_symlinks=False,
+ allow_patterns=["embeddings/**/*.pkl"],
+ )
+ print("Embeddings download complete!")
+
+ if dataset_type:
+ target_file = embeddings_dir / dataset_type / "passages_00.pkl"
+ if target_file.exists():
+ return str(target_file)
+
+ return str(embeddings_dir)
+
+ except Exception as e:
+ print(f"Error downloading embeddings: {e}")
+ sys.exit(1)
+
+
+# --- Helper Function to get Golden Passages ---
+def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
+ """
+ Retrieves the text for golden passage IDs directly from the LeannSearcher's
+ passage manager.
+ """
+ golden_texts = set()
+ for gid in golden_ids:
+ try:
+ # PassageManager uses string IDs
+ passage_data = searcher.passage_manager.get_passage(str(gid))
+ golden_texts.add(passage_data["text"])
+ except KeyError:
+ print(
+ f"Warning: Golden passage ID '{gid}' not found in the index's passage data."
+ )
+ return golden_texts
+
+
+def load_queries(file_path: Path) -> List[str]:
+ queries = []
+ with open(file_path, "r", encoding="utf-8") as f:
+ for line in f:
+ data = json.loads(line)
+ queries.append(data["query"])
+ return queries
+
+
+def build_index_from_embeddings(
+ embeddings_file: str, output_path: str, backend: str = "hnsw"
+):
+ """
+ Build a LEANN index from pre-computed embeddings.
+
+ Args:
+ embeddings_file: Path to pickle file with (ids, embeddings) tuple
+ output_path: Path where to save the index
+ backend: Backend to use ("hnsw" or "diskann")
+ """
+ print(f"Building {backend} index from embeddings: {embeddings_file}")
+
+ # Create builder with appropriate parameters
+ if backend == "hnsw":
+ builder_kwargs = {
+ "M": 32, # Graph degree
+ "efConstruction": 256, # Construction complexity
+ "is_compact": True, # Use compact storage
+            "is_recompute": True,  # Recompute embeddings at query time (enables pruned storage)
+ }
+ elif backend == "diskann":
+ builder_kwargs = {
+ "complexity": 64,
+ "graph_degree": 32,
+ "search_memory_maximum": 8.0, # GB
+ "build_memory_maximum": 16.0, # GB
+ }
+ else:
+ builder_kwargs = {}
+
+ builder = LeannBuilder(
+ backend_name=backend,
+ embedding_model="facebook/contriever-msmarco", # Model used to create embeddings
+        dimensions=768,  # Dimension of the precomputed Contriever embeddings
+ **builder_kwargs,
+ )
+
+ # Build index from precomputed embeddings
+ builder.build_index_from_embeddings(output_path, embeddings_file)
+ print(f"Index saved to: {output_path}")
+ return output_path
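+
+# Example invocations (illustrative; the index path depends on your local setup):
+#   python -m apps.evaluation --mode build --backend hnsw
+#   python -m apps.evaluation path/to/index --num-queries 50 --top-k 3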
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Run recall evaluation on a LEANN index."
+ )
+ parser.add_argument(
+ "index_path",
+ type=str,
+ nargs="?",
+ help="Path to the LEANN index to evaluate or build (optional).",
+ )
+ parser.add_argument(
+ "--mode",
+ choices=["evaluate", "build"],
+ default="evaluate",
+ help="Mode: 'evaluate' existing index or 'build' from embeddings",
+ )
+ parser.add_argument(
+ "--embeddings-file",
+ type=str,
+ help="Path to embeddings pickle file (optional for build mode)",
+ )
+ parser.add_argument(
+ "--backend",
+ choices=["hnsw", "diskann"],
+ default="hnsw",
+ help="Backend to use for building index (default: hnsw)",
+ )
+ parser.add_argument(
+ "--num-queries", type=int, default=10, help="Number of queries to evaluate."
+ )
+ parser.add_argument(
+ "--top-k", type=int, default=3, help="The 'k' value for recall@k."
+ )
+ parser.add_argument(
+ "--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW."
+ )
+ args = parser.parse_args()
+
+ # --- Path Configuration ---
+    # Assumes this module lives in 'apps/evaluation/', so evaluation data is
+    # downloaded into 'apps/data/' alongside the other app packages.
+ project_root = Path(__file__).resolve().parent.parent
+ data_root = project_root / "data"
+
+ # Download data based on mode
+ if args.mode == "build":
+ # For building mode, we need embeddings
+ download_data_if_needed(
+ data_root, download_embeddings=False
+ ) # Basic data first
+
+ # Auto-detect dataset type and download embeddings
+ if args.embeddings_file:
+ embeddings_file = args.embeddings_file
+ # Try to detect dataset type from embeddings file path
+ if "rpj_wiki" in str(embeddings_file):
+ dataset_type = "rpj_wiki"
+ elif "dpr" in str(embeddings_file):
+ dataset_type = "dpr"
+ else:
+ dataset_type = "dpr" # Default
+ else:
+ # Auto-detect from index path if provided, otherwise default to DPR
+ if args.index_path:
+ index_path_str = str(args.index_path)
+ if "rpj_wiki" in index_path_str:
+ dataset_type = "rpj_wiki"
+ elif "dpr" in index_path_str:
+ dataset_type = "dpr"
+ else:
+ dataset_type = "dpr" # Default to DPR
+ else:
+ dataset_type = "dpr" # Default to DPR
+
+ embeddings_file = download_embeddings_if_needed(data_root, dataset_type)
+
+ # Auto-generate index path if not provided
+ if not args.index_path:
+ indices_dir = data_root / "indices" / dataset_type
+ indices_dir.mkdir(parents=True, exist_ok=True)
+ args.index_path = str(indices_dir / f"{dataset_type}_from_embeddings")
+ print(f"Auto-generated index path: {args.index_path}")
+
+ print(f"Building index from embeddings: {embeddings_file}")
+ built_index_path = build_index_from_embeddings(
+ embeddings_file, args.index_path, args.backend
+ )
+ print(f"Index built successfully: {built_index_path}")
+
+ # Ask if user wants to run evaluation
+ eval_response = (
+ input("Run evaluation on the built index? (y/n): ").strip().lower()
+ )
+ if eval_response != "y":
+ print("Index building complete. Exiting.")
+ return
+ else:
+ # For evaluation mode, don't need embeddings
+ download_data_if_needed(data_root, download_embeddings=False)
+
+ # Auto-detect index path if not provided
+ if not args.index_path:
+ # Default to using downloaded indices
+ indices_dir = data_root / "indices"
+
+ # Try common datasets in order of preference
+ for dataset in ["dpr", "rpj_wiki"]:
+ dataset_dir = indices_dir / dataset
+ if dataset_dir.exists():
+ # Look for index files
+ index_files = list(dataset_dir.glob("*.index")) + list(
+ dataset_dir.glob("*_disk.index")
+ )
+ if index_files:
+ args.index_path = str(
+ index_files[0].with_suffix("")
+ ) # Remove .index extension
+ print(f"Using index: {args.index_path}")
+ break
+
+ if not args.index_path:
+ print(
+ "No indices found. The data download should have included pre-built indices."
+ )
+ print(
+                "Please check the downloaded indices directory or pass an index path argument explicitly."
+ )
+ sys.exit(1)
+
+ # Detect dataset type from index path to select the correct ground truth
+ index_path_str = str(args.index_path)
+ if "rpj_wiki" in index_path_str:
+ dataset_type = "rpj_wiki"
+ elif "dpr" in index_path_str:
+ dataset_type = "dpr"
+ else:
+ # Fallback: try to infer from the index directory name
+ dataset_type = Path(args.index_path).name
+ print(
+ f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'."
+ )
+
+ queries_file = data_root / "queries" / "nq_open.jsonl"
+ golden_results_file = (
+ data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
+ )
+
+ print(f"INFO: Detected dataset type: {dataset_type}")
+ print(f"INFO: Using queries file: {queries_file}")
+ print(f"INFO: Using ground truth file: {golden_results_file}")
+
+ try:
+ searcher = LeannSearcher(args.index_path)
+ queries = load_queries(queries_file)
+
+ with open(golden_results_file, "r") as f:
+ golden_results_data = json.load(f)
+
+ num_eval_queries = min(args.num_queries, len(queries))
+ queries = queries[:num_eval_queries]
+
+ print(f"\nRunning evaluation on {num_eval_queries} queries...")
+ recall_scores = []
+ search_times = []
+
+ for i in range(num_eval_queries):
+ start_time = time.time()
+ new_results = searcher.search(
+ queries[i], top_k=args.top_k, ef=args.ef_search
+ )
+ search_times.append(time.time() - start_time)
+
+ # Correct Recall Calculation: Based on TEXT content
+ new_texts = {result.text for result in new_results}
+
+ # Get golden texts directly from the searcher's passage manager
+ golden_ids = golden_results_data["indices"][i][: args.top_k]
+ golden_texts = get_golden_texts(searcher, golden_ids)
+
+ overlap = len(new_texts & golden_texts)
+ recall = overlap / len(golden_texts) if golden_texts else 0
+ recall_scores.append(recall)
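+            # Worked example: with top_k=3, if the searcher returns the text of
+            # 2 of the 3 golden passages, recall@3 for this query is 2/3 ≈ 0.67.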
+
+ print("\n--- EVALUATION RESULTS ---")
+ print(f"Query: {queries[i]}")
+ print(f"New Results: {new_texts}")
+ print(f"Golden Results: {golden_texts}")
+ print(f"Overlap: {overlap}")
+ print(f"Recall: {recall}")
+ print(f"Search Time: {search_times[-1]:.4f}s")
+ print("--------------------------------")
+
+ avg_recall = np.mean(recall_scores) if recall_scores else 0
+ avg_time = np.mean(search_times) if search_times else 0
+
+ print("\n🎉 --- Evaluation Complete ---")
+ print(f"Avg. Recall@{args.top_k} (efSearch={args.ef_search}): {avg_recall:.4f}")
+ print(f"Avg. Search Time: {avg_time:.4f}s")
+
+ except Exception as e:
+ print(f"\n❌ An error occurred during evaluation: {e}")
+ import traceback
+
+ traceback.print_exc()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/wechat/__init__.py b/apps/wechat/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/wechat/__main__.py b/apps/wechat/__main__.py
new file mode 100644
index 0000000..7d1c4c7
--- /dev/null
+++ b/apps/wechat/__main__.py
@@ -0,0 +1,230 @@
+import os
+import asyncio
+import dotenv
+import argparse
+from pathlib import Path
+from typing import List
+from leann.api import LeannBuilder, LeannChat
+from llama_index.core.node_parser import SentenceSplitter
+
+dotenv.load_dotenv()
+
+# Default WeChat export directory
+DEFAULT_WECHAT_EXPORT_DIR = "./wechat_export_direct"
+
+def create_leann_index_from_multiple_wechat_exports(
+ export_dirs: List[Path],
+ index_path: str = "wechat_history_index.leann",
+ max_count: int = -1,
+):
+ """
+ Create LEANN index from multiple WeChat export data sources.
+
+ Args:
+ export_dirs: List of Path objects pointing to WeChat export directories
+ index_path: Path to save the LEANN index
+ max_count: Maximum number of chat entries to process per export
+ """
+ print("Creating LEANN index from multiple WeChat export data sources...")
+
+ # Load documents using WeChatHistoryReader from local readers module
+ from .readers import WeChatHistoryReader
+
+ reader = WeChatHistoryReader()
+
+ INDEX_DIR = Path(index_path).parent
+
+ if not INDEX_DIR.exists():
+        print("--- Index directory not found, building new index ---")
+ all_documents = []
+ total_processed = 0
+
+ # Process each WeChat export directory
+ for i, export_dir in enumerate(export_dirs):
+ print(
+ f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}"
+ )
+
+ try:
+ documents = reader.load_data(
+ wechat_export_dir=str(export_dir),
+ max_count=max_count,
+                    concatenate_messages=True,  # Group consecutive messages into larger documents
+ )
+ if documents:
+ print(f"Loaded {len(documents)} chat documents from {export_dir}")
+ all_documents.extend(documents)
+ total_processed += len(documents)
+
+ # Check if we've reached the max count
+ if max_count > 0 and total_processed >= max_count:
+ print(f"Reached max count of {max_count} documents")
+ break
+ else:
+ print(f"No documents loaded from {export_dir}")
+ except Exception as e:
+ print(f"Error processing {export_dir}: {e}")
+ continue
+
+ if not all_documents:
+ print("No documents loaded from any source. Exiting.")
+ return None
+
+ print(
+ f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports"
+ )
+
+ # Create text splitter with 256 chunk size
+ text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
+
+ # Convert Documents to text strings and chunk them
+ all_texts = []
+ for doc in all_documents:
+ # Split the document into chunks
+ nodes = text_splitter.get_nodes_from_documents([doc])
+ for node in nodes:
+ text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
+ all_texts.append(text)
+
+ print(
+ f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
+ )
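+        # Each chunk fed to the index looks roughly like this (names and times are made up):
+        #   [Contact] means the message is from: Alice
+        #   (2024-01-05 19:02:11) [Me]: did you watch the game last night?
+        #   (2024-01-05 19:03:40) [Contact]: yes, what a finish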
+
+ # Create LEANN index directory
+        INDEX_DIR.mkdir(exist_ok=True)
+
+        print("\n[PHASE 1] Building new LEANN index...")
+
+ # Use HNSW backend for better macOS compatibility
+ builder = LeannBuilder(
+ backend_name="hnsw",
+ embedding_model="Qwen/Qwen3-Embedding-0.6B",
+ graph_degree=32,
+ complexity=64,
+ is_compact=True,
+ is_recompute=True,
+ num_threads=1, # Force single-threaded mode
+ )
+
+ print(f"Adding {len(all_texts)} chat chunks to index...")
+ for chunk_text in all_texts:
+ builder.add_text(chunk_text)
+
+ builder.build_index(index_path)
+ print(f"\nLEANN index built at {index_path}!")
+ else:
+ print(f"--- Using existing index at {INDEX_DIR} ---")
+
+ return index_path
+
+async def query_leann_index(index_path: str, query: str):
+ """
+ Query the LEANN index.
+
+ Args:
+ index_path: Path to the LEANN index
+ query: The query string
+ """
+    print("\n[PHASE 2] Starting Leann chat session...")
+ chat = LeannChat(index_path=index_path)
+
+ print(f"You: {query}")
+ chat_response = chat.ask(
+ query,
+ top_k=20,
+ recompute_beighbor_embeddings=True,
+ complexity=16,
+ beam_width=1,
+ llm_config={
+ "type": "openai",
+ "model": "gpt-4o",
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ },
+ llm_kwargs={"temperature": 0.0, "max_tokens": 1000},
+ )
+ print(f"Leann: {chat_response}")
+
+async def main():
+ """Main function with integrated WeChat export functionality."""
+
+ # Parse command line arguments
+ parser = argparse.ArgumentParser(
+ description="LEANN WeChat History Reader - Create and query WeChat chat history index"
+ )
+ parser.add_argument(
+ "--export-dir",
+ type=str,
+ default=DEFAULT_WECHAT_EXPORT_DIR,
+ help=f"Directory to store WeChat exports (default: {DEFAULT_WECHAT_EXPORT_DIR})",
+ )
+ parser.add_argument(
+ "--index-dir",
+ type=str,
+ default="./wechat_history_magic_test_11Debug_new",
+        help="Directory to store the LEANN index (default: ./wechat_history_magic_test_11Debug_new)",
+ )
+ parser.add_argument(
+ "--max-entries",
+ type=int,
+ default=50,
+        help="Maximum number of chat entries to process (default: 50)",
+ )
+ parser.add_argument(
+ "--query",
+ type=str,
+ default=None,
+ help="Single query to run (default: runs example queries)",
+ )
+ parser.add_argument(
+ "--force-export",
+ action="store_true",
+ default=False,
+ help="Force re-export of WeChat data even if exports exist",
+ )
+
+ args = parser.parse_args()
+
+ INDEX_DIR = Path(args.index_dir)
+ INDEX_PATH = str(INDEX_DIR / "wechat_history.leann")
+
+ print(f"Using WeChat export directory: {args.export_dir}")
+ print(f"Index directory: {INDEX_DIR}")
+ print(f"Max entries: {args.max_entries}")
+
+ # Initialize WeChat reader with export capabilities
+ from .readers import WeChatHistoryReader
+
+ reader = WeChatHistoryReader()
+
+ # Find existing exports or create new ones using the centralized method
+ export_dirs = reader.find_or_export_wechat_data(args.export_dir)
+ if not export_dirs:
+ print("Failed to find or export WeChat data. Exiting.")
+ return
+
+ # Create or load the LEANN index from all sources
+ index_path = create_leann_index_from_multiple_wechat_exports(
+ export_dirs, INDEX_PATH, max_count=args.max_entries
+ )
+
+ if index_path:
+ if args.query:
+ # Run single query
+ await query_leann_index(index_path, args.query)
+ else:
+ # Example queries
+ queries = [
+ "我想买魔术师约翰逊的球衣,给我一些对应聊天记录?",
+ ]
+
+ for query in queries:
+ print("\n" + "=" * 60)
+ await query_leann_index(index_path, query)
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/apps/wechat/readers.py b/apps/wechat/readers.py
new file mode 100644
index 0000000..7524dcb
--- /dev/null
+++ b/apps/wechat/readers.py
@@ -0,0 +1,719 @@
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import List, Any, Dict, Optional
+from llama_index.core import Document
+from llama_index.core.readers.base import BaseReader
+from datetime import datetime
+
+class WeChatHistoryReader(BaseReader):
+ """
+ WeChat chat history reader that extracts chat data from exported JSON files.
+
+ Reads WeChat chat history from exported JSON files (from wechat-exporter tool)
+ and creates documents with embedded metadata similar to the Chrome history reader structure.
+
+ Also includes utilities for automatic WeChat chat history export.
+ """
+
+ def __init__(self) -> None:
+ """Initialize."""
+ self.packages_dir = Path(__file__).parent.parent.parent / "packages"
+ self.wechat_exporter_dir = self.packages_dir / "wechat-exporter"
+ self.wechat_decipher_dir = self.packages_dir / "wechat-decipher-macos"
+
+ def check_wechat_running(self) -> bool:
+ """Check if WeChat is currently running."""
+ try:
+ result = subprocess.run(["pgrep", "-f", "WeChat"], capture_output=True, text=True)
+ return result.returncode == 0
+ except Exception:
+ return False
+
+ def install_wechattweak(self) -> bool:
+ """Install WeChatTweak CLI tool."""
+ try:
+ # Create wechat-exporter directory if it doesn't exist
+ self.wechat_exporter_dir.mkdir(parents=True, exist_ok=True)
+
+ wechattweak_path = self.wechat_exporter_dir / "wechattweak-cli"
+ if not wechattweak_path.exists():
+ print("Downloading WeChatTweak CLI...")
+ subprocess.run([
+ "curl", "-L", "-o", str(wechattweak_path),
+ "https://github.com/JettChenT/WeChatTweak-CLI/releases/latest/download/wechattweak-cli"
+ ], check=True)
+
+ # Make executable
+ wechattweak_path.chmod(0o755)
+
+ # Install WeChatTweak
+ print("Installing WeChatTweak...")
+ subprocess.run(["sudo", str(wechattweak_path), "install"], check=True)
+ return True
+ except Exception as e:
+ print(f"Error installing WeChatTweak: {e}")
+ return False
+
+ def restart_wechat(self):
+ """Restart WeChat to apply WeChatTweak."""
+ try:
+ print("Restarting WeChat...")
+ subprocess.run(["pkill", "-f", "WeChat"], check=False)
+ time.sleep(2)
+ subprocess.run(["open", "-a", "WeChat"], check=True)
+ time.sleep(5) # Wait for WeChat to start
+ except Exception as e:
+ print(f"Error restarting WeChat: {e}")
+
+ def check_api_available(self) -> bool:
+ """Check if WeChatTweak API is available."""
+ try:
+ result = subprocess.run([
+ "curl", "-s", "http://localhost:48065/wechat/allcontacts"
+ ], capture_output=True, text=True, timeout=5)
+            return result.returncode == 0 and bool(result.stdout.strip())
+ except Exception:
+ return False
+
+ def _extract_readable_text(self, content: str) -> str:
+ """
+ Extract readable text from message content, removing XML and system messages.
+
+ Args:
+ content: The raw message content (can be string or dict)
+
+ Returns:
+ Cleaned, readable text
+ """
+ if not content:
+ return ""
+
+ # Handle dictionary content (like quoted messages)
+ if isinstance(content, dict):
+ # Extract text from dictionary structure
+ text_parts = []
+ if 'title' in content:
+ text_parts.append(str(content['title']))
+ if 'quoted' in content:
+ text_parts.append(str(content['quoted']))
+ if 'content' in content:
+ text_parts.append(str(content['content']))
+ if 'text' in content:
+ text_parts.append(str(content['text']))
+
+ if text_parts:
+ return " | ".join(text_parts)
+ else:
+ # If we can't extract meaningful text from dict, return empty
+ return ""
+
+ # Handle string content
+ if not isinstance(content, str):
+ return ""
+
+ # Remove common prefixes like "wxid_xxx:\n"
+ clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
+ clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
+
+ # If it's just XML or system message, return empty
+ if clean_content.strip().startswith('<') or 'recalled a message' in clean_content:
+ return ""
+
+ return clean_content.strip()
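+
+    # Illustrative behaviour (hypothetical inputs):
+    #   "wxid_abc123:\nlunch tomorrow?"  ->  "lunch tomorrow?"
+    #   "<msg><img ... /></msg>"         ->  ""   (XML-only content is dropped)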
+
+ def _is_text_message(self, content: str) -> bool:
+ """
+ Check if a message contains readable text content.
+
+ Args:
+ content: The message content (can be string or dict)
+
+ Returns:
+ True if the message contains readable text, False otherwise
+ """
+ if not content:
+ return False
+
+ # Handle dictionary content
+ if isinstance(content, dict):
+ # Check if dict has any readable text fields
+ text_fields = ['title', 'quoted', 'content', 'text']
+ for field in text_fields:
+ if field in content and content[field]:
+ return True
+ return False
+
+ # Handle string content
+ if not isinstance(content, str):
+ return False
+
+ # Skip image messages (contain XML with img tags)
+        if '<img' in content:
+            return False
+
+        # Treat it as text only if cleaning leaves readable, non-XML content behind
+        clean_content = self._extract_readable_text(content)
+        if len(clean_content.strip()) > 0 and not clean_content.strip().startswith('<'):
+            return True
+
+        return False
+
+ def _concatenate_messages(self, messages: List[Dict], max_length: int = 128,
+ time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
+ """
+ Concatenate messages based on length and time rules.
+
+ Args:
+ messages: List of message dictionaries
+ max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
+ time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
+ overlap_messages: Number of messages to overlap between consecutive groups
+
+ Returns:
+ List of concatenated message groups
+ """
+ if not messages:
+ return []
+
+ concatenated_groups = []
+ current_group = []
+ current_length = 0
+ last_timestamp = None
+
+ for message in messages:
+ # Extract message info
+ content = message.get('content', '')
+ message_text = message.get('message', '')
+ create_time = message.get('createTime', 0)
+ from_user = message.get('fromUser', '')
+ to_user = message.get('toUser', '')
+ is_sent_from_self = message.get('isSentFromSelf', False)
+
+ # Extract readable text
+ readable_text = self._extract_readable_text(content)
+ if not readable_text:
+ readable_text = message_text
+
+ # Skip empty messages
+ if not readable_text.strip():
+ continue
+
+ # Check time window constraint (only if time_window_minutes != -1)
+ if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
+ time_diff_minutes = (create_time - last_timestamp) / 60
+ if time_diff_minutes > time_window_minutes:
+ # Time gap too large, start new group
+ if current_group:
+ concatenated_groups.append({
+ 'messages': current_group,
+ 'total_length': current_length,
+ 'start_time': current_group[0].get('createTime', 0),
+ 'end_time': current_group[-1].get('createTime', 0)
+ })
+ # Keep last few messages for overlap
+ if overlap_messages > 0 and len(current_group) > overlap_messages:
+ current_group = current_group[-overlap_messages:]
+ current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
+ else:
+ current_group = []
+ current_length = 0
+
+ # Check length constraint (only if max_length != -1)
+ message_length = len(readable_text)
+ if max_length != -1 and current_length + message_length > max_length and current_group:
+ # Current group would exceed max length, save it and start new
+ concatenated_groups.append({
+ 'messages': current_group,
+ 'total_length': current_length,
+ 'start_time': current_group[0].get('createTime', 0),
+ 'end_time': current_group[-1].get('createTime', 0)
+ })
+ # Keep last few messages for overlap
+ if overlap_messages > 0 and len(current_group) > overlap_messages:
+ current_group = current_group[-overlap_messages:]
+ current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
+ else:
+ current_group = []
+ current_length = 0
+
+ # Add message to current group
+ current_group.append(message)
+ current_length += message_length
+ last_timestamp = create_time
+
+ # Add the last group if it exists
+ if current_group:
+ concatenated_groups.append({
+ 'messages': current_group,
+ 'total_length': current_length,
+ 'start_time': current_group[0].get('createTime', 0),
+ 'end_time': current_group[-1].get('createTime', 0)
+ })
+
+ return concatenated_groups
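+
+    # Worked example: with max_length=128 and time_window_minutes=30, three
+    # consecutive 50-character messages sent a few minutes apart yield two groups
+    # ([msg1, msg2] at 100 chars, then [msg3]), and a message arriving more than
+    # 30 minutes after the previous one always starts a new group.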
+
+    def _create_concatenated_content(self, message_group: Dict, contact_name: str) -> tuple[str, str]:
+ """
+ Create concatenated content from a group of messages.
+
+ Args:
+ message_group: Dictionary containing messages and metadata
+ contact_name: Name of the contact
+
+ Returns:
+            Tuple of (formatted concatenated content, contact name)
+ """
+ messages = message_group['messages']
+ start_time = message_group['start_time']
+ end_time = message_group['end_time']
+
+ # Format timestamps
+ if start_time:
+ try:
+ start_timestamp = datetime.fromtimestamp(start_time)
+ start_time_str = start_timestamp.strftime('%Y-%m-%d %H:%M:%S')
+            except Exception:
+ start_time_str = str(start_time)
+ else:
+ start_time_str = "Unknown"
+
+ if end_time:
+ try:
+ end_timestamp = datetime.fromtimestamp(end_time)
+ end_time_str = end_timestamp.strftime('%Y-%m-%d %H:%M:%S')
+            except Exception:
+ end_time_str = str(end_time)
+ else:
+ end_time_str = "Unknown"
+
+ # Build concatenated message content
+ message_parts = []
+ for message in messages:
+ content = message.get('content', '')
+ message_text = message.get('message', '')
+ create_time = message.get('createTime', 0)
+ is_sent_from_self = message.get('isSentFromSelf', False)
+
+ # Extract readable text
+ readable_text = self._extract_readable_text(content)
+ if not readable_text:
+ readable_text = message_text
+
+ # Format individual message
+ if create_time:
+ try:
+ timestamp = datetime.fromtimestamp(create_time)
+ # change to YYYY-MM-DD HH:MM:SS
+ time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
+                except Exception:
+ time_str = str(create_time)
+ else:
+ time_str = "Unknown"
+
+ sender = "[Me]" if is_sent_from_self else "[Contact]"
+ message_parts.append(f"({time_str}) {sender}: {readable_text}")
+
+ concatenated_text = "\n".join(message_parts)
+
+ # Create final document content
+ doc_content = f"""
+Contact: {contact_name}
+Time Range: {start_time_str} - {end_time_str}
+Messages ({len(messages)} messages, {message_group['total_length']} chars):
+
+{concatenated_text}
+"""
+ # TODO @yichuan give better format and rich info here!
+ doc_content = f"""
+{concatenated_text}
+"""
+ return doc_content, contact_name
+
+ def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
+ """
+ Load WeChat chat history data from exported JSON files.
+
+ Args:
+ input_dir: Directory containing exported WeChat JSON files
+ **load_kwargs:
+ max_count (int): Maximum amount of chat entries to read.
+ wechat_export_dir (str): Custom path to WeChat export directory.
+ include_non_text (bool): Whether to include non-text messages (images, emojis, etc.)
+ concatenate_messages (bool): Whether to concatenate messages based on length rules.
+ max_length (int): Maximum length for concatenated message groups (default: 1000).
+ time_window_minutes (int): Time window in minutes to group messages together (default: 30).
+ overlap_messages (int): Number of messages to overlap between consecutive groups (default: 2).
+ """
+ docs: List[Document] = []
+ max_count = load_kwargs.get('max_count', 1000)
+ wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
+ include_non_text = load_kwargs.get('include_non_text', False)
+ concatenate_messages = load_kwargs.get('concatenate_messages', False)
+ max_length = load_kwargs.get('max_length', 1000)
+ time_window_minutes = load_kwargs.get('time_window_minutes', 30)
+
+        # Fall back to the positional input_dir, then to the default export path
+        if wechat_export_dir is None:
+            wechat_export_dir = input_dir or "./wechat_export_test"
+
+ if not os.path.exists(wechat_export_dir):
+ print(f"WeChat export directory not found at: {wechat_export_dir}")
+ return docs
+
+ try:
+ # Find all JSON files in the export directory
+ json_files = list(Path(wechat_export_dir).glob("*.json"))
+ print(f"Found {len(json_files)} WeChat chat history files")
+
+ count = 0
+ for json_file in json_files:
+ if count >= max_count and max_count > 0:
+ break
+
+ try:
+ with open(json_file, 'r', encoding='utf-8') as f:
+ chat_data = json.load(f)
+
+ # Extract contact name from filename
+ contact_name = json_file.stem
+
+ if concatenate_messages:
+ # Filter messages to only include readable text messages
+ readable_messages = []
+ for message in chat_data:
+ try:
+ content = message.get('content', '')
+ if not include_non_text and not self._is_text_message(content):
+ continue
+
+ readable_text = self._extract_readable_text(content)
+ if not readable_text and not include_non_text:
+ continue
+
+ readable_messages.append(message)
+ except Exception as e:
+ print(f"Error processing message in {json_file}: {e}")
+ continue
+
+                        # Concatenate messages into groups (length/time constraints disabled, no overlap)
+                        message_groups = self._concatenate_messages(
+                            readable_messages,
+                            max_length=-1,  # no length constraint
+                            time_window_minutes=-1,  # no time-window constraint
+                            overlap_messages=0  # no overlap between groups
+                        )
+
+ # Create documents from concatenated groups
+ for message_group in message_groups:
+ if count >= max_count and max_count > 0:
+ break
+
+ doc_content, contact_name = self._create_concatenated_content(message_group, contact_name)
+ doc = Document(text=doc_content, metadata={"contact_name": contact_name})
+ docs.append(doc)
+ count += 1
+
+ print(f"Created {len(message_groups)} concatenated message groups for {contact_name}")
+
+ else:
+ # Original single-message processing
+ for message in chat_data:
+ if count >= max_count and max_count > 0:
+ break
+
+ # Extract message information
+ from_user = message.get('fromUser', '')
+ to_user = message.get('toUser', '')
+ content = message.get('content', '')
+ message_text = message.get('message', '')
+ create_time = message.get('createTime', 0)
+ is_sent_from_self = message.get('isSentFromSelf', False)
+
+ # Handle content that might be dict or string
+ try:
+ # Check if this is a readable text message
+ if not include_non_text and not self._is_text_message(content):
+ continue
+
+ # Extract readable text
+ readable_text = self._extract_readable_text(content)
+ if not readable_text and not include_non_text:
+ continue
+ except Exception as e:
+ # Skip messages that cause processing errors
+ print(f"Error processing message in {json_file}: {e}")
+ continue
+
+ # Convert timestamp to readable format
+ if create_time:
+ try:
+ timestamp = datetime.fromtimestamp(create_time)
+ time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
+                                except Exception:
+ time_str = str(create_time)
+ else:
+ time_str = "Unknown"
+
+ # Create document content with metadata header and contact info
+ doc_content = f"""
+Contact: {contact_name}
+Is sent from self: {is_sent_from_self}
+Time: {time_str}
+Message: {readable_text if readable_text else message_text}
+"""
+
+ # Create document with embedded metadata
+ doc = Document(text=doc_content, metadata={})
+ docs.append(doc)
+ count += 1
+
+ except Exception as e:
+ print(f"Error reading {json_file}: {e}")
+ continue
+
+ print(f"Loaded {len(docs)} WeChat chat documents")
+
+ except Exception as e:
+ print(f"Error reading WeChat history: {e}")
+ return docs
+
+ return docs
+
+ @staticmethod
+ def find_wechat_export_dirs() -> List[Path]:
+ """
+ Find all WeChat export directories.
+
+ Returns:
+ List of Path objects pointing to WeChat export directories
+ """
+ export_dirs = []
+
+ # Look for common export directory names
+ possible_dirs = [
+ Path("./wechat_export_test"),
+ Path("./wechat_export"),
+ Path("./wechat_chat_history"),
+ Path("./chat_export")
+ ]
+
+ for export_dir in possible_dirs:
+ if export_dir.exists() and export_dir.is_dir():
+ json_files = list(export_dir.glob("*.json"))
+ if json_files:
+ export_dirs.append(export_dir)
+ print(f"Found WeChat export directory: {export_dir} with {len(json_files)} files")
+
+ print(f"Found {len(export_dirs)} WeChat export directories")
+ return export_dirs
+
+ @staticmethod
+ def export_chat_to_file(output_file: str = "wechat_chat_export.txt", max_count: int = 1000, export_dir: str = None, include_non_text: bool = False):
+ """
+ Export WeChat chat history to a text file.
+
+ Args:
+ output_file: Path to the output file
+ max_count: Maximum number of entries to export
+ export_dir: Directory containing WeChat JSON files
+ include_non_text: Whether to include non-text messages
+ """
+ if export_dir is None:
+ export_dir = "./wechat_export_test"
+
+ if not os.path.exists(export_dir):
+ print(f"WeChat export directory not found at: {export_dir}")
+ return
+
+ try:
+ json_files = list(Path(export_dir).glob("*.json"))
+
+ with open(output_file, 'w', encoding='utf-8') as f:
+ count = 0
+ for json_file in json_files:
+ if count >= max_count and max_count > 0:
+ break
+
+ try:
+ with open(json_file, 'r', encoding='utf-8') as json_f:
+ chat_data = json.load(json_f)
+
+ contact_name = json_file.stem
+ f.write(f"\n=== Chat with {contact_name} ===\n")
+
+ for message in chat_data:
+ if count >= max_count and max_count > 0:
+ break
+
+ from_user = message.get('fromUser', '')
+ content = message.get('content', '')
+ message_text = message.get('message', '')
+ create_time = message.get('createTime', 0)
+
+ # Skip non-text messages unless requested
+ if not include_non_text:
+ reader = WeChatHistoryReader()
+ if not reader._is_text_message(content):
+ continue
+ readable_text = reader._extract_readable_text(content)
+ if not readable_text:
+ continue
+ message_text = readable_text
+
+ if create_time:
+ try:
+ timestamp = datetime.fromtimestamp(create_time)
+ time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
+                                except Exception:
+ time_str = str(create_time)
+ else:
+ time_str = "Unknown"
+
+ f.write(f"[{time_str}] {from_user}: {message_text}\n")
+ count += 1
+
+ except Exception as e:
+ print(f"Error processing {json_file}: {e}")
+ continue
+
+ print(f"Exported {count} chat entries to {output_file}")
+
+ except Exception as e:
+ print(f"Error exporting WeChat chat history: {e}")
+
+ def export_wechat_chat_history(self, export_dir: str = "./wechat_export_direct") -> Optional[Path]:
+ """
+ Export WeChat chat history using wechat-exporter tool.
+
+ Args:
+ export_dir: Directory to save exported chat history
+
+ Returns:
+ Path to export directory if successful, None otherwise
+ """
+ try:
+
+ # Create export directory
+ export_path = Path(export_dir)
+ export_path.mkdir(exist_ok=True)
+
+ print(f"Exporting WeChat chat history to {export_path}...")
+
+ # Check if wechat-exporter directory exists
+ if not self.wechat_exporter_dir.exists():
+ print(f"wechat-exporter directory not found at: {self.wechat_exporter_dir}")
+ return None
+
+ # Install requirements if needed
+ requirements_file = self.wechat_exporter_dir / "requirements.txt"
+ if requirements_file.exists():
+ print("Installing wechat-exporter requirements...")
+ subprocess.run([
+ "uv", "pip", "install", "-r", str(requirements_file)
+ ], check=True)
+
+ # Run the export command
+ print("Running wechat-exporter...")
+ result = subprocess.run([
+ sys.executable, str(self.wechat_exporter_dir / "main.py"),
+ "export-all", str(export_path)
+ ], capture_output=True, text=True, check=True)
+
+ print("Export command output:")
+ print(result.stdout)
+ if result.stderr:
+ print("Export errors:")
+ print(result.stderr)
+
+ # Check if export was successful
+ if export_path.exists() and any(export_path.glob("*.json")):
+ json_files = list(export_path.glob("*.json"))
+ print(f"Successfully exported {len(json_files)} chat history files to {export_path}")
+ return export_path
+ else:
+ print("Export completed but no JSON files found")
+ return None
+
+ except subprocess.CalledProcessError as e:
+ print(f"Export command failed: {e}")
+ print(f"Command output: {e.stdout}")
+ print(f"Command errors: {e.stderr}")
+ return None
+ except Exception as e:
+ print(f"Export failed: {e}")
+ print("Please ensure WeChat is running and WeChatTweak is installed.")
+ return None
+
+ def find_or_export_wechat_data(self, export_dir: str = "./wechat_export_direct") -> List[Path]:
+ """
+ Find existing WeChat exports or create new ones.
+
+ Args:
+ export_dir: Directory to save exported chat history if needed
+
+ Returns:
+ List of Path objects pointing to WeChat export directories
+ """
+ export_dirs = []
+
+ # Look for existing exports in common locations
+ possible_export_dirs = [
+ Path("./wechat_database_export"),
+ Path("./wechat_export_test"),
+ Path("./wechat_export"),
+ Path("./wechat_export_direct"),
+ Path("./wechat_chat_history"),
+ Path("./chat_export")
+ ]
+
+ for export_dir_path in possible_export_dirs:
+ if export_dir_path.exists() and any(export_dir_path.glob("*.json")):
+ export_dirs.append(export_dir_path)
+ print(f"Found existing export: {export_dir_path}")
+
+ # If no existing exports, try to export automatically
+ if not export_dirs:
+ print("No existing WeChat exports found. Starting direct export...")
+
+ # Try to export using wechat-exporter
+ exported_path = self.export_wechat_chat_history(export_dir)
+ if exported_path:
+ export_dirs = [exported_path]
+ else:
+ print("Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.")
+
+ return export_dirs
\ No newline at end of file
diff --git a/examples/data/2501.14312v1 (1).pdf b/examples/data/2501.14312v1 (1).pdf
deleted file mode 100644
index 230732b..0000000
Binary files a/examples/data/2501.14312v1 (1).pdf and /dev/null differ
diff --git a/examples/data/2506.08276v1.pdf b/examples/data/2506.08276v1.pdf
deleted file mode 100644
index 2756eef..0000000
Binary files a/examples/data/2506.08276v1.pdf and /dev/null differ