diff --git a/.gitmodules b/.gitmodules
index 1899ae5..c1cd540 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "packages/leann-backend-diskann/third_party/DiskANN"]
 	path = packages/leann-backend-diskann/third_party/DiskANN
-	url = https://github.com/yichuan520030910320/DiskANN.git
+	url = https://github.com/yichuan-w/DiskANN.git
 [submodule "packages/leann-backend-hnsw/third_party/faiss"]
 	path = packages/leann-backend-hnsw/third_party/faiss
-	url = https://github.com/yichuan520030910320/faiss.git
+	url = https://github.com/yichuan-w/faiss.git
 [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
 	path = packages/leann-backend-hnsw/third_party/msgpack-c
 	url = https://github.com/msgpack/msgpack-c.git
diff --git a/README.md b/README.md
index 536c8f4..1e0140c 100755
--- a/README.md
+++ b/README.md
@@ -12,11 +12,13 @@
 
 
 The smallest vector index in the world. RAG Everything with LEANN!
 
-LEANN is a revolutionary vector database that makes personal AI accessible to everyone. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using **97% less storage** than traditional solutions **without accuracy loss**.
+LEANN is a revolutionary vector database that democratizes personal AI. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using **97% less storage** than traditional solutions **without accuracy loss**.
+
+LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on demand instead of storing them all. [Illustration →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)
+
+**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can search your **[file system](#process-any-documents-pdf-txt-md)**, **[emails](#search-your-entire-life)**, **[browser history](#time-machine-for-the-web)**, **[chat history](#wechat-detective)**, or external knowledge bases (e.g., 60M documents), all with zero cloud costs and complete privacy.
 
-RAG your **[emails](#-search-your-entire-life)**, **[browser history](#-time-machine-for-the-web)**, **[WeChat](#-wechat-detective)**, or 60M documents on your laptop, in nearly zero cost. No cloud, no API keys, completely private.
-LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Read more →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)
 
 ## Why LEANN?
 
@@ -30,16 +32,16 @@ LEANN achieves this through *graph-based selective recomputation* with *high-deg
 
 🔒 **Privacy:** Your data never leaves your laptop. No OpenAI, no cloud, no "terms of service".
 
-🪶 **Lightweight:** Smart graph pruning means less storage, less memory usage, better performance on your existing hardware.
+🪶 **Lightweight:** Graph-based recomputation eliminates heavy embedding storage, while smart pruning and a compact CSR format minimize graph storage overhead. Less storage, less memory usage, always!
 
-📈 **Scalability:** Organize our messy personal data that would crash traditional vector DBs, with performance that gets better as your data grows more personalized.
+📈 **Scalability:** Handle messy personal data that would crash traditional vector DBs, and scale smoothly as your personalized data and agent-generated memory grow!
 
 ✨ **No Accuracy Loss:** Maintain the same search quality as heavyweight solutions while using 97% less storage.
 
 ## Quick Start in 1 minute
 
 ```bash
-git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann
+git clone git@github.com:yichuan-w/LEANN.git leann
 cd leann
 git submodule update --init --recursive
 ```
@@ -125,7 +127,7 @@ print(results)
 
 LEANN supports RAGing a lot of data sources, like .pdf, .txt, .md, and also supports RAGing your WeChat, Google Search History, and more.
 
-### 📚 Process Any Documents (.pdf, .txt, .md)
+### Process Any Documents (.pdf, .txt, .md)
 
 Above we showed the Python API, while this CLI script demonstrates the same concepts while directly processing PDFs and documents.
 
@@ -142,7 +144,7 @@ Uses Ollama `qwen3:8b` by default. For other models: `--llm openai --model gpt-4
 **Works with any text format** - research papers, personal notes, presentations. Built with LlamaIndex for document parsing.
 
-### 🕵️ Search Your Entire Life
+### Search Your Entire Life
 
 ```bash
 python examples/mail_reader_leann.py
 # "What did my boss say about the Christmas party last year?"
@@ -181,7 +183,7 @@ Once the index is built, you can ask questions like:
 - "Show me emails about travel expenses"
 
-### 🌐 Time Machine for the Web
+### Time Machine for the Web
 
 ```bash
 python examples/google_history_reader_leann.py
 # "What was that AI paper I read last month?"
@@ -236,7 +238,7 @@ Once the index is built, you can ask questions like:
 
 
 
-### 💬 WeChat Detective
+### WeChat Detective
 
 ```bash
 python examples/wechat_history_reader_leann.py
diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py
index c3aee02..19320a5 100644
--- a/examples/history_data/wechat_history.py
+++ b/examples/history_data/wechat_history.py
@@ -197,8 +197,8 @@ class WeChatHistoryReader(BaseReader):
 
         Args:
             messages: List of message dictionaries
-            max_length: Maximum length for concatenated message groups
-            time_window_minutes: Time window in minutes to group messages together
+            max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
+            time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
             overlap_messages: Number of messages to overlap between consecutive groups
 
         Returns:
@@ -230,8 +230,8 @@ class WeChatHistoryReader(BaseReader):
             if not readable_text.strip():
                 continue
 
-            # Check time window constraint
-            if last_timestamp is not None and create_time > 0:
+            # Check time window constraint (only if time_window_minutes != -1)
+            if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
                 time_diff_minutes = (create_time - last_timestamp) / 60
                 if time_diff_minutes > time_window_minutes:
                     # Time gap too large, start new group
@@ -250,9 +250,9 @@ class WeChatHistoryReader(BaseReader):
                     current_group = []
                     current_length = 0
 
-            # Check length constraint
+            # Check length constraint (only if max_length != -1)
             message_length = len(readable_text)
-            if current_length + message_length > max_length and current_group:
+            if max_length != -1 and current_length + message_length > max_length and current_group:
                 # Current group would exceed max length, save it and start new
                 concatenated_groups.append({
                     'messages': current_group,
@@ -431,9 +431,9 @@ class WeChatHistoryReader(BaseReader):
         # Concatenate messages based on rules
         message_groups = self._concatenate_messages(
             readable_messages,
-            max_length=max_length,
-            time_window_minutes=time_window_minutes,
-            overlap_messages=2  # Keep 2 messages overlap between groups
+            max_length=-1,
+            time_window_minutes=-1,
+            overlap_messages=0  # No overlap between groups
         )
 
         # Create documents from concatenated groups
diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py
index 3d1147e..e15e876 100644
--- a/examples/wechat_history_reader_leann.py
+++ b/examples/wechat_history_reader_leann.py
@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
         documents = reader.load_data(
             wechat_export_dir=str(export_dir),
             max_count=max_count,
-            concatenate_messages=True,  # Disable concatenation - one message per document
+            concatenate_messages=False,  # Disable concatenation - one message per document
         )
         if documents:
             print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports(
     )
 
-    # Create text splitter with 256 chunk size
-    text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
+    # Create text splitter with 128 chunk size and 64 overlap
+    text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64)
 
     # Convert Documents to text strings and chunk them
     all_texts = []
@@ -224,7 +224,7 @@ async def query_leann_index(index_path: str, query: str):
         query,
         top_k=20,
         recompute_beighbor_embeddings=True,
-        complexity=64,
+        complexity=128,
         beam_width=1,
         llm_config={
             "type": "openai",
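
Note on the `_concatenate_messages` change: the patch turns `max_length` and `time_window_minutes` into optional constraints, using `-1` as a sentinel meaning "disabled". Below is a minimal standalone sketch of that sentinel logic, not the repository code: the bare function and the simplified `(timestamp_seconds, text)` message format are assumptions for illustration, and the real method also handles overlap and group metadata.

```python
# Sketch of the -1 sentinel behavior introduced for message grouping.
# A group is flushed when either enabled constraint is exceeded;
# passing -1 skips that check entirely.

def concatenate_messages(messages, max_length=-1, time_window_minutes=-1):
    """Group (timestamp_seconds, text) pairs into chat chunks.

    max_length: max total characters per group; -1 disables the check.
    time_window_minutes: max gap between consecutive messages; -1 disables it.
    """
    groups = []
    current, current_length, last_ts = [], 0, None

    for ts, text in messages:
        if not text.strip():
            continue

        # Time-window constraint, skipped when time_window_minutes == -1.
        if time_window_minutes != -1 and last_ts is not None:
            if (ts - last_ts) / 60 > time_window_minutes:
                groups.append(current)
                current, current_length = [], 0

        # Length constraint, skipped when max_length == -1.
        if max_length != -1 and current_length + len(text) > max_length and current:
            groups.append(current)
            current, current_length = [], 0

        current.append(text)
        current_length += len(text)
        last_ts = ts

    if current:
        groups.append(current)
    return groups


if __name__ == "__main__":
    msgs = [(0, "hi"), (60, "lunch?"), (7200, "new topic")]
    # Both constraints disabled: everything lands in one group.
    print(concatenate_messages(msgs))                          # [['hi', 'lunch?', 'new topic']]
    # A 30-minute window splits on the 2-hour gap.
    print(concatenate_messages(msgs, time_window_minutes=30))  # [['hi', 'lunch?'], ['new topic']]
```

With the call site now passing `max_length=-1, time_window_minutes=-1, overlap_messages=0`, each conversation is concatenated into one running group, and chunking is instead delegated to the finer-grained `SentenceSplitter(chunk_size=128, chunk_overlap=64)` configured in `wechat_history_reader_leann.py`.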