diff --git a/README.md b/README.md index 215a9cd..0a9fe02 100755 --- a/README.md +++ b/README.md @@ -149,6 +149,8 @@ python ./examples/main_cli_example.py **Works with any text format** - research papers, personal notes, presentations. Built with LlamaIndex for document parsing. ### Search Your Entire Life + +**Note:** You need to grant full disk access to your terminal/VS Code in System Preferences → Privacy & Security → Full Disk Access. ```bash python examples/mail_reader_leann.py # "What's the number of class recommend to take per semester for incoming EECS students?" diff --git a/demo.ipynb b/demo.ipynb index 8d5240e..44bb9e3 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -16,9 +16,96 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Computing embeddings for 1 texts using SentenceTransformer, model: 'facebook/contriever-msmarco'\n", + "INFO: Using cached model: facebook/contriever-msmarco\n", + "INFO: Starting embedding computation...\n", + "INFO: Generated 1 embeddings, dimension: 768\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Writing passages: 100%|██████████| 5/5 [00:00<00:00, 16345.69chunk/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Computing embeddings for 5 texts using SentenceTransformer, model: 'facebook/contriever-msmarco'\n", + "INFO: Using cached model: facebook/contriever-msmarco\n", + "INFO: Starting embedding computation...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 50.05it/s]\n", + "WARNING:leann_backend_hnsw.hnsw_backend:Converting data to float32, shape: (5, 768)\n", + "INFO:leann_backend_hnsw.hnsw_backend:INFO: Converting HNSW index to CSR-pruned format...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Generated 5 embeddings, dimension: 768\n", + "M: 64 for level: 0\n", + "Starting conversion: knowledge.index -> knowledge.csr.tmp\n", + "[0.00s] Reading Index HNSW header...\n", + "[0.00s] Header read: d=768, ntotal=5\n", + "[0.00s] Reading HNSW struct vectors...\n", + " Reading vector (dtype=, fmt='d')... Count=6, Bytes=48\n", + "[0.00s] Read assign_probas (6)\n", + " Reading vector (dtype=, fmt='i')... Count=7, Bytes=28\n", + "[0.21s] Read cum_nneighbor_per_level (7)\n", + " Reading vector (dtype=, fmt='i')... Count=5, Bytes=20\n", + "[0.33s] Read levels (5)\n", + "[0.44s] Probing for compact storage flag...\n", + "[0.44s] Found compact flag: False\n", + "[0.44s] Compact flag is False, reading original format...\n", + "[0.44s] Probing for potential extra byte before non-compact offsets...\n", + "[0.44s] Found and consumed an unexpected 0x00 byte.\n", + " Reading vector (dtype=, fmt='Q')... Count=6, Bytes=48\n", + "[0.44s] Read offsets (6)\n", + "[0.54s] Attempting to read neighbors vector...\n", + " Reading vector (dtype=, fmt='i')... Count=320, Bytes=1280\n", + "[0.54s] Read neighbors (320)\n", + "[0.65s] Read scalar params (ep=4, max_lvl=0)\n", + "[0.65s] Checking for storage data...\n", + "[0.65s] Found storage fourcc: 49467849.\n", + "[0.65s] Converting to CSR format...\n", + "[0.65s] Conversion loop finished. \n", + "[0.65s] Running validation checks...\n", + " Checking total valid neighbor count...\n", + " OK: Total valid neighbors = 20\n", + " Checking final pointer indices...\n", + " OK: Final pointers match data size.\n", + "[0.65s] Deleting original neighbors and offsets arrays...\n", + " CSR Stats: |data|=20, |level_ptr|=10\n", + "[0.76s] Writing CSR HNSW graph data in FAISS-compatible order...\n", + " Pruning embeddings: Writing NULL storage marker.\n", + "[0.87s] Conversion complete.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:leann_backend_hnsw.hnsw_backend:✅ CSR conversion successful.\n", + "INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'knowledge.index'\n" + ] + } + ], "source": [ "from leann.api import LeannBuilder\n", "\n", @@ -40,9 +127,92 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n", + "[read_HNSW NL v4] Read levels vector, size: 5\n", + "[read_HNSW NL v4] Reading Compact Storage format indices...\n", + "[read_HNSW NL v4] Read compact_level_ptr, size: 10\n", + "[read_HNSW NL v4] Read compact_node_offsets, size: 6\n", + "[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n", + "[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n", + "[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n", + "[read_HNSW NL v4] Reading neighbors data into memory.\n", + "[read_HNSW NL v4] Read neighbors data, size: 20\n", + "[read_HNSW NL v4] Finished reading metadata and CSR indices.\n", + "INFO: Skipping external storage loading, since is_recompute is true.\n", + "INFO: Terminating server process (PID: 1311) for backend leann_backend_hnsw.hnsw_embedding_server...\n", + "INFO: Server process 1311 terminated.\n", + "🔍 DEBUG LeannSearcher.search() called:\n", + " Query: 'programming languages'\n", + " Top_k: 2\n", + " Additional kwargs: {}\n", + "DEBUG: Found process on port 5557: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5557 --model-name Qwen/Qwen3-Embedding-0.6B --passages-file wechat_history_try_new/wechat_history.leann.meta.json\n", + "DEBUG: model_matches: False, passages_matches: False, overall: False\n", + "⚠️ Port 5557 has incompatible server, trying next port...\n", + "DEBUG: Found process on port 5558: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5558 --model-name facebook/contriever --passages-file all_google/chrome_history.leann.meta.json\n", + "DEBUG: model_matches: False, passages_matches: False, overall: False\n", + "⚠️ Port 5558 has incompatible server, trying next port...\n", + "DEBUG: Found process on port 5559: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5559 --model-name facebook/contriever --passages-file allemail_text/mail_documents.leann.meta.json\n", + "DEBUG: model_matches: False, passages_matches: False, overall: False\n", + "⚠️ Port 5559 has incompatible server, trying next port...\n", + "⚠️ Using port 5560 instead of 5557\n", + "INFO: Starting embedding server on port 5560...\n", + "INFO: Command: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever-msmarco --passages-file knowledge.leann.meta.json\n", + "INFO: Server process started with PID: 5895\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Embedding server is ready!\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: INFO: Registering backend 'diskann'\n", + "DEBUG: Found process on port 5560: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever-msmarco --passages-file knowledge.leann.meta.json[leann_backend_hnsw.hnsw_embedding_server LOG]: INFO: Registering backend 'hnsw'\n", + "\n", + "DEBUG: model_matches: True, passages_matches: True, overall: True\n", + "✅ Existing server process (PID 5895) is compatible\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: INFO:datasets:PyTorch version 2.7.1 available.\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever-msmarco\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever-msmarco. Creating a new one with mean pooling.\n", + " Generated embedding shape: (1, 768)\n", + " Embedding time: 5.077293157577515 seconds\n", + "ZmqDistanceComputer initialized: d=768, metric=0\n", + " Search time: 0.1401360034942627 seconds\n", + " Backend returned: labels=2 results\n", + " Processing 2 passage IDs:\n", + " 1. passage_id='0' -> SUCCESS: C# is a powerful programming language...\n", + " 2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is very popular...\n", + " Final enriched results: 2 passages\n" + ] + }, + { + "data": { + "text/plain": [ + "[SearchResult(id='0', score=np.float32(1.444752), text='C# is a powerful programming language', metadata={}),\n", + " SearchResult(id='1', score=np.float32(1.394647), text='Python is a powerful programming language and it is very popular', metadata={})]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from leann.api import LeannSearcher\n", "\n", @@ -60,9 +230,89 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:leann.chat:Attempting to create LLM of type='ollama' with model='llama3.2:1b'\n", + "INFO:leann.chat:Initializing OllamaChat with model='llama3.2:1b' and host='http://localhost:11434'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n", + "[read_HNSW NL v4] Read levels vector, size: 5\n", + "[read_HNSW NL v4] Reading Compact Storage format indices...\n", + "[read_HNSW NL v4] Read compact_level_ptr, size: 10\n", + "[read_HNSW NL v4] Read compact_node_offsets, size: 6\n", + "[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n", + "[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n", + "[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n", + "[read_HNSW NL v4] Reading neighbors data into memory.\n", + "[read_HNSW NL v4] Read neighbors data, size: 20\n", + "[read_HNSW NL v4] Finished reading metadata and CSR indices.\n", + "INFO: Skipping external storage loading, since is_recompute is true.\n", + "🔍 DEBUG LeannSearcher.search() called:\n", + " Query: 'Compare the two retrieved programming languages and say which one is more popular today.'\n", + " Top_k: 2\n", + " Additional kwargs: {}\n", + "DEBUG: Found process on port 5557: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5557 --model-name Qwen/Qwen3-Embedding-0.6B --passages-file wechat_history_try_new/wechat_history.leann.meta.json\n", + "DEBUG: model_matches: False, passages_matches: False, overall: False\n", + "⚠️ Port 5557 has incompatible server, trying next port...\n", + "DEBUG: Found process on port 5558: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5558 --model-name facebook/contriever --passages-file all_google/chrome_history.leann.meta.json\n", + "DEBUG: model_matches: False, passages_matches: False, overall: False\n", + "⚠️ Port 5558 has incompatible server, trying next port...\n", + "DEBUG: Found process on port 5559: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5559 --model-name facebook/contriever --passages-file allemail_text/mail_documents.leann.meta.json\n", + "DEBUG: model_matches: False, passages_matches: False, overall: False\n", + "⚠️ Port 5559 has incompatible server, trying next port...\n", + "DEBUG: Found process on port 5560: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever-msmarco --passages-file knowledge.leann.meta.json\n", + "DEBUG: model_matches: True, passages_matches: True, overall: True\n", + "✅ Found compatible server on port 5560\n", + "✅ Using existing compatible server on port 5560\n", + "DEBUG: Found process on port 5560: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever-msmarco --passages-file knowledge.leann.meta.json\n", + "DEBUG: model_matches: True, passages_matches: True, overall: True\n", + "✅ Found compatible server on port 5560\n", + "✅ Using existing compatible server on port 5560\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:leann.chat:Sending request to Ollama: {'model': 'llama3.2:1b', 'prompt': 'Here is some retrieved context that might help answer your question:\\n\\nPython is a powerful programming language and it is very popular\\n\\nC# is a powerful programming language\\n\\nQuestion: Compare the two retrieved programming languages and say which one is more popular today.\\n\\nPlease provide the best answer you can based on this context and your knowledge.', 'stream': False, 'options': {}}\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Generated embedding shape: (1, 768)\n", + " Embedding time: 0.09699082374572754 seconds\n", + "ZmqDistanceComputer initialized: d=768, metric=0\n", + " Search time: 0.028768062591552734 seconds\n", + " Backend returned: labels=2 results\n", + " Processing 2 passage IDs:\n", + " 1. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is very popular...\n", + " 2. passage_id='0' -> SUCCESS: C# is a powerful programming language...\n", + " Final enriched results: 2 passages\n" + ] + }, + { + "data": { + "text/plain": [ + "\"Based on my analysis, I would say that C# is currently more popular than Python. Here's why:\\n\\n1. **Industry adoption**: C# has been widely adopted in the .NET ecosystem, which includes Microsoft products like Visual Studio, Azure, and Office 365. This extensive industry support contributes to its higher popularity.\\n2. **Microsoft's focus**: As a native Windows development language, C# is closely tied to Microsoft's ecosystem, making it more integrated with Windows-based systems. This integration has helped boost C#'s popularity among developers working on Windows-related projects.\\n3. **Cross-platform compatibility**: While Python is gaining traction in other areas, such as data science and machine learning, C# remains particularly popular for developing Windows desktop applications, mobile apps, and games that run on both Windows and macOS/Android/iOS platforms.\\n\\nThat being said, Python's popularity has been growing rapidly due to its ease of use, flexibility, and extensive libraries (e.g., NumPy, pandas, scikit-learn). Its adoption is expanding into areas like web development (with Django and Flask), data science (with TensorFlow and Keras), and more. However, when compared to C#, Python's popularity appears to be higher in certain segments.\\n\\nKeep in mind that both languages have their strengths and weaknesses, and the choice between them ultimately depends on the specific project requirements and the target audience.\"" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from leann.api import LeannChat\n", "\n",