From 455f93fb7c12f83508c3dcf3d7b99ffe045f19db Mon Sep 17 00:00:00 2001
From: yichuan520030910320
Date: Sun, 27 Jul 2025 18:20:13 -0700
Subject: [PATCH 1/9] fix example and add pypi example

---
 README.md                     |  35 ++++-----
 demo.ipynb                    | 141 +---------------------------------
 packages/leann/README.md      |  17 ++--
 packages/leann/pyproject.toml |   6 +-
 4 files changed, 26 insertions(+), 173 deletions(-)

diff --git a/README.md b/README.md
index 6bdebfc..f4af89a 100755
--- a/README.md
+++ b/README.md
@@ -114,32 +114,23 @@ Our declarative API makes RAG as easy as writing a config file.
 [Try in this ipynb file →](demo.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb)

 ```python
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
+from leann import LeannBuilder, LeannSearcher, LeannChat
+from pathlib import Path
+INDEX_PATH = str(Path("./").resolve() / "demo.leann")

-# 1. Build the index (no embeddings stored!)
+# Build an index
 builder = LeannBuilder(backend_name="hnsw")
-builder.add_text("C# is a powerful programming language")
-builder.add_text("Python is a powerful programming language and it is very popular")
-builder.add_text("Machine learning transforms industries")
-builder.add_text("Neural networks process complex data")
-builder.add_text("Leann is a great storage saving engine for RAG on your MacBook")
-builder.build_index("knowledge.leann")
+builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
+builder.add_text("Tung Tung Tung Sahur called—they need their banana‑crocodile hybrid back")
+builder.build_index(INDEX_PATH)

-# 2. Search with real-time embeddings
-searcher = LeannSearcher("knowledge.leann")
-results = searcher.search("programming languages", top_k=2)
+# Search
+searcher = LeannSearcher(INDEX_PATH)
+results = searcher.search("fantastical AI-generated creatures", top_k=1)

-# 3. Chat with LEANN using retrieved results
-llm_config = {
-    "type": "ollama",
-    "model": "llama3.2:1b"
-}
-
-chat = LeannChat(index_path="knowledge.leann", llm_config=llm_config)
-response = chat.ask(
-    "Compare the two retrieved programming languages and say which one is more popular today.",
-    top_k=2,
-)
+# Chat with your data
+chat = LeannChat(INDEX_PATH, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"})
+response = chat.ask("How much storage does LEANN save?", top_k=1)
 ```

 ## RAG on Everything!
diff --git a/demo.ipynb b/demo.ipynb index 016302c..e91ec01 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Quick Start in 30s\n", + "# Quick Start \n", "\n", "**Home GitHub Repository:** [LEANN on GitHub](https://github.com/yichuan-w/LEANN)\n", "\n", @@ -49,68 +49,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing passages: 100%|██████████| 5/5 [00:00<00:00, 17077.79chunk/s]\n", - "Batches: 100%|██████████| 1/1 [00:00<00:00, 36.43it/s]\n", - "WARNING:leann_backend_hnsw.hnsw_backend:Converting data to float32, shape: (5, 768)\n", - "INFO:leann_backend_hnsw.hnsw_backend:INFO: Converting HNSW index to CSR-pruned format...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "M: 64 for level: 0\n", - "Starting conversion: index.index -> index.csr.tmp\n", - "[0.00s] Reading Index HNSW header...\n", - "[0.00s] Header read: d=768, ntotal=5\n", - "[0.00s] Reading HNSW struct vectors...\n", - " Reading vector (dtype=, fmt='d')... Count=6, Bytes=48\n", - "[0.00s] Read assign_probas (6)\n", - " Reading vector (dtype=, fmt='i')... Count=7, Bytes=28\n", - "[0.14s] Read cum_nneighbor_per_level (7)\n", - " Reading vector (dtype=, fmt='i')... Count=5, Bytes=20\n", - "[0.24s] Read levels (5)\n", - "[0.33s] Probing for compact storage flag...\n", - "[0.33s] Found compact flag: False\n", - "[0.33s] Compact flag is False, reading original format...\n", - "[0.33s] Probing for potential extra byte before non-compact offsets...\n", - "[0.33s] Found and consumed an unexpected 0x00 byte.\n", - " Reading vector (dtype=, fmt='Q')... Count=6, Bytes=48\n", - "[0.33s] Read offsets (6)\n", - "[0.41s] Attempting to read neighbors vector...\n", - " Reading vector (dtype=, fmt='i')... Count=320, Bytes=1280\n", - "[0.41s] Read neighbors (320)\n", - "[0.54s] Read scalar params (ep=4, max_lvl=0)\n", - "[0.54s] Checking for storage data...\n", - "[0.54s] Found storage fourcc: 49467849.\n", - "[0.54s] Converting to CSR format...\n", - "[0.54s] Conversion loop finished. 
\n", - "[0.54s] Running validation checks...\n", - " Checking total valid neighbor count...\n", - " OK: Total valid neighbors = 20\n", - " Checking final pointer indices...\n", - " OK: Final pointers match data size.\n", - "[0.54s] Deleting original neighbors and offsets arrays...\n", - " CSR Stats: |data|=20, |level_ptr|=10\n", - "[0.63s] Writing CSR HNSW graph data in FAISS-compatible order...\n", - " Pruning embeddings: Writing NULL storage marker.\n", - "[0.71s] Conversion complete.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:leann_backend_hnsw.hnsw_backend:✅ CSR conversion successful.\n", - "INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'index.index'\n" - ] - } - ], + "outputs": [], "source": [ "from leann.api import LeannBuilder\n", "\n", @@ -136,81 +75,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:leann.api:🔍 LeannSearcher.search() called:\n", - "INFO:leann.api: Query: 'programming languages'\n", - "INFO:leann.api: Top_k: 2\n", - "INFO:leann.api: Additional kwargs: {}\n", - "INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n", - "INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n", - "INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n", - "INFO:leann.embedding_server_manager:Port 5560 has incompatible server, trying next port...\n", - "INFO:leann.embedding_server_manager:Port 5561 has incompatible server, trying next port...\n", - "INFO:leann.embedding_server_manager:Port 5562 has incompatible server, trying next port...\n", - "INFO:leann.embedding_server_manager:Starting embedding server on port 5563...\n", - "INFO:leann.embedding_server_manager:Command: /Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5563 --model-name facebook/contriever --passages-file /Users/yichuan/Desktop/code/test_leann_pip/LEANN/content/index.meta.json\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "INFO:leann.embedding_server_manager:Server process started with PID: 31699\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n", - "[read_HNSW NL v4] Read levels vector, size: 5\n", - "[read_HNSW NL v4] Reading Compact Storage format indices...\n", - "[read_HNSW NL v4] Read compact_level_ptr, size: 10\n", - "[read_HNSW NL v4] Read compact_node_offsets, size: 6\n", - "[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n", - "[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n", - "[read_HNSW NL v4 FIX] Detected FileIOReader. 
Neighbors size field offset: 326\n", - "[read_HNSW NL v4] Reading neighbors data into memory.\n", - "[read_HNSW NL v4] Read neighbors data, size: 20\n", - "[read_HNSW NL v4] Finished reading metadata and CSR indices.\n", - "INFO: Skipping external storage loading, since is_recompute is true.\n", - "INFO: Registering backend 'hnsw'\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Traceback (most recent call last):\n", - " File \"\", line 198, in _run_module_as_main\n", - " File \"\", line 88, in _run_code\n", - " File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann_backend_hnsw/hnsw_embedding_server.py\", line 323, in \n", - " create_hnsw_embedding_server(\n", - " File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann_backend_hnsw/hnsw_embedding_server.py\", line 98, in create_hnsw_embedding_server\n", - " passages = PassageManager(passage_sources)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/api.py\", line 127, in __init__\n", - " raise FileNotFoundError(f\"Passage index file not found: {index_file}\")\n", - "FileNotFoundError: Passage index file not found: /Users/yichuan/Desktop/code/test_leann_pip/LEANN/index.passages.idx\n", - "ERROR:leann.embedding_server_manager:Server terminated during startup.\n" - ] - }, - { - "ename": "RuntimeError", - "evalue": "Failed to start embedding server on port 5563", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mleann\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mapi\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m LeannSearcher\n\u001b[32m 3\u001b[39m searcher = LeannSearcher(\u001b[33m\"\u001b[39m\u001b[33mindex\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m results = \u001b[43msearcher\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprogramming languages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_k\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m results\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/api.py:439\u001b[39m, in \u001b[36mLeannSearcher.search\u001b[39m\u001b[34m(self, query, top_k, complexity, beam_width, prune_ratio, recompute_embeddings, pruning_strategy, expected_zmq_port, **kwargs)\u001b[39m\n\u001b[32m 437\u001b[39m start_time = time.time()\n\u001b[32m 438\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recompute_embeddings:\n\u001b[32m--> \u001b[39m\u001b[32m439\u001b[39m zmq_port = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbackend_impl\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_ensure_server_running\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 440\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmeta_path_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 441\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mport\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_zmq_port\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 442\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 443\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 444\u001b[39m \u001b[38;5;28;01mdel\u001b[39;00m expected_zmq_port\n\u001b[32m 445\u001b[39m zmq_time = time.time() - start_time\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/searcher_base.py:81\u001b[39m, in \u001b[36mBaseSearcher._ensure_server_running\u001b[39m\u001b[34m(self, passages_source_file, port, **kwargs)\u001b[39m\n\u001b[32m 72\u001b[39m server_started, actual_port = \u001b[38;5;28mself\u001b[39m.embedding_server_manager.start_server(\n\u001b[32m 73\u001b[39m port=port,\n\u001b[32m 74\u001b[39m model_name=\u001b[38;5;28mself\u001b[39m.embedding_model,\n\u001b[32m (...)\u001b[39m\u001b[32m 78\u001b[39m enable_warmup=kwargs.get(\u001b[33m\"\u001b[39m\u001b[33menable_warmup\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[32m 79\u001b[39m )\n\u001b[32m 80\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m server_started:\n\u001b[32m---> \u001b[39m\u001b[32m81\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 82\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to start embedding server on port \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mactual_port\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 83\u001b[39m )\n\u001b[32m 85\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m actual_port\n", - "\u001b[31mRuntimeError\u001b[39m: Failed to start embedding server on port 5563" - ] - } - ], + "outputs": [], "source": [ "from leann.api import LeannSearcher\n", "\n", diff --git a/packages/leann/README.md b/packages/leann/README.md index 0488c3d..4281ef1 100644 --- a/packages/leann/README.md +++ b/packages/leann/README.md @@ -16,25 +16,24 @@ uv pip install leann[diskann] ```python from leann import LeannBuilder, LeannSearcher, LeannChat +from pathlib import Path +INDEX_PATH = str(Path("./").resolve() / "demo.leann") # Build an index builder = LeannBuilder(backend_name="hnsw") builder.add_text("LEANN saves 97% storage compared to traditional vector databases.") -builder.build_index("my_index.leann") +builder.add_text("Tung Tung Tung Sahur called—they need their banana‑crocodile hybrid back") +builder.build_index(INDEX_PATH) # Search -searcher = LeannSearcher("my_index.leann") -results = searcher.search("storage savings", top_k=3) +searcher = LeannSearcher(INDEX_PATH) +results = searcher.search("fantastical AI-generated creatures", top_k=1) # Chat with your data -chat = LeannChat("my_index.leann", llm_config={"type": "ollama", "model": "llama3.2:1b"}) -response = chat.ask("How much storage does LEANN save?") +chat = LeannChat(INDEX_PATH, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"}) +response = chat.ask("How much storage does LEANN save?", top_k=1) ``` -## Documentation - -For full documentation, visit [https://leann.readthedocs.io](https://leann.readthedocs.io) - ## License MIT License diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml index a6db993..74ab903 100644 --- a/packages/leann/pyproject.toml +++ b/packages/leann/pyproject.toml @@ -36,7 +36,5 @@ diskann = [ ] [project.urls] -Homepage = 
"https://github.com/yourusername/leann" -Documentation = "https://leann.readthedocs.io" -Repository = "https://github.com/yourusername/leann" -Issues = "https://github.com/yourusername/leann/issues" +Repository = "https://github.com/yichuan-w/LEANN" +Issues = "https://github.com/yichuan-w/LEANN/issues" From 51c41acd82115049f9f11ab524d021f484b55695 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sun, 27 Jul 2025 20:40:42 -0700 Subject: [PATCH 2/9] docs: add comprehensive CONTRIBUTING.md guide with pre-commit setup --- README.md | 2 +- docs/CONTRIBUTING.md | 220 +++++++++++++++++++++++++++++++++++++++++++ docs/contributing.md | 11 --- pyproject.toml | 1 + 4 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 docs/CONTRIBUTING.md delete mode 100644 docs/contributing.md diff --git a/README.md b/README.md index f4af89a..e4feb4c 100755 --- a/README.md +++ b/README.md @@ -451,7 +451,7 @@ If you find Leann useful, please cite: ## ✨ [Detailed Features →](docs/features.md) -## 🤝 [Contributing →](docs/contributing.md) +## 🤝 [Contributing →](docs/CONTRIBUTING.md) ## [FAQ →](docs/faq.md) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 0000000..67331bb --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,220 @@ +# 🤝 Contributing + +We welcome contributions! Leann is built by the community, for the community. + +## Ways to Contribute + +- 🐛 **Bug Reports**: Found an issue? Let us know! +- 💡 **Feature Requests**: Have an idea? We'd love to hear it! +- 🔧 **Code Contributions**: PRs welcome for all skill levels +- 📖 **Documentation**: Help make Leann more accessible +- 🧪 **Benchmarks**: Share your performance results + +## 🚀 Development Setup + +### Prerequisites + +1. **Install uv** (fast Python package installer): + ```bash + curl -LsSf https://astral.sh/uv/install.sh | sh + ``` + +2. **Clone the repository**: + ```bash + git clone https://github.com/LEANN-RAG/LEANN-RAG.git + cd LEANN-RAG + ``` + +3. **Install system dependencies**: + + **macOS:** + ```bash + brew install llvm libomp boost protobuf zeromq pkgconf + ``` + + **Ubuntu/Debian:** + ```bash + sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler \ + libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev + ``` + +4. **Build from source**: + ```bash + # macOS + CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync + + # Ubuntu/Debian + uv sync + ``` + +## 🔨 Pre-commit Hooks + +We use pre-commit hooks to ensure code quality and consistency. This runs automatically before each commit. + +### Setup Pre-commit + +1. **Install pre-commit** (already included when you run `uv sync`): + ```bash + uv pip install pre-commit + ``` + +2. **Install the git hooks**: + ```bash + pre-commit install + ``` + +3. 
**Run pre-commit manually** (optional): + ```bash + pre-commit run --all-files + ``` + +### Pre-commit Checks + +Our pre-commit configuration includes: +- **Trailing whitespace removal** +- **End-of-file fixing** +- **YAML validation** +- **Large file prevention** +- **Merge conflict detection** +- **Debug statement detection** +- **Code formatting with ruff** +- **Code linting with ruff** + +## 🧪 Testing + +### Running Tests + +```bash +# Run all tests +uv run pytest + +# Run specific test file +uv run pytest test/test_filename.py + +# Run with coverage +uv run pytest --cov=leann +``` + +### Writing Tests + +- Place tests in the `test/` directory +- Follow the naming convention `test_*.py` +- Use descriptive test names that explain what's being tested +- Include both positive and negative test cases + +## 📝 Code Style + +We use `ruff` for both linting and formatting to ensure consistent code style. + +### Format Your Code + +```bash +# Format all files +ruff format + +# Check formatting without changing files +ruff format --check +``` + +### Lint Your Code + +```bash +# Run linter with auto-fix +ruff check --fix + +# Just check without fixing +ruff check +``` + +### Style Guidelines + +- Follow PEP 8 conventions +- Use descriptive variable names +- Add type hints where appropriate +- Write docstrings for all public functions and classes +- Keep functions focused and single-purpose + +## 🚦 CI/CD + +Our CI pipeline runs automatically on all pull requests. It includes: + +1. **Linting and Formatting**: Ensures code follows our style guidelines +2. **Multi-platform builds**: Tests on Ubuntu and macOS +3. **Python version matrix**: Tests on Python 3.9-3.13 +4. **Wheel building**: Ensures packages can be built and distributed + +### CI Commands + +The CI uses the same commands as pre-commit to ensure consistency: +```bash +# Linting +ruff check . + +# Format checking +ruff format --check . +``` + +Make sure your code passes these checks locally before pushing! + +## 🔄 Pull Request Process + +1. **Fork the repository** and create your branch from `main`: + ```bash + git checkout -b feature/your-feature-name + ``` + +2. **Make your changes**: + - Write clean, documented code + - Add tests for new functionality + - Update documentation as needed + +3. **Run pre-commit checks**: + ```bash + pre-commit run --all-files + ``` + +4. **Test your changes**: + ```bash + uv run pytest + ``` + +5. **Commit with descriptive messages**: + ```bash + git commit -m "feat: add new search algorithm" + ``` + + Follow [Conventional Commits](https://www.conventionalcommits.org/): + - `feat:` for new features + - `fix:` for bug fixes + - `docs:` for documentation changes + - `test:` for test additions/changes + - `refactor:` for code refactoring + - `perf:` for performance improvements + +6. **Push and create a pull request**: + - Provide a clear description of your changes + - Reference any related issues + - Include examples or screenshots if applicable + +## 📚 Documentation + +When adding new features or making significant changes: + +1. Update relevant documentation in `/docs` +2. Add docstrings to new functions/classes +3. Update README.md if needed +4. Include usage examples + +## 🤔 Getting Help + +- **Discord**: Join our community for discussions +- **Issues**: Check existing issues or create a new one +- **Discussions**: For general questions and ideas + +## 📄 License + +By contributing, you agree that your contributions will be licensed under the same license as the project (MIT). 
+ +--- + +Thank you for contributing to LEANN! Every contribution, no matter how small, helps make the project better for everyone. 🌟 diff --git a/docs/contributing.md b/docs/contributing.md deleted file mode 100644 index 1cacc26..0000000 --- a/docs/contributing.md +++ /dev/null @@ -1,11 +0,0 @@ -# 🤝 Contributing - -We welcome contributions! Leann is built by the community, for the community. - -## Ways to Contribute - -- 🐛 **Bug Reports**: Found an issue? Let us know! -- 💡 **Feature Requests**: Have an idea? We'd love to hear it! -- 🔧 **Code Contributions**: PRs welcome for all skill levels -- 📖 **Documentation**: Help make Leann more accessible -- 🧪 **Benchmarks**: Share your performance results diff --git a/pyproject.toml b/pyproject.toml index 0856945..aac0f78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dev = [ "ruff>=0.1.0", "matplotlib", "huggingface-hub>=0.20.0", + "pre-commit>=3.5.0", ] diskann = [ From e6f612b5e885a8875c96f052748f19d6b55c6d0c Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Sun, 27 Jul 2025 20:44:28 -0700 Subject: [PATCH 3/9] fix install and readme --- README.md | 72 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index f4af89a..2c1711b 100755 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ Clone the repository to access all examples and install LEANN from [PyPI](https: git clone git@github.com:yichuan-w/LEANN.git leann cd leann uv venv +source .venv/bin/activate uv pip install leann ``` @@ -82,31 +83,7 @@ uv sync ``` - ## Quick Start in 30s @@ -137,6 +114,48 @@ response = chat.ask("How much storage does LEANN save?", top_k=1) LEANN supports RAG on various data sources including documents (.pdf, .txt, .md), Apple Mail, Google Search History, WeChat, and more. + +> **Generation Model Setup** +> LEANN supports multiple LLM providers for text generation (OpenAI API, HuggingFace, Ollama). + +
+<details>
+<summary>🔑 OpenAI API Setup (Default)</summary>
+
+Set your OpenAI API key as an environment variable:
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+```
+
+</details>
+
+<details>
+<summary>🔧 Ollama Setup (Recommended for full privacy)</summary>
+
+**macOS:**
+
+First, [download Ollama for macOS](https://ollama.com/download/mac).
+
+```bash
+# Pull a lightweight model (recommended for consumer hardware)
+ollama pull llama3.2:1b
+```
+
+**Linux:**
+
+```bash
+# Install Ollama
+curl -fsSL https://ollama.ai/install.sh | sh
+
+# Start Ollama service manually
+ollama serve &
+
+# Pull a lightweight model (recommended for consumer hardware)
+ollama pull llama3.2:1b
+```
+
+</details>
+ ### 📄 Personal Data Manager: Process Any Documents (.pdf, .txt, .md)! Ask questions directly about your personal PDFs, documents, and any directory containing your files! @@ -147,11 +166,6 @@ Ask questions directly about your personal PDFs, documents, and any directory co The example below asks a question about summarizing two papers (uses default data in `examples/data`): -```bash -# Drop your PDFs, .txt, .md files into examples/data/ -uv run ./examples/main_cli_example.py -``` - ``` # Or use python directly source .venv/bin/activate From e9d2d420bd47156434aee0fbbac2c3b226cad28c Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Sun, 27 Jul 2025 20:48:23 -0700 Subject: [PATCH 4/9] fix some readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 58d8ff2..b6580d1 100755 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ LEANN supports RAG on various data sources including documents (.pdf, .txt, .md) > **Generation Model Setup** +> > LEANN supports multiple LLM providers for text generation (OpenAI API, HuggingFace, Ollama).
@@ -176,6 +177,7 @@ python ./examples/main_cli_example.py ### 📧 Your Personal Email Secretary: RAG on Apple Mail! +> **Note:** The examples below currently support macOS only. Windows support coming soon.

@@ -465,7 +467,7 @@ If you find Leann useful, please cite: ## ✨ [Detailed Features →](docs/features.md) -## 🤝 [Contributing →](docs/CONTRIBUTING.md) +## 🤝 [Contributing →](docs/contributing.md) ## [FAQ →](docs/faq.md) From 5c8921673aff042e5d351377138b3e956a30c04c Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sun, 27 Jul 2025 21:19:29 -0700 Subject: [PATCH 5/9] fix: auto-detect normalized embeddings and use cosine distance (#8) * fix: auto-detect normalized embeddings and use cosine distance - Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere) - Automatically set distance_metric='cosine' for normalized embeddings - Add warnings when using non-optimal distance metrics - Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2) - Fix DiskANN zmq_port compatibility with lazy loading strategy - Add documentation for normalized embeddings feature This fixes the low accuracy issue when using OpenAI text-embedding-3-small model with default MIPS metric. * style: format --- docs/normalized_embeddings.md | 75 +++++++++++++++++++ examples/main_cli_example.py | 24 +++++- .../leann_backend_diskann/diskann_backend.py | 59 +++++++++++---- .../leann_backend_hnsw/hnsw_backend.py | 10 ++- packages/leann-core/src/leann/api.py | 71 ++++++++++++++++++ uv.lock | 10 +-- 6 files changed, 223 insertions(+), 26 deletions(-) create mode 100644 docs/normalized_embeddings.md diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md new file mode 100644 index 0000000..d6f285e --- /dev/null +++ b/docs/normalized_embeddings.md @@ -0,0 +1,75 @@ +# Normalized Embeddings Support in LEANN + +LEANN now automatically detects normalized embedding models and sets the appropriate distance metric for optimal performance. + +## What are Normalized Embeddings? + +Normalized embeddings are vectors with L2 norm = 1 (unit vectors). These embeddings are optimized for cosine similarity rather than Maximum Inner Product Search (MIPS). + +## Automatic Detection + +When you create a `LeannBuilder` instance with a normalized embedding model, LEANN will: + +1. **Automatically set `distance_metric="cosine"`** if not specified +2. **Show a warning** if you manually specify a different distance metric +3. **Provide optimal search performance** with the correct metric + +## Supported Normalized Embedding Models + +### OpenAI +All OpenAI text embedding models are normalized: +- `text-embedding-ada-002` +- `text-embedding-3-small` +- `text-embedding-3-large` + +### Voyage AI +All Voyage AI embedding models are normalized: +- `voyage-2` +- `voyage-3` +- `voyage-large-2` +- `voyage-multilingual-2` +- `voyage-code-2` + +### Cohere +All Cohere embedding models are normalized: +- `embed-english-v3.0` +- `embed-multilingual-v3.0` +- `embed-english-light-v3.0` +- `embed-multilingual-light-v3.0` + +## Example Usage + +```python +from leann.api import LeannBuilder + +# Automatic detection - will use cosine distance +builder = LeannBuilder( + backend_name="hnsw", + embedding_model="text-embedding-3-small", + embedding_mode="openai" +) +# Warning: Detected normalized embeddings model 'text-embedding-3-small'... +# Automatically setting distance_metric='cosine' + +# Manual override (not recommended) +builder = LeannBuilder( + backend_name="hnsw", + embedding_model="text-embedding-3-small", + embedding_mode="openai", + distance_metric="mips" # Will show warning +) +# Warning: Using 'mips' distance metric with normalized embeddings... 
+``` + +## Non-Normalized Embeddings + +Models like `facebook/contriever` and other sentence-transformers models that are not normalized will continue to use MIPS by default, which is optimal for them. + +## Why This Matters + +Using the wrong distance metric with normalized embeddings can lead to: +- **Poor search quality** due to HNSW's early termination with narrow score ranges +- **Incorrect ranking** of search results +- **Suboptimal performance** compared to using the correct metric + +For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py). \ No newline at end of file diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py index adf0261..502821c 100644 --- a/examples/main_cli_example.py +++ b/examples/main_cli_example.py @@ -30,17 +30,22 @@ async def main(args): all_texts = [] for doc in documents: nodes = node_parser.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) + if nodes: + all_texts.extend(node.get_content() for node in nodes) print("--- Index directory not found, building new index ---") print("\n[PHASE 1] Building Leann index...") + # LeannBuilder now automatically detects normalized embeddings and sets appropriate distance metric + print(f"Using {args.embedding_model} with {args.embedding_mode} mode") + # Use HNSW backend for better macOS compatibility builder = LeannBuilder( backend_name="hnsw", - embedding_model="facebook/contriever", + embedding_model=args.embedding_model, + embedding_mode=args.embedding_mode, + # distance_metric is automatically set based on embedding model graph_degree=32, complexity=64, is_compact=True, @@ -89,6 +94,19 @@ if __name__ == "__main__": default="Qwen/Qwen3-0.6B", help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).", ) + parser.add_argument( + "--embedding-model", + type=str, + default="facebook/contriever", + help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small').", + ) + parser.add_argument( + "--embedding-mode", + type=str, + default="sentence-transformers", + choices=["sentence-transformers", "openai", "mlx"], + help="The embedding backend mode.", + ) parser.add_argument( "--host", type=str, diff --git a/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py b/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py index 7ca2810..b73f36f 100644 --- a/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py +++ b/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py @@ -163,18 +163,44 @@ class DiskannSearcher(BaseSearcher): self.num_threads = kwargs.get("num_threads", 8) - fake_zmq_port = 6666 + # For DiskANN, we need to reinitialize the index when zmq_port changes + # Store the initialization parameters for later use full_index_prefix = str(self.index_dir / self.index_path.stem) - self._index = diskannpy.StaticDiskFloatIndex( - metric_enum, - full_index_prefix, - self.num_threads, - kwargs.get("num_nodes_to_cache", 0), - 1, - fake_zmq_port, # Initial port, can be updated at runtime - "", - "", - ) + self._init_params = { + "metric_enum": metric_enum, + "full_index_prefix": full_index_prefix, + "num_threads": self.num_threads, + "num_nodes_to_cache": kwargs.get("num_nodes_to_cache", 0), + "cache_mechanism": 1, + "pq_prefix": "", + "partition_prefix": "", + } + self._diskannpy = diskannpy + self._current_zmq_port = None + self._index = None + 
logger.debug("DiskANN searcher initialized (index will be loaded on first search)") + + def _ensure_index_loaded(self, zmq_port: int): + """Ensure the index is loaded with the correct zmq_port.""" + if self._index is None or self._current_zmq_port != zmq_port: + # Need to (re)load the index with the correct zmq_port + with suppress_cpp_output_if_needed(): + if self._index is not None: + logger.debug(f"Reloading DiskANN index with new zmq_port: {zmq_port}") + else: + logger.debug(f"Loading DiskANN index with zmq_port: {zmq_port}") + + self._index = self._diskannpy.StaticDiskFloatIndex( + self._init_params["metric_enum"], + self._init_params["full_index_prefix"], + self._init_params["num_threads"], + self._init_params["num_nodes_to_cache"], + self._init_params["cache_mechanism"], + zmq_port, + self._init_params["pq_prefix"], + self._init_params["partition_prefix"], + ) + self._current_zmq_port = zmq_port def search( self, @@ -212,14 +238,15 @@ class DiskannSearcher(BaseSearcher): Returns: Dict with 'labels' (list of lists) and 'distances' (ndarray) """ - # Handle zmq_port compatibility: DiskANN can now update port at runtime + # Handle zmq_port compatibility: Ensure index is loaded with correct port if recompute_embeddings: if zmq_port is None: raise ValueError("zmq_port must be provided if recompute_embeddings is True") - current_port = self._index.get_zmq_port() - if zmq_port != current_port: - logger.debug(f"Updating DiskANN zmq_port from {current_port} to {zmq_port}") - self._index.set_zmq_port(zmq_port) + self._ensure_index_loaded(zmq_port) + else: + # If not recomputing, we still need an index, use a default port + if self._index is None: + self._ensure_index_loaded(6666) # Default port when not recomputing # DiskANN doesn't support "proportional" strategy if pruning_strategy == "proportional": diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py index a6bd852..e1afb36 100644 --- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py +++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py @@ -28,6 +28,12 @@ def get_metric_map(): } +def normalize_l2(data: np.ndarray) -> np.ndarray: + norms = np.linalg.norm(data, axis=1, keepdims=True) + norms[norms == 0] = 1 # Avoid division by zero + return data / norms + + @register_backend("hnsw") class HNSWBackend(LeannBackendFactoryInterface): @staticmethod @@ -76,7 +82,7 @@ class HNSWBuilder(LeannBackendBuilderInterface): index.hnsw.efConstruction = self.efConstruction if self.distance_metric.lower() == "cosine": - faiss.normalize_L2(data) + data = normalize_l2(data) index.add(data.shape[0], faiss.swig_ptr(data)) index_file = index_dir / f"{index_prefix}.index" @@ -186,7 +192,7 @@ class HNSWSearcher(BaseSearcher): if query.dtype != np.float32: query = query.astype(np.float32) if self.distance_metric == "cosine": - faiss.normalize_L2(query) + query = normalize_l2(query) params = faiss.SearchParametersHNSW() if zmq_port is not None: diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 66bb7a5..5848b87 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -7,6 +7,7 @@ import json import logging import pickle import time +import warnings from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal @@ -163,6 +164,76 @@ class LeannBuilder: self.embedding_model = embedding_model self.dimensions = dimensions self.embedding_mode 
= embedding_mode + + # Check if we need to use cosine distance for normalized embeddings + normalized_embeddings_models = { + # OpenAI models + ("openai", "text-embedding-ada-002"), + ("openai", "text-embedding-3-small"), + ("openai", "text-embedding-3-large"), + # Voyage AI models + ("voyage", "voyage-2"), + ("voyage", "voyage-3"), + ("voyage", "voyage-large-2"), + ("voyage", "voyage-multilingual-2"), + ("voyage", "voyage-code-2"), + # Cohere models + ("cohere", "embed-english-v3.0"), + ("cohere", "embed-multilingual-v3.0"), + ("cohere", "embed-english-light-v3.0"), + ("cohere", "embed-multilingual-light-v3.0"), + } + + # Also check for patterns in model names + is_normalized = False + current_model_lower = embedding_model.lower() + current_mode_lower = embedding_mode.lower() + + # Check exact matches + for mode, model in normalized_embeddings_models: + if (current_mode_lower == mode and current_model_lower == model) or ( + mode in current_mode_lower and model in current_model_lower + ): + is_normalized = True + break + + # Check patterns + if not is_normalized: + # OpenAI patterns + if "openai" in current_mode_lower or "openai" in current_model_lower: + if any( + pattern in current_model_lower + for pattern in ["text-embedding", "ada", "3-small", "3-large"] + ): + is_normalized = True + # Voyage patterns + elif "voyage" in current_mode_lower or "voyage" in current_model_lower: + is_normalized = True + # Cohere patterns + elif "cohere" in current_mode_lower or "cohere" in current_model_lower: + if "embed" in current_model_lower: + is_normalized = True + + # Handle distance metric + if is_normalized and "distance_metric" not in backend_kwargs: + backend_kwargs["distance_metric"] = "cosine" + warnings.warn( + f"Detected normalized embeddings model '{embedding_model}' with mode '{embedding_mode}'. " + f"Automatically setting distance_metric='cosine' for optimal performance. " + f"Normalized embeddings (L2 norm = 1) should use cosine similarity instead of MIPS.", + UserWarning, + stacklevel=2, + ) + elif is_normalized and backend_kwargs.get("distance_metric", "").lower() != "cosine": + current_metric = backend_kwargs.get("distance_metric", "mips") + warnings.warn( + f"Warning: Using '{current_metric}' distance metric with normalized embeddings model " + f"'{embedding_model}' may lead to suboptimal search results. 
" + f"Consider using 'cosine' distance metric for better performance.", + UserWarning, + stacklevel=2, + ) + self.backend_kwargs = backend_kwargs self.chunks: list[dict[str, Any]] = [] diff --git a/uv.lock b/uv.lock index a46abc5..0a32b65 100644 --- a/uv.lock +++ b/uv.lock @@ -1847,7 +1847,7 @@ wheels = [ [[package]] name = "leann-backend-diskann" -version = "0.1.13" +version = "0.1.14" source = { editable = "packages/leann-backend-diskann" } dependencies = [ { name = "leann-core" }, @@ -1858,14 +1858,14 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.1.13" }, + { name = "leann-core", specifier = "==0.1.14" }, { name = "numpy" }, { name = "protobuf", specifier = ">=3.19.0" }, ] [[package]] name = "leann-backend-hnsw" -version = "0.1.13" +version = "0.1.14" source = { editable = "packages/leann-backend-hnsw" } dependencies = [ { name = "leann-core" }, @@ -1877,7 +1877,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.1.13" }, + { name = "leann-core", specifier = "==0.1.14" }, { name = "msgpack", specifier = ">=1.0.0" }, { name = "numpy" }, { name = "pyzmq", specifier = ">=23.0.0" }, @@ -1885,7 +1885,7 @@ requires-dist = [ [[package]] name = "leann-core" -version = "0.1.13" +version = "0.1.14" source = { editable = "packages/leann-core" } dependencies = [ { name = "accelerate" }, From 6f5d5e4a77d159182a76278bdce6b75041d2b356 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Sun, 27 Jul 2025 21:50:09 -0700 Subject: [PATCH 6/9] fix some readme --- README.md | 9 +++++---- docs/CONTRIBUTING.md | 6 +++--- docs/normalized_embeddings.md | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b6580d1..4683957 100755 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ LEANN achieves this through *graph-based selective recomputation* with *high-deg 🪶 **Lightweight:** Graph-based recomputation eliminates heavy embedding storage, while smart graph pruning and CSR format minimize graph storage overhead. Always less storage, less memory usage! +📦 **Portable:** Transfer your entire knowledge base between devices (even with others) with minimal cost - your personal AI memory travels with you. + 📈 **Scalability:** Handle messy personal data that would crash traditional vector DBs, easily managing your growing personalized data and agent generated memory! ✨ **No Accuracy Loss:** Maintain the same search quality as heavyweight solutions while using 97% less storage. @@ -85,7 +87,7 @@ uv sync -## Quick Start in 30s +## Quick Start Our declarative API makes RAG as easy as writing a config file. [Try in this ipynb file →](demo.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) @@ -116,7 +118,6 @@ LEANN supports RAG on various data sources including documents (.pdf, .txt, .md) > **Generation Model Setup** -> > LEANN supports multiple LLM providers for text generation (OpenAI API, HuggingFace, Ollama).

@@ -467,10 +468,10 @@ If you find Leann useful, please cite: ## ✨ [Detailed Features →](docs/features.md) -## 🤝 [Contributing →](docs/contributing.md) +## 🤝 [CONTRIBUTING →](docs/CONTRIBUTING.md) -## [FAQ →](docs/faq.md) +## ❓ [FAQ →](docs/faq.md) ## 📈 [Roadmap →](docs/roadmap.md) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 67331bb..4a37e26 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -26,7 +26,7 @@ We welcome contributions! Leann is built by the community, for the community. ``` 3. **Install system dependencies**: - + **macOS:** ```bash brew install llvm libomp boost protobuf zeromq pkgconf @@ -42,7 +42,7 @@ We welcome contributions! Leann is built by the community, for the community. ```bash # macOS CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync - + # Ubuntu/Debian uv sync ``` @@ -182,7 +182,7 @@ Make sure your code passes these checks locally before pushing! ```bash git commit -m "feat: add new search algorithm" ``` - + Follow [Conventional Commits](https://www.conventionalcommits.org/): - `feat:` for new features - `fix:` for bug fixes diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md index d6f285e..46213e5 100644 --- a/docs/normalized_embeddings.md +++ b/docs/normalized_embeddings.md @@ -72,4 +72,4 @@ Using the wrong distance metric with normalized embeddings can lead to: - **Incorrect ranking** of search results - **Suboptimal performance** compared to using the correct metric -For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py). \ No newline at end of file +For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py). From e9ee68747228d9fd8e55b36641f75c6dcd25891e Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Sun, 27 Jul 2025 21:56:05 -0700 Subject: [PATCH 7/9] nit: fix readme --- README.md | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4683957..5165026 100755 --- a/README.md +++ b/README.md @@ -41,11 +41,18 @@ LEANN achieves this through *graph-based selective recomputation* with *high-deg ## Installation -> **Prerequisites:** Install uv first if you don't have it: -> ```bash -> curl -LsSf https://astral.sh/uv/install.sh | sh -> ``` -> 📖 [Detailed uv installation methods →](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) +
+<details>
+<summary>📦 Prerequisites: Install uv (if you don't have it)</summary>
+
+Install uv first if you don't have it:
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+📖 [Detailed uv installation methods →](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)
+
+</details>
LEANN provides two installation methods: **pip install** (quick and easy) and **build from source** (recommended for development). @@ -87,13 +94,14 @@ uv sync -## Quick Start +## Quick Star Our declarative API makes RAG as easy as writing a config file. -[Try in this ipynb file →](demo.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) [Try in this ipynb file →](demo.ipynb) ```python -from leann import LeannBuilder, LeannSearcher, LeannChat +from leann import LeannBuilder, LeannSearcher, LeannCha from pathlib import Path INDEX_PATH = str(Path("./").resolve() / "demo.leann") @@ -260,7 +268,7 @@ The default Chrome profile path is configured for a typical macOS setup. If you 1. Open Terminal 2. Run: `ls ~/Library/Application\ Support/Google/Chrome/` 3. Look for folders like "Default", "Profile 1", "Profile 2", etc. -4. Use the full path as your `--chrome-profile` argument +4. Use the full path as your `--chrome-profile` argumen **Common Chrome profile locations:** - macOS: `~/Library/Application Support/Google/Chrome/Default` @@ -303,7 +311,7 @@ sudo packages/wechat-exporter/wechattweak-cli install **Troubleshooting:** - **Installation issues**: Check the [WeChatTweak-CLI issues page](https://github.com/sunnyyoung/WeChatTweak-CLI/issues/41) -- **Export errors**: If you encounter the error below, try restarting WeChat +- **Export errors**: If you encounter the error below, try restarting WeCha ``` Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed. Failed to find or export WeChat data. Exiting. 
@@ -358,7 +366,7 @@ leann search my-docs "machine learning concepts" leann ask my-docs --interactive # List all your indexes -leann list +leann lis ``` **Key CLI features:** @@ -443,7 +451,7 @@ Options: ```bash uv pip install -e ".[dev]" # Install dev dependencies -python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR dataset +python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR datase python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index # Wikipedia ``` From b2eba23e219d79870d01368d2c4c55a06c5b11b6 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 28 Jul 2025 05:05:30 +0000 Subject: [PATCH 8/9] chore: release v0.1.15 --- packages/leann-backend-diskann/pyproject.toml | 4 ++-- packages/leann-backend-hnsw/pyproject.toml | 4 ++-- packages/leann-core/pyproject.toml | 2 +- packages/leann/pyproject.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml index f8f38bc..ae3b3a9 100644 --- a/packages/leann-backend-diskann/pyproject.toml +++ b/packages/leann-backend-diskann/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build" [project] name = "leann-backend-diskann" -version = "0.1.14" -dependencies = ["leann-core==0.1.14", "numpy", "protobuf>=3.19.0"] +version = "0.1.15" +dependencies = ["leann-core==0.1.15", "numpy", "protobuf>=3.19.0"] [tool.scikit-build] # Key: simplified CMake path diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml index 82a46b8..b989d6d 100644 --- a/packages/leann-backend-hnsw/pyproject.toml +++ b/packages/leann-backend-hnsw/pyproject.toml @@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build" [project] name = "leann-backend-hnsw" -version = "0.1.14" +version = "0.1.15" description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." dependencies = [ - "leann-core==0.1.14", + "leann-core==0.1.15", "numpy", "pyzmq>=23.0.0", "msgpack>=1.0.0", diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml index a8a9983..3b66c69 100644 --- a/packages/leann-core/pyproject.toml +++ b/packages/leann-core/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "leann-core" -version = "0.1.14" +version = "0.1.15" description = "Core API and plugin system for LEANN" readme = "README.md" requires-python = ">=3.9" diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml index 74ab903..6727621 100644 --- a/packages/leann/pyproject.toml +++ b/packages/leann/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "leann" -version = "0.1.14" +version = "0.1.15" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" readme = "README.md" requires-python = ">=3.9" From 261006c36ac4b37b10904822133105bd53cfcea9 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sun, 27 Jul 2025 22:07:36 -0700 Subject: [PATCH 9/9] docs: revert --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5165026..91b55c6 100755 --- a/README.md +++ b/README.md @@ -94,14 +94,14 @@ uv sync -## Quick Star +## Quick Start Our declarative API makes RAG as easy as writing a config file. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) [Try in this ipynb file →](demo.ipynb) ```python -from leann import LeannBuilder, LeannSearcher, LeannCha +from leann import LeannBuilder, LeannSearcher, LeannChat from pathlib import Path INDEX_PATH = str(Path("./").resolve() / "demo.leann") @@ -268,7 +268,7 @@ The default Chrome profile path is configured for a typical macOS setup. If you 1. Open Terminal 2. Run: `ls ~/Library/Application\ Support/Google/Chrome/` 3. Look for folders like "Default", "Profile 1", "Profile 2", etc. -4. Use the full path as your `--chrome-profile` argumen +4. Use the full path as your `--chrome-profile` argument **Common Chrome profile locations:** - macOS: `~/Library/Application Support/Google/Chrome/Default` @@ -311,7 +311,7 @@ sudo packages/wechat-exporter/wechattweak-cli install **Troubleshooting:** - **Installation issues**: Check the [WeChatTweak-CLI issues page](https://github.com/sunnyyoung/WeChatTweak-CLI/issues/41) -- **Export errors**: If you encounter the error below, try restarting WeCha +- **Export errors**: If you encounter the error below, try restarting WeChat ``` Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed. Failed to find or export WeChat data. Exiting. @@ -366,7 +366,7 @@ leann search my-docs "machine learning concepts" leann ask my-docs --interactive # List all your indexes -leann lis +leann list ``` **Key CLI features:** @@ -451,7 +451,7 @@ Options: ```bash uv pip install -e ".[dev]" # Install dev dependencies -python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR datase +python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR dataset python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index # Wikipedia ```
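A closing note on the metric fix in PATCH 5: auto-switching to `distance_metric="cosine"` (and manually L2-normalizing in the HNSW backend) is safe because, for unit vectors, the inner product *is* the cosine similarity. The sketch below demonstrates that equivalence; `normalize_l2` is copied verbatim from the `hnsw_backend.py` hunk above, while the random matrix is only a stand-in for real model embeddings:

```python
import numpy as np

# Helper copied from the hnsw_backend.py hunk in PATCH 5
def normalize_l2(data: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(data, axis=1, keepdims=True)
    norms[norms == 0] = 1  # Avoid division by zero
    return data / norms

raw = np.random.rand(8, 768).astype(np.float32)  # stand-in embeddings
unit = normalize_l2(raw)

# Every row is now a unit vector...
assert np.allclose(np.linalg.norm(unit, axis=1), 1.0, atol=1e-5)

# ...so maximum inner product search ranks candidates exactly as cosine
# similarity does: <u, v> == cos(u, v) whenever ||u|| = ||v|| = 1.
query, candidates = unit[0], unit[1:]
inner = candidates @ query
cosine = inner / (np.linalg.norm(candidates, axis=1) * np.linalg.norm(query))
assert np.allclose(inner, cosine, atol=1e-5)
```

This identity is also why the detection logic added to `api.py` only needs to flip the metric for providers whose models already return normalized vectors (OpenAI, Voyage, Cohere), while non-normalized models such as `facebook/contriever` keep MIPS.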