diff --git a/.gitmodules b/.gitmodules index 8c49b3e..1899ae5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,13 @@ [submodule "packages/leann-backend-hnsw/third_party/faiss"] path = packages/leann-backend-hnsw/third_party/faiss url = https://github.com/yichuan520030910320/faiss.git +[submodule "packages/leann-backend-hnsw/third_party/msgpack-c"] + path = packages/leann-backend-hnsw/third_party/msgpack-c + url = https://github.com/msgpack/msgpack-c.git + branch = cpp_master +[submodule "packages/leann-backend-hnsw/third_party/cppzmq"] + path = packages/leann-backend-hnsw/third_party/cppzmq + url = https://github.com/zeromq/cppzmq.git +[submodule "packages/leann-backend-hnsw/third_party/libzmq"] + path = packages/leann-backend-hnsw/third_party/libzmq + url = https://github.com/zeromq/libzmq.git diff --git a/README.md b/README.md index 83f0c83..11f2468 100755 --- a/README.md +++ b/README.md @@ -28,13 +28,15 @@ ### ๐ŸŽฏ Why Leann? Traditional RAG systems face a fundamental trade-off: + - **๐Ÿ’พ Storage**: Storing embeddings for millions of documents requires massive disk space - **๐Ÿ”„ Freshness**: Pre-computed embeddings become stale when documents change - **๐Ÿ’ฐ Cost**: Vector databases are expensive to scale **Leann solves this by:** + - โœ… **Zero embedding storage** - Only graph structure is persisted -- โœ… **Real-time computation** - Embeddings computed on-demand with ms latency +- โœ… **Real-time computation** - Embeddings computed on-demand with ms latency - โœ… **Memory efficient** - Runs on consumer hardware (8GB RAM) - โœ… **Always fresh** - No stale embeddings, ever @@ -46,6 +48,18 @@ Traditional RAG systems face a fundamental trade-off: git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann cd leann git submodule update --init --recursive +``` + +**macOS:** +```bash +brew install llvm libomp +export CC=$(brew --prefix llvm)/bin/clang +export CXX=$(brew --prefix llvm)/bin/clang++ +uv sync +``` + +**Linux (Ubuntu/Debian):** +```bash uv 
sync ``` @@ -78,28 +92,20 @@ uv run examples/document_search.py **PDF RAG Demo (using LlamaIndex for document parsing and Leann for indexing/search)** This demo showcases how to build a RAG system for PDF documents using Leann. -1. Place your PDF files (and other supported formats like .docx, .pptx, .xlsx) into the `examples/data/` directory. -2. Ensure you have an `OPENAI_API_KEY` set in your environment variables or in a `.env` file for the LLM to function. + +1. Place your PDF files (and other supported formats like .docx, .pptx, .xlsx) into the `examples/data/` directory. +2. Ensure you have an `OPENAI_API_KEY` set in your environment variables or in a `.env` file for the LLM to function. ```bash uv run examples/main_cli_example.py ``` -## โš™๏ธ Developer Build Instructions (macOS/Linux) - -If you are building or modifying the C++ backends (e.g., DiskANN, HNSW), please ensure the following dependencies are installed: - -```bash -brew install boost protobuf zeromq -``` - -> On Linux, use your package manager (e.g., `apt install libboost-all-dev protobuf-compiler libprotobuf-dev libzmq3-dev`). 
- ### Regenerating Protobuf Files + If you modify any `.proto` files (such as `embedding.proto`), or if you see errors about protobuf version mismatch, **regenerate the C++ protobuf files** to match your installed version: ```bash -# From the leann/packages/leann-backend-diskann directory: +cd packages/leann-backend-diskann protoc --cpp_out=third_party/DiskANN/include --proto_path=third_party embedding.proto protoc --cpp_out=third_party/DiskANN/src --proto_path=third_party embedding.proto ``` @@ -109,6 +115,7 @@ This ensures the generated files are compatible with your system's protobuf libr ## โœจ Features ### ๐Ÿ”ฅ Core Features + - **๐Ÿ“Š Multiple Distance Functions**: L2, Cosine, MIPS (Maximum Inner Product Search) - **๐Ÿ—๏ธ Pluggable Backends**: DiskANN, HNSW/FAISS with unified API - **๐Ÿ”„ Real-time Embeddings**: Dynamic computation using optimized ZMQ servers @@ -116,6 +123,7 @@ This ensures the generated files are compatible with your system's protobuf libr - **๐ŸŽฏ Graph Pruning**: Advanced techniques for memory-efficient search ### ๐Ÿ› ๏ธ Technical Highlights + - **Zero-copy operations** for maximum performance - **SIMD-optimized** distance computations (AVX2/AVX512) - **Async embedding pipeline** with batched processing @@ -123,6 +131,7 @@ This ensures the generated files are compatible with your system's protobuf libr - **Recompute mode** for highest accuracy scenarios ### ๐ŸŽจ Developer Experience + - **Simple Python API** - Get started in minutes - **Extensible backend system** - Easy to add new algorithms - **Comprehensive examples** - From basic usage to production deployment @@ -132,19 +141,19 @@ This ensures the generated files are compatible with your system's protobuf libr ### Memory Usage Comparison -| System | 1M Documents | 10M Documents | 100M Documents | -|--------|-------------|---------------|----------------| -| Traditional Vector DB | 3.1 GB | 31 GB | 310 GB | -| **Leann** | **180 MB** | **1.2 GB** | **8.4 GB** | -| **Reduction** | 
**94.2%** | **96.1%** | **97.3%** | +| System | 1M Documents | 10M Documents | 100M Documents | +| --------------------- | ---------------- | ---------------- | ---------------- | +| Traditional Vector DB | 3.1 GB | 31 GB | 310 GB | +| **Leann** | **180 MB** | **1.2 GB** | **8.4 GB** | +| **Reduction** | **94.2%** | **96.1%** | **97.3%** | ### Query Performance -| Backend | Index Size | Query Time | Recall@10 | -|---------|------------|------------|-----------| -| DiskANN | 1M docs | 12ms | 0.95 | -| DiskANN + Recompute | 1M docs | 145ms | 0.98 | -| HNSW | 1M docs | 8ms | 0.93 | +| Backend | Index Size | Query Time | Recall@10 | +| ------------------- | ---------- | ---------- | --------- | +| DiskANN | 1M docs | 12ms | 0.95 | +| DiskANN + Recompute | 1M docs | 145ms | 0.98 | +| HNSW | 1M docs | 8ms | 0.93 | *Benchmarks run on AMD Ryzen 7 with 32GB RAM* @@ -166,26 +175,29 @@ This ensures the generated files are compatible with your system's protobuf libr ### Key Components 1. **๐Ÿง  Embedding Engine**: Real-time transformer inference with caching -2. **๐Ÿ“Š Graph Index**: Memory-efficient navigation structures +2. **๐Ÿ“Š Graph Index**: Memory-efficient navigation structures 3. **๐Ÿ”„ Search Coordinator**: Orchestrates embedding + graph search 4. 
**โšก Backend Adapters**: Pluggable algorithm implementations ## ๐ŸŽ“ Supported Models & Backends ### ๐Ÿค– Embedding Models + - **sentence-transformers/all-mpnet-base-v2** (default) - **sentence-transformers/all-MiniLM-L6-v2** (lightweight) - Any HuggingFace sentence-transformer model - Custom model support via API -### ๐Ÿ”ง Search Backends +### ๐Ÿ”ง Search Backends + - **DiskANN**: Microsoft's billion-scale ANN algorithm - **HNSW**: Hierarchical Navigable Small World graphs - **Coming soon**: ScaNN, Faiss-IVF, NGT ### ๐Ÿ“ Distance Functions + - **L2**: Euclidean distance for precise similarity -- **Cosine**: Angular similarity for normalized vectors +- **Cosine**: Angular similarity for normalized vectors - **MIPS**: Maximum Inner Product Search for recommendation systems ## ๐Ÿ”ฌ Paper @@ -209,6 +221,7 @@ If you find Leann useful, please cite: ## ๐ŸŒ Use Cases ### ๐Ÿ’ผ Enterprise RAG + ```python # Handle millions of documents with limited resources builder = LeannBuilder( @@ -219,7 +232,8 @@ builder = LeannBuilder( ) ``` -### ๐Ÿ”ฌ Research & Experimentation +### ๐Ÿ”ฌ Research & Experimentation + ```python # Quick prototyping with different algorithms for backend in ["diskann", "hnsw"]: @@ -228,6 +242,7 @@ for backend in ["diskann", "hnsw"]: ``` ### ๐Ÿš€ Real-time Applications + ```python # Sub-second response times chat = LeannChat("knowledge.leann") @@ -240,6 +255,7 @@ response = chat.ask("What is quantum computing?") We welcome contributions! Leann is built by the community, for the community. ### Ways to Contribute + - ๐Ÿ› **Bug Reports**: Found an issue? Let us know! - ๐Ÿ’ก **Feature Requests**: Have an idea? We'd love to hear it! - ๐Ÿ”ง **Code Contributions**: PRs welcome for all skill levels @@ -247,14 +263,17 @@ We welcome contributions! Leann is built by the community, for the community. 
- ๐Ÿงช **Benchmarks**: Share your performance results ### Development Setup + ```bash -git clone https://github.com/yourname/leann +git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann cd leann +git submodule update --init --recursive uv sync --dev uv run pytest tests/ ``` ### Quick Tests + ```bash # Sanity check all distance functions uv run python tests/sanity_checks/test_distance_functions.py @@ -262,17 +281,21 @@ uv run python tests/sanity_checks/test_distance_functions.py # Verify L2 implementation uv run python tests/sanity_checks/test_l2_verification.py ``` + ## โ“ FAQ ### Common Issues #### NCCL Topology Error + **Problem**: You encounter `ncclTopoComputePaths` error during document processing: + ``` ncclTopoComputePaths (system=, comm=comm@entry=0x5555a82fa3c0) at graph/paths.cc:688 ``` **Solution**: Set these environment variables before running your script: + ```bash export NCCL_TOPO_DUMP_FILE=/tmp/nccl_topo.xml export NCCL_DEBUG=INFO @@ -285,18 +308,21 @@ export NCCL_SOCKET_IFNAME=ens5 ## ๐Ÿ“ˆ Roadmap ### ๐ŸŽฏ Q1 2024 -- [x] DiskANN backend with MIPS/L2/Cosine support -- [x] HNSW backend integration -- [x] Real-time embedding pipeline -- [x] Memory-efficient graph pruning + +- [X] DiskANN backend with MIPS/L2/Cosine support +- [X] HNSW backend integration +- [X] Real-time embedding pipeline +- [X] Memory-efficient graph pruning ### ๐Ÿš€ Q2 2024 + - [ ] Distributed search across multiple nodes - [ ] ScaNN backend support - [ ] Advanced caching strategies - [ ] Kubernetes deployment guides ### ๐ŸŒŸ Q3 2024 + - [ ] GPU-accelerated embedding computation - [ ] Approximate distance functions - [ ] Integration with LangChain/LlamaIndex @@ -318,7 +344,7 @@ MIT License - see [LICENSE](LICENSE) for details. 
## ๐Ÿ™ Acknowledgments - **Microsoft Research** for the DiskANN algorithm -- **Meta AI** for FAISS and optimization insights +- **Meta AI** for FAISS and optimization insights - **HuggingFace** for the transformer ecosystem - **Our amazing contributors** who make this possible @@ -330,4 +356,5 @@ MIT License - see [LICENSE](LICENSE) for details.

Made with ❤️ by the Leann team -

\ No newline at end of file +

+
diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN
index 5108fe8..578542f 160000
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
@@ -1 +1 @@
-Subproject commit 5108fe81a3ac465f35729a16bca5f4f8c3538c2b
+Subproject commit 578542f2d46b6766a6bd0b0012b19b038c576321
diff --git a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt
index 6865da3..9ab1f45 100644
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -2,6 +2,56 @@
 cmake_minimum_required(VERSION 3.24)
 project(leann_backend_hnsw_wrapper)
 
+# Point FindOpenMP at Homebrew's libomp on macOS.
+# Homebrew installs under /opt/homebrew on Apple Silicon but /usr/local on
+# Intel Macs, so ask `brew --prefix libomp` instead of hard-coding one prefix.
+# Pass -DLEANN_LIBOMP_ROOT=<dir> to skip detection and use a specific install.
+if(APPLE)
+  if(NOT LEANN_LIBOMP_ROOT)
+    find_program(LEANN_BREW_EXECUTABLE brew)
+    if(LEANN_BREW_EXECUTABLE)
+      execute_process(
+        COMMAND "${LEANN_BREW_EXECUTABLE}" --prefix libomp
+        OUTPUT_VARIABLE LEANN_LIBOMP_ROOT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+      )
+    endif()
+  endif()
+  if(NOT LEANN_LIBOMP_ROOT)
+    # brew is unavailable or failed: fall back to the previous hard-coded path.
+    set(LEANN_LIBOMP_ROOT "/opt/homebrew/opt/libomp")
+  endif()
+  if(NOT EXISTS "${LEANN_LIBOMP_ROOT}/lib/libomp.dylib")
+    message(WARNING
+      "libomp.dylib not found under '${LEANN_LIBOMP_ROOT}/lib'; "
+      "install it with 'brew install libomp' or pass -DLEANN_LIBOMP_ROOT=<dir>.")
+  endif()
+  set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${LEANN_LIBOMP_ROOT}/include")
+  set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${LEANN_LIBOMP_ROOT}/include")
+  set(OpenMP_C_LIB_NAMES "omp")
+  set(OpenMP_CXX_LIB_NAMES "omp")
+  set(OpenMP_omp_LIBRARY "${LEANN_LIBOMP_ROOT}/lib/libomp.dylib")
+endif()
+
+# Build ZeroMQ from source (vendored submodule): static library only, with
+# tests, drafts, precompiled headers, perf tools and docs switched off.
+set(ZMQ_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+set(ENABLE_DRAFTS OFF CACHE BOOL "" FORCE)
+set(ENABLE_PRECOMPILED OFF CACHE BOOL "" FORCE)
+set(WITH_PERF_TOOL OFF CACHE BOOL "" FORCE)
+set(WITH_DOCS OFF CACHE BOOL "" FORCE)
+set(BUILD_SHARED OFF CACHE BOOL "" FORCE)
+set(BUILD_STATIC ON CACHE BOOL "" FORCE)
+add_subdirectory(third_party/libzmq)
+
+# Add cppzmq headers
+include_directories(third_party/cppzmq)
+
+# Configure msgpack-c - disable boost dependency manually
+add_compile_definitions(MSGPACK_NO_BOOST)
+include_directories(third_party/msgpack-c/include)
+
 set(FAISS_ENABLE_PYTHON ON CACHE BOOL "" FORCE)
 set(FAISS_ENABLE_GPU OFF CACHE BOOL "" FORCE)
 set(FAISS_ENABLE_EXTRAS OFF CACHE BOOL "" FORCE)
diff --git a/packages/leann-backend-hnsw/third_party/cppzmq b/packages/leann-backend-hnsw/third_party/cppzmq
new file mode 160000
index 0000000..3bcbd9d
--- /dev/null
+++
b/packages/leann-backend-hnsw/third_party/cppzmq @@ -0,0 +1 @@ +Subproject commit 3bcbd9dad2f57180aacd4b4aea292a74f0de7ef4 diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss index 2365db5..2547df4 160000 --- a/packages/leann-backend-hnsw/third_party/faiss +++ b/packages/leann-backend-hnsw/third_party/faiss @@ -1 +1 @@ -Subproject commit 2365db59a7ba253e8b075fbfa43a5c0d15dbda84 +Subproject commit 2547df4377ae097e2eabc9b019c15135b1fea2b4 diff --git a/packages/leann-backend-hnsw/third_party/libzmq b/packages/leann-backend-hnsw/third_party/libzmq new file mode 160000 index 0000000..3e5ce5c --- /dev/null +++ b/packages/leann-backend-hnsw/third_party/libzmq @@ -0,0 +1 @@ +Subproject commit 3e5ce5c1cd75bd93b2ab51d98e0239eb8628b953 diff --git a/packages/leann-backend-hnsw/third_party/msgpack-c b/packages/leann-backend-hnsw/third_party/msgpack-c new file mode 160000 index 0000000..9b801f0 --- /dev/null +++ b/packages/leann-backend-hnsw/third_party/msgpack-c @@ -0,0 +1 @@ +Subproject commit 9b801f087ab7434f2ab1ab3c0f48a966c19d3b70