Merge branch 'main' into feature/graph-partition-support

2025-08-07 23:57:28 -07:00
parent 0ec00e1a60 5567302316
commit a8421c0475
6 changed files with 132 additions and 78 deletions
--- a/README.md
+++ b/README.md
@@ -16,9 +16,7 @@ LEANN is an innovative vector database that democratizes personal AI. Transform
 LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Illustration Fig →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)
-**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can search your **[file system](#-personal-data-manager-process-any-documents-pdf-txt-md)**, **[emails](#-your-personal-email-secretary-rag-on-apple-mail)**, **[browser history](#-time-machine-for-the-web-rag-your-entire-browser-history)**, **[chat history](#-wechat-detective-unlock-your-golden-memories)**, or external knowledge bases (i.e., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.
+**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can search your **[codebase](#-claude-code-integration-transform-your-development-workflow)**, **[file system](#-personal-data-manager-process-any-documents-pdf-txt-md)**, **[emails](#-your-personal-email-secretary-rag-on-apple-mail)**, **[browser history](#-time-machine-for-the-web-rag-your-entire-browser-history)**, **[chat history](#-wechat-detective-unlock-your-golden-memories)**, or external knowledge bases (e.g., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.
 > **🚀 Claude Code Integration!** LEANN now provides native MCP integration for Claude Code users. Index your codebase and get intelligent code assistance directly in Claude Code. [Setup Guide →](packages/leann-mcp/README.md)
@@ -213,6 +211,30 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
 </details>
 ### 🚀 Claude Code Integration: Transform Your Development Workflow!
 **The future of code assistance is here.** Transform your development workflow with LEANN's native MCP integration for Claude Code. Index your entire codebase and get intelligent code assistance directly in your IDE.
 <p align="center">
  <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue?style=flat-square" alt="MCP Integration">
  <a href="https://github.com/yichuan-w/LEANN/tree/feature/graph-partition-support?tab=readme-ov-file#rag-on-everything"><img src="https://img.shields.io/twitter/url?url=https%3A%2F%2Fgithub.com%2Fyichuan-w%2FLEANN&style=social" alt="Twitter"></a>
 </p>
 **Key features:**
 - 🔍 **Semantic code search** across your entire project
 - 📚 **Context-aware assistance** for debugging and development
 - 🚀 **Zero-config setup** with automatic language detection
 - 🔒 **Complete privacy** - your code never leaves your machine
 ```bash
 # Install LEANN globally for MCP integration
 uv tool install leann-core
 # Setup is automatic - just start using Claude Code!
 ```
 **Ready to supercharge your coding?** [Complete Setup Guide →](packages/leann-mcp/README.md)
 ### 📄 Personal Data Manager: Process Any Documents (`.pdf`, `.txt`, `.md`)!
 Ask questions directly about your personal PDFs, documents, and any directory containing your files!
@@ -417,7 +439,6 @@ Once the index is built, you can ask questions like:
 </details>
 ## 🖥️ Command Line Interface
 LEANN includes a powerful CLI for document processing and search. Perfect for quick document indexing and interactive chat.
--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "leann-backend-diskann"
-version = "0.2.1"
+version = "0.2.2"
-dependencies = ["leann-core==0.2.1", "numpy", "protobuf>=3.19.0"]
+dependencies = ["leann-core==0.2.2", "numpy", "protobuf>=3.19.0"]
 [tool.scikit-build]
 # Key: simplified CMake path
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "leann-backend-hnsw"
-version = "0.2.1"
+version = "0.2.2"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.2.1",
+    "leann-core==0.2.2",
    "numpy",
    "pyzmq>=23.0.0",
    "msgpack>=1.0.0",
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann-core"
-version = "0.2.1"
+version = "0.2.2"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -74,10 +74,11 @@ class LeannCLI:
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
 Examples:
-  leann build my-docs --docs ./documents    # Build index named my-docs
+  leann build my-docs --docs ./documents                    # Build index named my-docs
-  leann search my-docs "query"             # Search in my-docs index
+  leann build my-ppts --docs ./ --file-types .pptx,.pdf    # Index only PowerPoint and PDF files
-  leann ask my-docs "question"             # Ask my-docs index
+  leann search my-docs "query"                             # Search in my-docs index
-  leann list                              # List all stored indexes
+  leann ask my-docs "question"                             # Ask my-docs index
  leann list                                              # List all stored indexes
            """,
        )
@@ -99,6 +100,11 @@ Examples:
        build_parser.add_argument("--num-threads", type=int, default=1)
        build_parser.add_argument("--compact", action="store_true", default=True)
        build_parser.add_argument("--recompute", action="store_true", default=True)
        build_parser.add_argument(
            "--file-types",
            type=str,
            help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
        )
        # Search command
        search_parser = subparsers.add_parser("search", help="Search documents")
@@ -108,7 +114,12 @@ Examples:
        search_parser.add_argument("--complexity", type=int, default=64)
        search_parser.add_argument("--beam-width", type=int, default=1)
        search_parser.add_argument("--prune-ratio", type=float, default=0.0)
-        search_parser.add_argument("--recompute-embeddings", action="store_true")
+        search_parser.add_argument(
            "--recompute-embeddings",
            action="store_true",
            default=True,
            help="Recompute embeddings (default: True)",
        )
        search_parser.add_argument(
            "--pruning-strategy",
            choices=["global", "local", "proportional"],
@@ -131,7 +142,12 @@ Examples:
        ask_parser.add_argument("--complexity", type=int, default=32)
        ask_parser.add_argument("--beam-width", type=int, default=1)
        ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
-        ask_parser.add_argument("--recompute-embeddings", action="store_true")
+        ask_parser.add_argument(
            "--recompute-embeddings",
            action="store_true",
            default=True,
            help="Recompute embeddings (default: True)",
        )
        ask_parser.add_argument(
            "--pruning-strategy",
            choices=["global", "local", "proportional"],
@@ -254,8 +270,10 @@ Examples:
                    print(f'  leann search {example_name} "your query"')
                    print(f"  leann ask {example_name} --interactive")
-    def load_documents(self, docs_dir: str):
+    def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
        print(f"Loading documents from {docs_dir}...")
        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")
        # Try to use better PDF parsers first
        documents = []
@@ -287,66 +305,81 @@ Examples:
                documents.extend(default_docs)
        # Load other file types with default reader
-        code_extensions = [
+        if custom_file_types:
-            # Original document types
+            # Parse custom file types from comma-separated string
-            ".txt",
+            code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
-            ".md",
+            # Ensure extensions start with a dot
-            ".docx",
+            code_extensions = [ext if ext.startswith(".") else f".{ext}" for ext in code_extensions]
-            # Code files for Claude Code integration
+        else:
-            ".py",
+            # Use default supported file types
-            ".js",
+            code_extensions = [
-            ".ts",
+                # Original document types
-            ".jsx",
+                ".txt",
-            ".tsx",
+                ".md",
-            ".java",
+                ".docx",
-            ".cpp",
+                ".pptx",
-            ".c",
+                # Code files for Claude Code integration
-            ".h",
+                ".py",
-            ".hpp",
+                ".js",
-            ".cs",
+                ".ts",
-            ".go",
+                ".jsx",
-            ".rs",
+                ".tsx",
-            ".rb",
+                ".java",
-            ".php",
+                ".cpp",
-            ".swift",
+                ".c",
-            ".kt",
+                ".h",
-            ".scala",
+                ".hpp",
-            ".r",
+                ".cs",
-            ".sql",
+                ".go",
-            ".sh",
+                ".rs",
-            ".bash",
+                ".rb",
-            ".zsh",
+                ".php",
-            ".fish",
+                ".swift",
-            ".ps1",
+                ".kt",
-            ".bat",
+                ".scala",
-            # Config and markup files
+                ".r",
-            ".json",
+                ".sql",
-            ".yaml",
+                ".sh",
-            ".yml",
+                ".bash",
-            ".xml",
+                ".zsh",
-            ".toml",
+                ".fish",
-            ".ini",
+                ".ps1",
-            ".cfg",
+                ".bat",
-            ".conf",
+                # Config and markup files
-            ".html",
+                ".json",
-            ".css",
+                ".yaml",
-            ".scss",
+                ".yml",
-            ".less",
+                ".xml",
-            ".vue",
+                ".toml",
-            ".svelte",
+                ".ini",
-            # Data science
+                ".cfg",
-            ".ipynb",
+                ".conf",
-            ".R",
+                ".html",
-            ".py",
+                ".css",
-            ".jl",
+                ".scss",
-        ]
+                ".less",
-        other_docs = SimpleDirectoryReader(
+                ".vue",
-            docs_dir,
+                ".svelte",
-            recursive=True,
+                # Data science
-            encoding="utf-8",
+                ".ipynb",
-            required_exts=code_extensions,
+                ".R",
-        ).load_data(show_progress=True)
+                ".py",
-        documents.extend(other_docs)
+                ".jl",
            ]
        # Try to load other file types, but don't fail if none are found
        try:
            other_docs = SimpleDirectoryReader(
                docs_dir,
                recursive=True,
                encoding="utf-8",
                required_exts=code_extensions,
            ).load_data(show_progress=True)
            documents.extend(other_docs)
        except ValueError as e:
            if "No files found" in str(e):
                print("No additional files found for other supported types.")
            else:
                raise e
        all_texts = []
@@ -424,7 +457,7 @@ Examples:
            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
            return
-        all_texts = self.load_documents(docs_dir)
+        all_texts = self.load_documents(docs_dir, args.file_types)
        if not all_texts:
            print("No documents found")
            return
--- a/packages/leann/pyproject.toml
+++ b/packages/leann/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann"
-version = "0.2.1"
+version = "0.2.2"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"