Merge branch 'main' into fix-mac-intel-build

2025-08-11 01:54:29 -07:00
parent 430969565e 239e35e2e6
commit b5c80edb03
10 changed files with 4021 additions and 3531 deletions
@@ -97,6 +97,7 @@ uv sync

 </details>

+
 ## Quick Start

 Our declarative API makes RAG as easy as writing a config file.
@@ -188,7 +189,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
 --force-rebuild         # Force rebuild index even if it exists

 # Embedding Parameters
--embedding-model MODEL  # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/multilingual-e5-base-mlx
+--embedding-model MODEL  # e.g., facebook/contriever, text-embedding-3-small, mlx-community/Qwen3-Embedding-0.6B-8bit, or nomic-embed-text
 --embedding-mode MODE    # sentence-transformers, openai, mlx, or ollama

 # LLM Parameters (Text generation models)
@@ -222,9 +222,15 @@ python apps/document_rag.py --query "What are the main techniques LEANN explores

 3. **Use MLX on Apple Silicon** (optional optimization):
   ```bash
-   --embedding-mode mlx --embedding-model mlx-community/multilingual-e5-base-mlx
+   --embedding-mode mlx --embedding-model mlx-community/Qwen3-Embedding-0.6B-8bit
   ```
+    MLX might not be the best choice: in our tests it was only 1.3x faster than HF (Hugging Face), so Ollama may be a better choice for embedding generation.

+4. **Use Ollama**:
+   ```bash
+   --embedding-mode ollama --embedding-model nomic-embed-text
+   ```
+   To discover additional embedding models in Ollama, check out https://ollama.com/search?c=embedding or read more about embedding models at https://ollama.com/blog/embedding-models. Be sure to check which model size works best for you.
 ### If Search Quality is Poor

 1. **Increase retrieval count**:
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-diskann"
-version = "0.2.5"
-dependencies = ["leann-core==0.2.5", "numpy", "protobuf>=3.19.0"]
+version = "0.2.7"
+dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]

 [tool.scikit-build]
 # Key: simplified CMake path
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-hnsw"
-version = "0.2.5"
+version = "0.2.7"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.2.5",
+    "leann-core==0.2.7",
    "numpy",
    "pyzmq>=23.0.0",
    "msgpack>=1.0.0",
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann-core"
-version = "0.2.5"
+version = "0.2.7"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -31,6 +31,8 @@ dependencies = [
    "PyPDF2>=3.0.0",
    "pymupdf>=1.23.0",
    "pdfplumber>=0.10.0",
+    "nbconvert>=7.0.0",  # For .ipynb file support
+    "gitignore-parser>=0.1.12",  # For proper .gitignore handling
    "mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
    "mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
 ]
@@ -87,7 +87,9 @@ Examples:

        # Build command
        build_parser = subparsers.add_parser("build", help="Build document index")
-        build_parser.add_argument("index_name", help="Index name")
+        build_parser.add_argument(
+            "index_name", nargs="?", help="Index name (default: current directory name)"
+        )
        build_parser.add_argument(
            "--docs", type=str, default=".", help="Documents directory (default: current directory)"
        )
@@ -202,6 +204,37 @@ Examples:
        with open(global_registry, "w") as f:
            json.dump(projects, f, indent=2)

+    def _build_gitignore_parser(self, docs_dir: str):
+        """Build gitignore parser using gitignore-parser library."""
+        from gitignore_parser import parse_gitignore
+
+        # Try to parse the root .gitignore
+        gitignore_path = Path(docs_dir) / ".gitignore"
+
+        if gitignore_path.exists():
+            try:
+                # NOTE(review): only this root .gitignore is parsed; nested .gitignore files in subdirectories are not loaded here - confirm against gitignore-parser docs
+                matches = parse_gitignore(str(gitignore_path))
+                print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)")
+                return matches
+            except Exception as e:
+                print(f"Warning: Could not parse .gitignore: {e}")
+        else:
+            print("📋 No .gitignore found")
+
+        # Fallback: basic pattern matching for essential files
+        essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
+
+        def basic_matches(file_path):
+            path_parts = Path(file_path).parts
+            return any(part in essential_patterns for part in path_parts)
+
+        return basic_matches
+
+    def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
+        """Check if a file should be excluded using gitignore parser."""
+        return gitignore_matches(str(relative_path))
+
    def list_indexes(self):
        print("Stored LEANN indexes:")

@@ -283,34 +316,49 @@ Examples:
        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")

-        # Try to use better PDF parsers first
+        # Build gitignore parser
+        gitignore_matches = self._build_gitignore_parser(docs_dir)
+
+        # Try to use better PDF parsers first, but only if PDFs are requested
        documents = []
        docs_path = Path(docs_dir)

-        for file_path in docs_path.rglob("*.pdf"):
-            print(f"Processing PDF: {file_path}")
+        # Check if we should process PDFs
+        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types

-            # Try PyMuPDF first (best quality)
-            text = extract_pdf_text_with_pymupdf(str(file_path))
-            if text is None:
-                # Try pdfplumber
-                text = extract_pdf_text_with_pdfplumber(str(file_path))
+        if should_process_pdfs:
+            for file_path in docs_path.rglob("*.pdf"):
+                # Check if file matches any exclude pattern
+                relative_path = file_path.relative_to(docs_path)
+                if self._should_exclude_file(relative_path, gitignore_matches):
+                    continue

-            if text:
-                # Create a simple document structure
-                from llama_index.core import Document
+                print(f"Processing PDF: {file_path}")

-                doc = Document(text=text, metadata={"source": str(file_path)})
-                documents.append(doc)
-            else:
-                # Fallback to default reader
-                print(f"Using default reader for {file_path}")
-                default_docs = SimpleDirectoryReader(
-                    str(file_path.parent),
-                    filename_as_id=True,
-                    required_exts=[file_path.suffix],
-                ).load_data()
-                documents.extend(default_docs)
+                # Try PyMuPDF first (best quality)
+                text = extract_pdf_text_with_pymupdf(str(file_path))
+                if text is None:
+                    # Try pdfplumber
+                    text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+                if text:
+                    # Create a simple document structure
+                    from llama_index.core import Document
+
+                    doc = Document(text=text, metadata={"source": str(file_path)})
+                    documents.append(doc)
+                else:
+                    # Fallback to default reader
+                    print(f"Using default reader for {file_path}")
+                    try:
+                        default_docs = SimpleDirectoryReader(
+                            str(file_path.parent),
+                            filename_as_id=True,
+                            required_exts=[file_path.suffix],
+                        ).load_data()
+                        documents.extend(default_docs)
+                    except Exception as e:
+                        print(f"Warning: Could not process {file_path}: {e}")

        # Load other file types with default reader
        if custom_file_types:
@@ -376,13 +424,34 @@ Examples:
            ]
        # Try to load other file types, but don't fail if none are found
        try:
+            # Create a custom file filter function using the gitignore matcher built earlier
+            def file_filter(file_path: str) -> bool:
+                """Return True if file should be included (not excluded)"""
+                try:
+                    docs_path_obj = Path(docs_dir)
+                    file_path_obj = Path(file_path)
+                    relative_path = file_path_obj.relative_to(docs_path_obj)
+                    return not self._should_exclude_file(relative_path, gitignore_matches)
+                except (ValueError, OSError):
+                    return True  # Include files that can't be processed
+
            other_docs = SimpleDirectoryReader(
                docs_dir,
                recursive=True,
                encoding="utf-8",
                required_exts=code_extensions,
+                file_extractor={},  # Use default extractors
+                filename_as_id=True,
            ).load_data(show_progress=True)
-            documents.extend(other_docs)
+
+            # Filter documents after loading based on gitignore rules
+            filtered_docs = []
+            for doc in other_docs:
+                file_path = doc.metadata.get("file_path", "")
+                if file_filter(file_path):
+                    filtered_docs.append(doc)
+
+            documents.extend(filtered_docs)
        except ValueError as e:
            if "No files found" in str(e):
                print("No additional files found for other supported types.")
@@ -455,7 +524,13 @@ Examples:

    async def build_index(self, args):
        docs_dir = args.docs
-        index_name = args.index_name
+        # Use current directory name if index_name not provided
+        if args.index_name:
+            index_name = args.index_name
+        else:
+            index_name = Path.cwd().name
+            print(f"Using current directory name as index: '{index_name}'")
+
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)

@@ -25,32 +25,61 @@ def handle_request(request):
                "tools": [
                    {
                        "name": "leann_search",
-                        "description": "Search LEANN index",
+                        "description": """🔍 Search code using natural language - like having a coding assistant who knows your entire codebase!
+
+🎯 **Perfect for**:
+- "How does authentication work?" → finds auth-related code
+- "Error handling patterns" → locates try-catch blocks and error logic
+- "Database connection setup" → finds DB initialization code
+- "API endpoint definitions" → locates route handlers
+- "Configuration management" → finds config files and usage
+
+💡 **Pro tip**: Use this before making any changes to understand existing patterns and conventions.""",
                        "inputSchema": {
                            "type": "object",
                            "properties": {
-                                "index_name": {"type": "string"},
-                                "query": {"type": "string"},
-                                "top_k": {"type": "integer", "default": 5},
+                                "index_name": {
+                                    "type": "string",
+                                    "description": "Name of the LEANN index to search. Use 'leann_list' first to see available indexes.",
+                                },
+                                "query": {
+                                    "type": "string",
+                                    "description": "Search query - can be natural language (e.g., 'how to handle errors') or technical terms (e.g., 'async function definition')",
+                                },
+                                "top_k": {
+                                    "type": "integer",
+                                    "default": 5,
+                                    "minimum": 1,
+                                    "maximum": 20,
+                                    "description": "Number of search results to return. Use 5-10 for focused results, 15-20 for comprehensive exploration.",
+                                },
+                                "complexity": {
+                                    "type": "integer",
+                                    "default": 32,
+                                    "minimum": 16,
+                                    "maximum": 128,
+                                    "description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.",
+                                },
                            },
                            "required": ["index_name", "query"],
                        },
                    },
                    {
-                        "name": "leann_ask",
-                        "description": "Ask question using LEANN RAG",
+                        "name": "leann_status",
+                        "description": "📊 Check the health and stats of your code indexes - like a medical checkup for your codebase knowledge!",
                        "inputSchema": {
                            "type": "object",
                            "properties": {
-                                "index_name": {"type": "string"},
-                                "question": {"type": "string"},
+                                "index_name": {
+                                    "type": "string",
+                                    "description": "Optional: Name of specific index to check. If not provided, shows status of all indexes.",
+                                }
                            },
-                            "required": ["index_name", "question"],
                        },
                    },
                    {
                        "name": "leann_list",
-                        "description": "List all LEANN indexes",
+                        "description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.",
                        "inputSchema": {"type": "object", "properties": {}},
                    },
                ]
@@ -63,19 +92,41 @@ def handle_request(request):

        try:
            if tool_name == "leann_search":
+                # Validate required parameters
+                if not args.get("index_name") or not args.get("query"):
+                    return {
+                        "jsonrpc": "2.0",
+                        "id": request.get("id"),
+                        "result": {
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "Error: Both index_name and query are required",
+                                }
+                            ]
+                        },
+                    }
+
+                # Build simplified command
                cmd = [
                    "leann",
                    "search",
                    args["index_name"],
                    args["query"],
-                    "--recompute-embeddings",
                    f"--top-k={args.get('top_k', 5)}",
+                    f"--complexity={args.get('complexity', 32)}",
                ]
+
                result = subprocess.run(cmd, capture_output=True, text=True)

-            elif tool_name == "leann_ask":
-                cmd = f'echo "{args["question"]}" | leann ask {args["index_name"]} --recompute-embeddings --llm ollama --model qwen3:8b'
-                result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+            elif tool_name == "leann_status":
+                if args.get("index_name"):
+                    # Check specific index status - for now this runs the same "leann list" as the branch below; no per-index filtering is applied here
+                    result = subprocess.run(["leann", "list"], capture_output=True, text=True)
+                    # We could enhance this to show more detailed status per index
+                else:
+                    # Show all indexes status
+                    result = subprocess.run(["leann", "list"], capture_output=True, text=True)

            elif tool_name == "leann_list":
                result = subprocess.run(["leann", "list"], capture_output=True, text=True)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann"
-version = "0.2.5"
+version = "0.2.7"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -32,7 +32,7 @@ dependencies = [
    "pypdfium2>=4.30.0",
    # LlamaIndex core and readers - updated versions
    "llama-index>=0.12.44",
-    "llama-index-readers-file>=0.4.0",  # Essential for PDF parsing
+    "llama-index-readers-file>=0.4.0", # Essential for PDF parsing
    # "llama-index-readers-docling",  # Requires Python >= 3.10
    # "llama-index-node-parser-docling",  # Requires Python >= 3.10
    "llama-index-vector-stores-faiss>=0.4.0",
@@ -43,6 +43,9 @@ dependencies = [
    "mlx>=0.26.3; sys_platform == 'darwin'",
    "mlx-lm>=0.26.0; sys_platform == 'darwin'",
    "psutil>=5.8.0",
+    "pathspec>=0.12.1",
+    "nbconvert>=7.16.6",
+    "gitignore-parser>=0.1.12",
 ]

 [project.optional-dependencies]