Feat/claude code refine (#24)

* feat: Add Ollama embedding support for local embedding models * docs: Add clear documentation for Ollama embedding usage * fix: remove leann_ask * docs: remove ollama embedding extra instructions * simplify MCP interface for Claude Code - Remove unnecessary search parameters: search_mode, recompute_embeddings, file_types, min_score - Remove leann_clear tool (not needed for Claude Code workflow) - Streamline search to only use: query, index_name, top_k, complexity - Keep core tools: leann_index, leann_search, leann_status, leann_list 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> * remove leann_index from MCP interface Users should use CLI command 'leann build' to create indexes first. MCP now only provides search functionality: - leann_search: search existing indexes - leann_status: check index health - leann_list: list available indexes This separates index creation (CLI) from search (Claude Code). 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> * improve CLI with auto project name and .gitignore support - Make index_name optional, auto-use current directory name - Read .gitignore patterns and respect them during indexing - Add _read_gitignore_patterns() to parse .gitignore files - Add _should_exclude_file() for pattern matching - Apply exclusion patterns to both PDF and general file processing - Show helpful messages about gitignore usage Now users can simply run: leann build And it will use project name + respect .gitignore patterns. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
2025-08-09 20:37:17 -07:00
parent 3ff5aac8e0
commit 8b9c2be8c9
3 changed files with 171 additions and 38 deletions
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -86,7 +86,9 @@ Examples:

        # Build command
        build_parser = subparsers.add_parser("build", help="Build document index")
-        build_parser.add_argument("index_name", help="Index name")
+        build_parser.add_argument(
+            "index_name", nargs="?", help="Index name (default: current directory name)"
+        )
        build_parser.add_argument(
            "--docs", type=str, default=".", help="Documents directory (default: current directory)"
        )
@@ -201,6 +203,63 @@ Examples:
        with open(global_registry, "w") as f:
            json.dump(projects, f, indent=2)

+    def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
+        """Collect exclusion patterns for ``docs_dir`` from its top-level .gitignore.
+
+        The result always begins with ``.git`` and ``.DS_Store``. Each non-blank,
+        non-comment line of ``<docs_dir>/.gitignore`` follows, whitespace-stripped
+        and with one leading ``/`` removed. A missing or unreadable file is only
+        reported on stdout; whatever was collected so far is still returned.
+        """
+        always_excluded = [".git", ".DS_Store"]
+        collected = list(always_excluded)
+        ignore_file = Path(docs_dir) / ".gitignore"
+
+        if not ignore_file.exists():
+            print("📋 No .gitignore found, using minimal exclusion patterns")
+            return collected
+
+        try:
+            with open(ignore_file, encoding="utf-8") as handle:
+                for raw_line in handle:
+                    entry = raw_line.strip()
+                    # Blank lines and comments carry no pattern.
+                    if not entry or entry.startswith("#"):
+                        continue
+                    # Drop a single leading slash so the pattern is root-relative.
+                    collected.append(entry[1:] if entry.startswith("/") else entry)
+            loaded = len(collected) - len(always_excluded)
+            print(f"📋 Loaded {loaded} patterns from .gitignore")
+        except Exception as e:
+            # Best-effort: report the problem and fall back to what was collected.
+            print(f"Warning: Could not read .gitignore: {e}")
+
+        return collected
+
+    def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
+        """Return True if ``relative_path`` matches any gitignore-style pattern.
+
+        ``relative_path`` is relative to the docs root. Trailing slashes are ignored, so
+        ``node_modules/`` excludes that directory. A pattern containing ``/`` is matched
+        against the path and each leading directory prefix; a bare name or glob is also
+        matched against every path component, so ``.DS_Store`` applies at any depth.
+        Only a subset of gitignore syntax is handled: no ``!`` negation, no special ``**``.
+        """
+        import fnmatch
+
+        parts = relative_path.parts  # split by pathlib, independent of os.sep
+        prefixes = ["/".join(parts[: i + 1]) for i in range(len(parts))]
+        for raw_pattern in exclude_patterns:
+            pattern = raw_pattern.strip("/")
+            if not pattern:
+                continue
+            # A slash at the start or in the middle anchors the pattern to the root.
+            anchored = "/" in raw_pattern.rstrip("/")
+            candidates = prefixes if anchored else [*prefixes, *parts]
+            if any(fnmatch.fnmatch(candidate, pattern) for candidate in candidates):
+                return True
+        return False
+
    def list_indexes(self):
        print("Stored LEANN indexes:")

@@ -282,34 +341,49 @@ Examples:
        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")

-        # Try to use better PDF parsers first
+        # Read .gitignore patterns first
+        exclude_patterns = self._read_gitignore_patterns(docs_dir)
+
+        # Try to use better PDF parsers first, but only if PDFs are requested
        documents = []
        docs_path = Path(docs_dir)

-        for file_path in docs_path.rglob("*.pdf"):
-            print(f"Processing PDF: {file_path}")
+        # Check if we should process PDFs
+        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types

-            # Try PyMuPDF first (best quality)
-            text = extract_pdf_text_with_pymupdf(str(file_path))
-            if text is None:
-                # Try pdfplumber
-                text = extract_pdf_text_with_pdfplumber(str(file_path))
+        if should_process_pdfs:
+            for file_path in docs_path.rglob("*.pdf"):
+                # Check if file matches any exclude pattern
+                relative_path = file_path.relative_to(docs_path)
+                if self._should_exclude_file(relative_path, exclude_patterns):
+                    continue

-            if text:
-                # Create a simple document structure
-                from llama_index.core import Document
+                print(f"Processing PDF: {file_path}")

-                doc = Document(text=text, metadata={"source": str(file_path)})
-                documents.append(doc)
-            else:
-                # Fallback to default reader
-                print(f"Using default reader for {file_path}")
-                default_docs = SimpleDirectoryReader(
-                    str(file_path.parent),
-                    filename_as_id=True,
-                    required_exts=[file_path.suffix],
-                ).load_data()
-                documents.extend(default_docs)
+                # Try PyMuPDF first (best quality)
+                text = extract_pdf_text_with_pymupdf(str(file_path))
+                if text is None:
+                    # Try pdfplumber
+                    text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+                if text:
+                    # Create a simple document structure
+                    from llama_index.core import Document
+
+                    doc = Document(text=text, metadata={"source": str(file_path)})
+                    documents.append(doc)
+                else:
+                    # Fallback to default reader
+                    print(f"Using default reader for {file_path}")
+                    try:
+                        default_docs = SimpleDirectoryReader(
+                            str(file_path.parent),
+                            filename_as_id=True,
+                            required_exts=[file_path.suffix],
+                        ).load_data()
+                        documents.extend(default_docs)
+                    except Exception as e:
+                        print(f"Warning: Could not process {file_path}: {e}")

        # Load other file types with default reader
        if custom_file_types:
@@ -380,6 +454,7 @@ Examples:
                recursive=True,
                encoding="utf-8",
                required_exts=code_extensions,
+                exclude=exclude_patterns,
            ).load_data(show_progress=True)
            documents.extend(other_docs)
        except ValueError as e:
@@ -454,7 +529,13 @@ Examples:

    async def build_index(self, args):
        docs_dir = args.docs
-        index_name = args.index_name
+        # Use current directory name if index_name not provided
+        if args.index_name:
+            index_name = args.index_name
+        else:
+            index_name = Path.cwd().name
+            print(f"Using current directory name as index: '{index_name}'")
+
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)