From 8b9c2be8c921dd966dc23d55a055a951f27672fe Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sat, 9 Aug 2025 20:37:17 -0700 Subject: [PATCH] Feat/claude code refine (#24) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add Ollama embedding support for local embedding models * docs: Add clear documentation for Ollama embedding usage * fix: remove leann_ask * docs: remove ollama embedding extra instructions * simplify MCP interface for Claude Code - Remove unnecessary search parameters: search_mode, recompute_embeddings, file_types, min_score - Remove leann_clear tool (not needed for Claude Code workflow) - Streamline search to only use: query, index_name, top_k, complexity - Keep core tools: leann_index, leann_search, leann_status, leann_list 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * remove leann_index from MCP interface Users should use CLI command 'leann build' to create indexes first. MCP now only provides search functionality: - leann_search: search existing indexes - leann_status: check index health - leann_list: list available indexes This separates index creation (CLI) from search (Claude Code). 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * improve CLI with auto project name and .gitignore support - Make index_name optional, auto-use current directory name - Read .gitignore patterns and respect them during indexing - Add _read_gitignore_patterns() to parse .gitignore files - Add _should_exclude_file() for pattern matching - Apply exclusion patterns to both PDF and general file processing - Show helpful messages about gitignore usage Now users can simply run: leann build And it will use project name + respect .gitignore patterns. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --------- Co-authored-by: Claude --- README.md | 1 + packages/leann-core/src/leann/cli.py | 129 ++++++++++++++++++++++----- packages/leann-core/src/leann/mcp.py | 79 +++++++++++++--- 3 files changed, 171 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 40c07ec..748b252 100755 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ uv sync + ## Quick Start Our declarative API makes RAG as easy as writing a config file. diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index f307204..5171afa 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -86,7 +86,9 @@ Examples: # Build command build_parser = subparsers.add_parser("build", help="Build document index") - build_parser.add_argument("index_name", help="Index name") + build_parser.add_argument( + "index_name", nargs="?", help="Index name (default: current directory name)" + ) build_parser.add_argument( "--docs", type=str, default=".", help="Documents directory (default: current directory)" ) @@ -201,6 +203,63 @@ Examples: with open(global_registry, "w") as f: json.dump(projects, f, indent=2) + def _read_gitignore_patterns(self, docs_dir: str) -> list[str]: + """Read .gitignore file and return patterns for exclusion.""" + gitignore_path = Path(docs_dir) / ".gitignore" + patterns = [] + + # Add some essential patterns that should always be excluded + essential_patterns = [ + ".git", + ".DS_Store", + ] + patterns.extend(essential_patterns) + + if gitignore_path.exists(): + try: + with open(gitignore_path, encoding="utf-8") as f: + for line in f: + line = line.strip() + # Skip empty lines and comments + if line and not line.startswith("#"): + # Remove leading slash if present (make it relative) + if line.startswith("/"): + line = line[1:] + patterns.append(line) + print( + f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore" + ) + except Exception as e: + print(f"Warning: Could not read .gitignore: {e}") + else: + print("📋 No .gitignore found, using minimal exclusion patterns") + + return patterns + + def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool: + """Check if a file should be excluded based on gitignore-style patterns.""" + path_str = str(relative_path) + + for pattern in exclude_patterns: + # Simple pattern matching (could be enhanced with full gitignore syntax) + if pattern.endswith("*"): + # Wildcard pattern + prefix = pattern[:-1] + if path_str.startswith(prefix): + return True + elif "*" in pattern: + # Contains wildcard - simple glob-like matching + import fnmatch + + if fnmatch.fnmatch(path_str, pattern): + return True + else: + # Exact match or directory match + if path_str == pattern or path_str.startswith(pattern + "/"): + return True + + return False + def list_indexes(self): print("Stored LEANN indexes:") @@ -282,34 +341,49 @@ Examples: if custom_file_types: print(f"Using custom file types: {custom_file_types}") - # Try to use better PDF parsers first + # Read .gitignore patterns first + exclude_patterns = self._read_gitignore_patterns(docs_dir) + + # Try to use better PDF parsers first, but only if PDFs are requested documents = [] docs_path = Path(docs_dir) - for file_path in docs_path.rglob("*.pdf"): - print(f"Processing PDF: {file_path}") + # Check if we should process PDFs + should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types - # Try PyMuPDF first (best quality) - text = extract_pdf_text_with_pymupdf(str(file_path)) - if text is None: - # Try pdfplumber - text = extract_pdf_text_with_pdfplumber(str(file_path)) + if should_process_pdfs: + for file_path in docs_path.rglob("*.pdf"): + # Check if file matches any exclude pattern + relative_path = file_path.relative_to(docs_path) + if self._should_exclude_file(relative_path, exclude_patterns): + continue - if text: - # Create a simple document structure - from llama_index.core import Document + print(f"Processing PDF: {file_path}") - doc = Document(text=text, metadata={"source": str(file_path)}) - documents.append(doc) - else: - # Fallback to default reader - print(f"Using default reader for {file_path}") - default_docs = SimpleDirectoryReader( - str(file_path.parent), - filename_as_id=True, - required_exts=[file_path.suffix], - ).load_data() - documents.extend(default_docs) + # Try PyMuPDF first (best quality) + text = extract_pdf_text_with_pymupdf(str(file_path)) + if text is None: + # Try pdfplumber + text = extract_pdf_text_with_pdfplumber(str(file_path)) + + if text: + # Create a simple document structure + from llama_index.core import Document + + doc = Document(text=text, metadata={"source": str(file_path)}) + documents.append(doc) + else: + # Fallback to default reader + print(f"Using default reader for {file_path}") + try: + default_docs = SimpleDirectoryReader( + str(file_path.parent), + filename_as_id=True, + required_exts=[file_path.suffix], + ).load_data() + documents.extend(default_docs) + except Exception as e: + print(f"Warning: Could not process {file_path}: {e}") # Load other file types with default reader if custom_file_types: @@ -380,6 +454,7 @@ Examples: recursive=True, encoding="utf-8", required_exts=code_extensions, + exclude=exclude_patterns, ).load_data(show_progress=True) documents.extend(other_docs) except ValueError as e: @@ -454,7 +529,13 @@ Examples: async def build_index(self, args): docs_dir = args.docs - index_name = args.index_name + # Use current directory name if index_name not provided + if args.index_name: + index_name = args.index_name + else: + index_name = Path.cwd().name + print(f"Using current directory name as index: '{index_name}'") + index_dir = self.indexes_dir / index_name index_path = self.get_index_path(index_name) diff --git a/packages/leann-core/src/leann/mcp.py b/packages/leann-core/src/leann/mcp.py index f5a2cae..92f79e3 100755 --- a/packages/leann-core/src/leann/mcp.py +++ b/packages/leann-core/src/leann/mcp.py @@ -25,32 +25,61 @@ def handle_request(request): "tools": [ { "name": "leann_search", - "description": "Search LEANN index", + "description": """🔍 Search code using natural language - like having a coding assistant who knows your entire codebase! + +🎯 **Perfect for**: +- "How does authentication work?" → finds auth-related code +- "Error handling patterns" → locates try-catch blocks and error logic +- "Database connection setup" → finds DB initialization code +- "API endpoint definitions" → locates route handlers +- "Configuration management" → finds config files and usage + +💡 **Pro tip**: Use this before making any changes to understand existing patterns and conventions.""", "inputSchema": { "type": "object", "properties": { - "index_name": {"type": "string"}, - "query": {"type": "string"}, - "top_k": {"type": "integer", "default": 5}, + "index_name": { + "type": "string", + "description": "Name of the LEANN index to search. Use 'leann_list' first to see available indexes.", + }, + "query": { + "type": "string", + "description": "Search query - can be natural language (e.g., 'how to handle errors') or technical terms (e.g., 'async function definition')", + }, + "top_k": { + "type": "integer", + "default": 5, + "minimum": 1, + "maximum": 20, + "description": "Number of search results to return. Use 5-10 for focused results, 15-20 for comprehensive exploration.", + }, + "complexity": { + "type": "integer", + "default": 32, + "minimum": 16, + "maximum": 128, + "description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.", + }, }, "required": ["index_name", "query"], }, }, { - "name": "leann_ask", - "description": "Ask question using LEANN RAG", + "name": "leann_status", + "description": "📊 Check the health and stats of your code indexes - like a medical checkup for your codebase knowledge!", "inputSchema": { "type": "object", "properties": { - "index_name": {"type": "string"}, - "question": {"type": "string"}, + "index_name": { + "type": "string", + "description": "Optional: Name of specific index to check. If not provided, shows status of all indexes.", + } }, - "required": ["index_name", "question"], }, }, { "name": "leann_list", - "description": "List all LEANN indexes", + "description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.", "inputSchema": {"type": "object", "properties": {}}, }, ] @@ -63,19 +92,41 @@ def handle_request(request): try: if tool_name == "leann_search": + # Validate required parameters + if not args.get("index_name") or not args.get("query"): + return { + "jsonrpc": "2.0", + "id": request.get("id"), + "result": { + "content": [ + { + "type": "text", + "text": "Error: Both index_name and query are required", + } + ] + }, + } + + # Build simplified command cmd = [ "leann", "search", args["index_name"], args["query"], - "--recompute-embeddings", f"--top-k={args.get('top_k', 5)}", + f"--complexity={args.get('complexity', 32)}", ] + result = subprocess.run(cmd, capture_output=True, text=True) - elif tool_name == "leann_ask": - cmd = f'echo "{args["question"]}" | leann ask {args["index_name"]} --recompute-embeddings --llm ollama --model qwen3:8b' - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + elif tool_name == "leann_status": + if args.get("index_name"): + # Check specific index status - for now, we'll use leann list and filter + result = subprocess.run(["leann", "list"], capture_output=True, text=True) + # We could enhance this to show more detailed status per index + else: + # Show all indexes status + result = subprocess.run(["leann", "list"], capture_output=True, text=True) elif tool_name == "leann_list": result = subprocess.run(["leann", "list"], capture_output=True, text=True)