Feat/claude code refine (#24)

* feat: Add Ollama embedding support for local embedding models

* docs: Add clear documentation for Ollama embedding usage

* fix: remove leann_ask

* docs: remove ollama embedding extra instructions

* simplify MCP interface for Claude Code

- Remove unnecessary search parameters: search_mode, recompute_embeddings, file_types, min_score
- Remove leann_clear tool (not needed for Claude Code workflow)
- Streamline search to only use: query, index_name, top_k, complexity
- Keep core tools: leann_index, leann_search, leann_status, leann_list

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* remove leann_index from MCP interface

Users should use CLI command 'leann build' to create indexes first.
MCP now only provides search functionality:
- leann_search: search existing indexes
- leann_status: check index health
- leann_list: list available indexes

This separates index creation (CLI) from search (Claude Code).

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* improve CLI with auto project name and .gitignore support

- Make index_name optional, auto-use current directory name
- Read .gitignore patterns and respect them during indexing
- Add _read_gitignore_patterns() to parse .gitignore files
- Add _should_exclude_file() for pattern matching
- Apply exclusion patterns to both PDF and general file processing
- Show helpful messages about gitignore usage

Now users can simply run: leann build
And it will use project name + respect .gitignore patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Andy Lee
2025-08-09 20:37:17 -07:00
committed by GitHub
parent 3ff5aac8e0
commit 8b9c2be8c9
3 changed files with 171 additions and 38 deletions

View File

@@ -97,6 +97,7 @@ uv sync
</details>
## Quick Start
Our declarative API makes RAG as easy as writing a config file.

View File

@@ -86,7 +86,9 @@ Examples:
# Build command
build_parser = subparsers.add_parser("build", help="Build document index")
build_parser.add_argument("index_name", help="Index name")
build_parser.add_argument(
"index_name", nargs="?", help="Index name (default: current directory name)"
)
build_parser.add_argument(
"--docs", type=str, default=".", help="Documents directory (default: current directory)"
)
@@ -201,6 +203,63 @@ Examples:
with open(global_registry, "w") as f:
json.dump(projects, f, indent=2)
def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
"""Read .gitignore file and return patterns for exclusion."""
gitignore_path = Path(docs_dir) / ".gitignore"
patterns = []
# Add some essential patterns that should always be excluded
essential_patterns = [
".git",
".DS_Store",
]
patterns.extend(essential_patterns)
if gitignore_path.exists():
try:
with open(gitignore_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
# Skip empty lines and comments
if line and not line.startswith("#"):
# Remove leading slash if present (make it relative)
if line.startswith("/"):
line = line[1:]
patterns.append(line)
print(
f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore"
)
except Exception as e:
print(f"Warning: Could not read .gitignore: {e}")
else:
print("📋 No .gitignore found, using minimal exclusion patterns")
return patterns
def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
"""Check if a file should be excluded based on gitignore-style patterns."""
path_str = str(relative_path)
for pattern in exclude_patterns:
# Simple pattern matching (could be enhanced with full gitignore syntax)
if pattern.endswith("*"):
# Wildcard pattern
prefix = pattern[:-1]
if path_str.startswith(prefix):
return True
elif "*" in pattern:
# Contains wildcard - simple glob-like matching
import fnmatch
if fnmatch.fnmatch(path_str, pattern):
return True
else:
# Exact match or directory match
if path_str == pattern or path_str.startswith(pattern + "/"):
return True
return False
def list_indexes(self):
print("Stored LEANN indexes:")
@@ -282,34 +341,49 @@ Examples:
if custom_file_types:
print(f"Using custom file types: {custom_file_types}")
# Try to use better PDF parsers first
# Read .gitignore patterns first
exclude_patterns = self._read_gitignore_patterns(docs_dir)
# Try to use better PDF parsers first, but only if PDFs are requested
documents = []
docs_path = Path(docs_dir)
for file_path in docs_path.rglob("*.pdf"):
print(f"Processing PDF: {file_path}")
# Check if we should process PDFs
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
# Try PyMuPDF first (best quality)
text = extract_pdf_text_with_pymupdf(str(file_path))
if text is None:
# Try pdfplumber
text = extract_pdf_text_with_pdfplumber(str(file_path))
if should_process_pdfs:
for file_path in docs_path.rglob("*.pdf"):
# Check if file matches any exclude pattern
relative_path = file_path.relative_to(docs_path)
if self._should_exclude_file(relative_path, exclude_patterns):
continue
if text:
# Create a simple document structure
from llama_index.core import Document
print(f"Processing PDF: {file_path}")
doc = Document(text=text, metadata={"source": str(file_path)})
documents.append(doc)
else:
# Fallback to default reader
print(f"Using default reader for {file_path}")
default_docs = SimpleDirectoryReader(
str(file_path.parent),
filename_as_id=True,
required_exts=[file_path.suffix],
).load_data()
documents.extend(default_docs)
# Try PyMuPDF first (best quality)
text = extract_pdf_text_with_pymupdf(str(file_path))
if text is None:
# Try pdfplumber
text = extract_pdf_text_with_pdfplumber(str(file_path))
if text:
# Create a simple document structure
from llama_index.core import Document
doc = Document(text=text, metadata={"source": str(file_path)})
documents.append(doc)
else:
# Fallback to default reader
print(f"Using default reader for {file_path}")
try:
default_docs = SimpleDirectoryReader(
str(file_path.parent),
filename_as_id=True,
required_exts=[file_path.suffix],
).load_data()
documents.extend(default_docs)
except Exception as e:
print(f"Warning: Could not process {file_path}: {e}")
# Load other file types with default reader
if custom_file_types:
@@ -380,6 +454,7 @@ Examples:
recursive=True,
encoding="utf-8",
required_exts=code_extensions,
exclude=exclude_patterns,
).load_data(show_progress=True)
documents.extend(other_docs)
except ValueError as e:
@@ -454,7 +529,13 @@ Examples:
async def build_index(self, args):
docs_dir = args.docs
index_name = args.index_name
# Use current directory name if index_name not provided
if args.index_name:
index_name = args.index_name
else:
index_name = Path.cwd().name
print(f"Using current directory name as index: '{index_name}'")
index_dir = self.indexes_dir / index_name
index_path = self.get_index_path(index_name)

View File

@@ -25,32 +25,61 @@ def handle_request(request):
"tools": [
{
"name": "leann_search",
"description": "Search LEANN index",
"description": """🔍 Search code using natural language - like having a coding assistant who knows your entire codebase!
🎯 **Perfect for**:
- "How does authentication work?" → finds auth-related code
- "Error handling patterns" → locates try-catch blocks and error logic
- "Database connection setup" → finds DB initialization code
- "API endpoint definitions" → locates route handlers
- "Configuration management" → finds config files and usage
💡 **Pro tip**: Use this before making any changes to understand existing patterns and conventions.""",
"inputSchema": {
"type": "object",
"properties": {
"index_name": {"type": "string"},
"query": {"type": "string"},
"top_k": {"type": "integer", "default": 5},
"index_name": {
"type": "string",
"description": "Name of the LEANN index to search. Use 'leann_list' first to see available indexes.",
},
"query": {
"type": "string",
"description": "Search query - can be natural language (e.g., 'how to handle errors') or technical terms (e.g., 'async function definition')",
},
"top_k": {
"type": "integer",
"default": 5,
"minimum": 1,
"maximum": 20,
"description": "Number of search results to return. Use 5-10 for focused results, 15-20 for comprehensive exploration.",
},
"complexity": {
"type": "integer",
"default": 32,
"minimum": 16,
"maximum": 128,
"description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.",
},
},
"required": ["index_name", "query"],
},
},
{
"name": "leann_ask",
"description": "Ask question using LEANN RAG",
"name": "leann_status",
"description": "📊 Check the health and stats of your code indexes - like a medical checkup for your codebase knowledge!",
"inputSchema": {
"type": "object",
"properties": {
"index_name": {"type": "string"},
"question": {"type": "string"},
"index_name": {
"type": "string",
"description": "Optional: Name of specific index to check. If not provided, shows status of all indexes.",
}
},
"required": ["index_name", "question"],
},
},
{
"name": "leann_list",
"description": "List all LEANN indexes",
"description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.",
"inputSchema": {"type": "object", "properties": {}},
},
]
@@ -63,19 +92,41 @@ def handle_request(request):
try:
if tool_name == "leann_search":
# Validate required parameters
if not args.get("index_name") or not args.get("query"):
return {
"jsonrpc": "2.0",
"id": request.get("id"),
"result": {
"content": [
{
"type": "text",
"text": "Error: Both index_name and query are required",
}
]
},
}
# Build simplified command
cmd = [
"leann",
"search",
args["index_name"],
args["query"],
"--recompute-embeddings",
f"--top-k={args.get('top_k', 5)}",
f"--complexity={args.get('complexity', 32)}",
]
result = subprocess.run(cmd, capture_output=True, text=True)
elif tool_name == "leann_ask":
cmd = f'echo "{args["question"]}" | leann ask {args["index_name"]} --recompute-embeddings --llm ollama --model qwen3:8b'
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
elif tool_name == "leann_status":
if args.get("index_name"):
# Check specific index status - for now, we'll use leann list and filter
result = subprocess.run(["leann", "list"], capture_output=True, text=True)
# We could enhance this to show more detailed status per index
else:
# Show all indexes status
result = subprocess.run(["leann", "list"], capture_output=True, text=True)
elif tool_name == "leann_list":
result = subprocess.run(["leann", "list"], capture_output=True, text=True)