Compare commits

...

4 Commits

Author SHA1 Message Date
Andy Lee
fe942329d6 fix: improve gitignore and Jupyter notebook support
- Add nbconvert dependency for .ipynb file support
- Replace manual gitignore parsing with gitignore-parser library
- Proper recursive .gitignore handling (all subdirectories)
- Fix compliance with Git gitignore behavior
- Simplify code and improve reliability

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-10 18:52:55 -07:00
yichuan520030910320
9801aa581b [Readme]update embedding model config according to reddit feedback 2025-08-09 21:33:33 -07:00
GitHub Actions
5e97916608 chore: release v0.2.6 2025-08-10 03:39:45 +00:00
Andy Lee
8b9c2be8c9 Feat/claude code refine (#24)
* feat: Add Ollama embedding support for local embedding models

* docs: Add clear documentation for Ollama embedding usage

* fix: remove leann_ask

* docs: remove ollama embedding extra instructions

* simplify MCP interface for Claude Code

- Remove unnecessary search parameters: search_mode, recompute_embeddings, file_types, min_score
- Remove leann_clear tool (not needed for Claude Code workflow)
- Streamline search to only use: query, index_name, top_k, complexity
- Keep core tools: leann_index, leann_search, leann_status, leann_list

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* remove leann_index from MCP interface

Users should use CLI command 'leann build' to create indexes first.
MCP now only provides search functionality:
- leann_search: search existing indexes
- leann_status: check index health
- leann_list: list available indexes

This separates index creation (CLI) from search (Claude Code).

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* improve CLI with auto project name and .gitignore support

- Make index_name optional, auto-use current directory name
- Read .gitignore patterns and respect them during indexing
- Add _read_gitignore_patterns() to parse .gitignore files
- Add _should_exclude_file() for pattern matching
- Apply exclusion patterns to both PDF and general file processing
- Show helpful messages about gitignore usage

Now users can simply run: leann build
And it will use project name + respect .gitignore patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-08-09 20:37:17 -07:00
10 changed files with 4021 additions and 3531 deletions

View File

@@ -97,6 +97,7 @@ uv sync
</details> </details>
## Quick Start ## Quick Start
Our declarative API makes RAG as easy as writing a config file. Our declarative API makes RAG as easy as writing a config file.
@@ -188,7 +189,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
--force-rebuild # Force rebuild index even if it exists --force-rebuild # Force rebuild index even if it exists
# Embedding Parameters # Embedding Parameters
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/multilingual-e5-base-mlx --embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text
--embedding-mode MODE # sentence-transformers, openai, mlx, or ollama --embedding-mode MODE # sentence-transformers, openai, mlx, or ollama
# LLM Parameters (Text generation models) # LLM Parameters (Text generation models)

View File

@@ -222,9 +222,15 @@ python apps/document_rag.py --query "What are the main techniques LEANN explores
3. **Use MLX on Apple Silicon** (optional optimization): 3. **Use MLX on Apple Silicon** (optional optimization):
```bash ```bash
--embedding-mode mlx --embedding-model mlx-community/multilingual-e5-base-mlx --embedding-mode mlx --embedding-model mlx-community/Qwen3-Embedding-0.6B-8bit
``` ```
MLX might not be the best choice, as we tested and found that it only offers 1.3x acceleration compared to HF, so maybe using ollama is a better choice for embedding generation
4. **Use Ollama**
```bash
--embedding-mode ollama --embedding-model nomic-embed-text
```
To discover additional embedding models in ollama, check out https://ollama.com/search?c=embedding or read more about embedding models at https://ollama.com/blog/embedding-models, please do check the model size that works best for you
### If Search Quality is Poor ### If Search Quality is Poor
1. **Increase retrieval count**: 1. **Increase retrieval count**:

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-diskann" name = "leann-backend-diskann"
version = "0.2.5" version = "0.2.6"
dependencies = ["leann-core==0.2.5", "numpy", "protobuf>=3.19.0"] dependencies = ["leann-core==0.2.6", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build] [tool.scikit-build]
# Key: simplified CMake path # Key: simplified CMake path

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-hnsw" name = "leann-backend-hnsw"
version = "0.2.5" version = "0.2.6"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [ dependencies = [
"leann-core==0.2.5", "leann-core==0.2.6",
"numpy", "numpy",
"pyzmq>=23.0.0", "pyzmq>=23.0.0",
"msgpack>=1.0.0", "msgpack>=1.0.0",

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann-core" name = "leann-core"
version = "0.2.5" version = "0.2.6"
description = "Core API and plugin system for LEANN" description = "Core API and plugin system for LEANN"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"
@@ -31,6 +31,8 @@ dependencies = [
"PyPDF2>=3.0.0", "PyPDF2>=3.0.0",
"pymupdf>=1.23.0", "pymupdf>=1.23.0",
"pdfplumber>=0.10.0", "pdfplumber>=0.10.0",
"nbconvert>=7.0.0", # For .ipynb file support
"gitignore-parser>=0.1.12", # For proper .gitignore handling
"mlx>=0.26.3; sys_platform == 'darwin'", "mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'", "mlx-lm>=0.26.0; sys_platform == 'darwin'",
] ]

View File

@@ -86,7 +86,9 @@ Examples:
# Build command # Build command
build_parser = subparsers.add_parser("build", help="Build document index") build_parser = subparsers.add_parser("build", help="Build document index")
build_parser.add_argument("index_name", help="Index name") build_parser.add_argument(
"index_name", nargs="?", help="Index name (default: current directory name)"
)
build_parser.add_argument( build_parser.add_argument(
"--docs", type=str, default=".", help="Documents directory (default: current directory)" "--docs", type=str, default=".", help="Documents directory (default: current directory)"
) )
@@ -201,6 +203,37 @@ Examples:
with open(global_registry, "w") as f: with open(global_registry, "w") as f:
json.dump(projects, f, indent=2) json.dump(projects, f, indent=2)
def _build_gitignore_parser(self, docs_dir: str):
"""Build gitignore parser using gitignore-parser library."""
from gitignore_parser import parse_gitignore
# Try to parse the root .gitignore
gitignore_path = Path(docs_dir) / ".gitignore"
if gitignore_path.exists():
try:
# gitignore-parser automatically handles all subdirectory .gitignore files!
matches = parse_gitignore(str(gitignore_path))
print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)")
return matches
except Exception as e:
print(f"Warning: Could not parse .gitignore: {e}")
else:
print("📋 No .gitignore found")
# Fallback: basic pattern matching for essential files
essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
def basic_matches(file_path):
path_parts = Path(file_path).parts
return any(part in essential_patterns for part in path_parts)
return basic_matches
def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
"""Check if a file should be excluded using gitignore parser."""
return gitignore_matches(str(relative_path))
def list_indexes(self): def list_indexes(self):
print("Stored LEANN indexes:") print("Stored LEANN indexes:")
@@ -282,34 +315,49 @@ Examples:
if custom_file_types: if custom_file_types:
print(f"Using custom file types: {custom_file_types}") print(f"Using custom file types: {custom_file_types}")
# Try to use better PDF parsers first # Build gitignore parser
gitignore_matches = self._build_gitignore_parser(docs_dir)
# Try to use better PDF parsers first, but only if PDFs are requested
documents = [] documents = []
docs_path = Path(docs_dir) docs_path = Path(docs_dir)
for file_path in docs_path.rglob("*.pdf"): # Check if we should process PDFs
print(f"Processing PDF: {file_path}") should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
# Try PyMuPDF first (best quality) if should_process_pdfs:
text = extract_pdf_text_with_pymupdf(str(file_path)) for file_path in docs_path.rglob("*.pdf"):
if text is None: # Check if file matches any exclude pattern
# Try pdfplumber relative_path = file_path.relative_to(docs_path)
text = extract_pdf_text_with_pdfplumber(str(file_path)) if self._should_exclude_file(relative_path, gitignore_matches):
continue
if text: print(f"Processing PDF: {file_path}")
# Create a simple document structure
from llama_index.core import Document
doc = Document(text=text, metadata={"source": str(file_path)}) # Try PyMuPDF first (best quality)
documents.append(doc) text = extract_pdf_text_with_pymupdf(str(file_path))
else: if text is None:
# Fallback to default reader # Try pdfplumber
print(f"Using default reader for {file_path}") text = extract_pdf_text_with_pdfplumber(str(file_path))
default_docs = SimpleDirectoryReader(
str(file_path.parent), if text:
filename_as_id=True, # Create a simple document structure
required_exts=[file_path.suffix], from llama_index.core import Document
).load_data()
documents.extend(default_docs) doc = Document(text=text, metadata={"source": str(file_path)})
documents.append(doc)
else:
# Fallback to default reader
print(f"Using default reader for {file_path}")
try:
default_docs = SimpleDirectoryReader(
str(file_path.parent),
filename_as_id=True,
required_exts=[file_path.suffix],
).load_data()
documents.extend(default_docs)
except Exception as e:
print(f"Warning: Could not process {file_path}: {e}")
# Load other file types with default reader # Load other file types with default reader
if custom_file_types: if custom_file_types:
@@ -375,13 +423,34 @@ Examples:
] ]
# Try to load other file types, but don't fail if none are found # Try to load other file types, but don't fail if none are found
try: try:
# Create a custom file filter function using our PathSpec
def file_filter(file_path: str) -> bool:
"""Return True if file should be included (not excluded)"""
try:
docs_path_obj = Path(docs_dir)
file_path_obj = Path(file_path)
relative_path = file_path_obj.relative_to(docs_path_obj)
return not self._should_exclude_file(relative_path, gitignore_matches)
except (ValueError, OSError):
return True # Include files that can't be processed
other_docs = SimpleDirectoryReader( other_docs = SimpleDirectoryReader(
docs_dir, docs_dir,
recursive=True, recursive=True,
encoding="utf-8", encoding="utf-8",
required_exts=code_extensions, required_exts=code_extensions,
file_extractor={}, # Use default extractors
filename_as_id=True,
).load_data(show_progress=True) ).load_data(show_progress=True)
documents.extend(other_docs)
# Filter documents after loading based on gitignore rules
filtered_docs = []
for doc in other_docs:
file_path = doc.metadata.get("file_path", "")
if file_filter(file_path):
filtered_docs.append(doc)
documents.extend(filtered_docs)
except ValueError as e: except ValueError as e:
if "No files found" in str(e): if "No files found" in str(e):
print("No additional files found for other supported types.") print("No additional files found for other supported types.")
@@ -454,7 +523,13 @@ Examples:
async def build_index(self, args): async def build_index(self, args):
docs_dir = args.docs docs_dir = args.docs
index_name = args.index_name # Use current directory name if index_name not provided
if args.index_name:
index_name = args.index_name
else:
index_name = Path.cwd().name
print(f"Using current directory name as index: '{index_name}'")
index_dir = self.indexes_dir / index_name index_dir = self.indexes_dir / index_name
index_path = self.get_index_path(index_name) index_path = self.get_index_path(index_name)

View File

@@ -25,32 +25,61 @@ def handle_request(request):
"tools": [ "tools": [
{ {
"name": "leann_search", "name": "leann_search",
"description": "Search LEANN index", "description": """🔍 Search code using natural language - like having a coding assistant who knows your entire codebase!
🎯 **Perfect for**:
- "How does authentication work?" → finds auth-related code
- "Error handling patterns" → locates try-catch blocks and error logic
- "Database connection setup" → finds DB initialization code
- "API endpoint definitions" → locates route handlers
- "Configuration management" → finds config files and usage
💡 **Pro tip**: Use this before making any changes to understand existing patterns and conventions.""",
"inputSchema": { "inputSchema": {
"type": "object", "type": "object",
"properties": { "properties": {
"index_name": {"type": "string"}, "index_name": {
"query": {"type": "string"}, "type": "string",
"top_k": {"type": "integer", "default": 5}, "description": "Name of the LEANN index to search. Use 'leann_list' first to see available indexes.",
},
"query": {
"type": "string",
"description": "Search query - can be natural language (e.g., 'how to handle errors') or technical terms (e.g., 'async function definition')",
},
"top_k": {
"type": "integer",
"default": 5,
"minimum": 1,
"maximum": 20,
"description": "Number of search results to return. Use 5-10 for focused results, 15-20 for comprehensive exploration.",
},
"complexity": {
"type": "integer",
"default": 32,
"minimum": 16,
"maximum": 128,
"description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.",
},
}, },
"required": ["index_name", "query"], "required": ["index_name", "query"],
}, },
}, },
{ {
"name": "leann_ask", "name": "leann_status",
"description": "Ask question using LEANN RAG", "description": "📊 Check the health and stats of your code indexes - like a medical checkup for your codebase knowledge!",
"inputSchema": { "inputSchema": {
"type": "object", "type": "object",
"properties": { "properties": {
"index_name": {"type": "string"}, "index_name": {
"question": {"type": "string"}, "type": "string",
"description": "Optional: Name of specific index to check. If not provided, shows status of all indexes.",
}
}, },
"required": ["index_name", "question"],
}, },
}, },
{ {
"name": "leann_list", "name": "leann_list",
"description": "List all LEANN indexes", "description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.",
"inputSchema": {"type": "object", "properties": {}}, "inputSchema": {"type": "object", "properties": {}},
}, },
] ]
@@ -63,19 +92,41 @@ def handle_request(request):
try: try:
if tool_name == "leann_search": if tool_name == "leann_search":
# Validate required parameters
if not args.get("index_name") or not args.get("query"):
return {
"jsonrpc": "2.0",
"id": request.get("id"),
"result": {
"content": [
{
"type": "text",
"text": "Error: Both index_name and query are required",
}
]
},
}
# Build simplified command
cmd = [ cmd = [
"leann", "leann",
"search", "search",
args["index_name"], args["index_name"],
args["query"], args["query"],
"--recompute-embeddings",
f"--top-k={args.get('top_k', 5)}", f"--top-k={args.get('top_k', 5)}",
f"--complexity={args.get('complexity', 32)}",
] ]
result = subprocess.run(cmd, capture_output=True, text=True) result = subprocess.run(cmd, capture_output=True, text=True)
elif tool_name == "leann_ask": elif tool_name == "leann_status":
cmd = f'echo "{args["question"]}" | leann ask {args["index_name"]} --recompute-embeddings --llm ollama --model qwen3:8b' if args.get("index_name"):
result = subprocess.run(cmd, shell=True, capture_output=True, text=True) # Check specific index status - for now, we'll use leann list and filter
result = subprocess.run(["leann", "list"], capture_output=True, text=True)
# We could enhance this to show more detailed status per index
else:
# Show all indexes status
result = subprocess.run(["leann", "list"], capture_output=True, text=True)
elif tool_name == "leann_list": elif tool_name == "leann_list":
result = subprocess.run(["leann", "list"], capture_output=True, text=True) result = subprocess.run(["leann", "list"], capture_output=True, text=True)

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann" name = "leann"
version = "0.2.5" version = "0.2.6"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -32,7 +32,7 @@ dependencies = [
"pypdfium2>=4.30.0", "pypdfium2>=4.30.0",
# LlamaIndex core and readers - updated versions # LlamaIndex core and readers - updated versions
"llama-index>=0.12.44", "llama-index>=0.12.44",
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing "llama-index-readers-file>=0.4.0", # Essential for PDF parsing
# "llama-index-readers-docling", # Requires Python >= 3.10 # "llama-index-readers-docling", # Requires Python >= 3.10
# "llama-index-node-parser-docling", # Requires Python >= 3.10 # "llama-index-node-parser-docling", # Requires Python >= 3.10
"llama-index-vector-stores-faiss>=0.4.0", "llama-index-vector-stores-faiss>=0.4.0",
@@ -43,6 +43,9 @@ dependencies = [
"mlx>=0.26.3; sys_platform == 'darwin'", "mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'", "mlx-lm>=0.26.0; sys_platform == 'darwin'",
"psutil>=5.8.0", "psutil>=5.8.0",
"pathspec>=0.12.1",
"nbconvert>=7.16.6",
"gitignore-parser>=0.1.12",
] ]
[project.optional-dependencies] [project.optional-dependencies]

7318
uv.lock generated
View File

File diff suppressed because it is too large Load Diff