Files
LEANN/packages/leann-core/src/leann/cli.py

565 lines
18 KiB
Python

import argparse
import asyncio
from pathlib import Path
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from .api import LeannBuilder, LeannChat, LeannSearcher
def extract_pdf_text_with_pymupdf(file_path: str) -> str:
"""Extract text from PDF using PyMuPDF for better quality."""
try:
import fitz # PyMuPDF
doc = fitz.open(file_path)
text = ""
for page in doc:
text += page.get_text()
doc.close()
return text
except ImportError:
# Fallback to default reader
return None
def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
"""Extract text from PDF using pdfplumber for better quality."""
try:
import pdfplumber
text = ""
with pdfplumber.open(file_path) as pdf:
for page in pdf.pages:
text += page.extract_text() or ""
return text
except ImportError:
# Fallback to default reader
return None
class LeannCLI:
def __init__(self):
# Always use project-local .leann directory (like .git)
self.indexes_dir = Path.cwd() / ".leann" / "indexes"
self.indexes_dir.mkdir(parents=True, exist_ok=True)
# Default parser for documents
self.node_parser = SentenceSplitter(
chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
)
# Code-optimized parser
self.code_parser = SentenceSplitter(
chunk_size=512, # Larger chunks for code context
chunk_overlap=50, # Less overlap to preserve function boundaries
separator="\n", # Split by lines for code
paragraph_separator="\n\n", # Preserve logical code blocks
)
def get_index_path(self, index_name: str) -> str:
index_dir = self.indexes_dir / index_name
return str(index_dir / "documents.leann")
def index_exists(self, index_name: str) -> bool:
index_dir = self.indexes_dir / index_name
meta_file = index_dir / "documents.leann.meta.json"
return meta_file.exists()
def create_parser(self) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="leann",
description="LEANN - Local Enhanced AI Navigation",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
leann build my-docs --docs ./documents # Build index named my-docs
leann search my-docs "query" # Search in my-docs index
leann ask my-docs "question" # Ask my-docs index
leann list # List all stored indexes
""",
)
subparsers = parser.add_subparsers(dest="command", help="Available commands")
# Build command
build_parser = subparsers.add_parser("build", help="Build document index")
build_parser.add_argument("index_name", help="Index name")
build_parser.add_argument(
"--docs", type=str, default=".", help="Documents directory (default: current directory)"
)
build_parser.add_argument(
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
)
build_parser.add_argument("--embedding-model", type=str, default="facebook/contriever")
build_parser.add_argument("--force", "-f", action="store_true", help="Force rebuild")
build_parser.add_argument("--graph-degree", type=int, default=32)
build_parser.add_argument("--complexity", type=int, default=64)
build_parser.add_argument("--num-threads", type=int, default=1)
build_parser.add_argument("--compact", action="store_true", default=True)
build_parser.add_argument("--recompute", action="store_true", default=True)
# Search command
search_parser = subparsers.add_parser("search", help="Search documents")
search_parser.add_argument("index_name", help="Index name")
search_parser.add_argument("query", help="Search query")
search_parser.add_argument("--top-k", type=int, default=5)
search_parser.add_argument("--complexity", type=int, default=64)
search_parser.add_argument("--beam-width", type=int, default=1)
search_parser.add_argument("--prune-ratio", type=float, default=0.0)
search_parser.add_argument("--recompute-embeddings", action="store_true")
search_parser.add_argument(
"--pruning-strategy",
choices=["global", "local", "proportional"],
default="global",
)
# Ask command
ask_parser = subparsers.add_parser("ask", help="Ask questions")
ask_parser.add_argument("index_name", help="Index name")
ask_parser.add_argument(
"--llm",
type=str,
default="ollama",
choices=["simulated", "ollama", "hf", "openai"],
)
ask_parser.add_argument("--model", type=str, default="qwen3:8b")
ask_parser.add_argument("--host", type=str, default="http://localhost:11434")
ask_parser.add_argument("--interactive", "-i", action="store_true")
ask_parser.add_argument("--top-k", type=int, default=20)
ask_parser.add_argument("--complexity", type=int, default=32)
ask_parser.add_argument("--beam-width", type=int, default=1)
ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
ask_parser.add_argument("--recompute-embeddings", action="store_true")
ask_parser.add_argument(
"--pruning-strategy",
choices=["global", "local", "proportional"],
default="global",
)
# List command
subparsers.add_parser("list", help="List all indexes")
return parser
def register_project_dir(self):
"""Register current project directory in global registry"""
global_registry = Path.home() / ".leann" / "projects.json"
global_registry.parent.mkdir(exist_ok=True)
current_dir = str(Path.cwd())
# Load existing registry
projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
projects = json.load(f)
except Exception:
projects = []
# Add current directory if not already present
if current_dir not in projects:
projects.append(current_dir)
# Save registry
import json
with open(global_registry, "w") as f:
json.dump(projects, f, indent=2)
def list_indexes(self):
print("Stored LEANN indexes:")
# Get all project directories with .leann
global_registry = Path.home() / ".leann" / "projects.json"
all_projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
all_projects = json.load(f)
except Exception:
pass
# Filter to only existing directories with .leann
valid_projects = []
for project_dir in all_projects:
project_path = Path(project_dir)
if project_path.exists() and (project_path / ".leann" / "indexes").exists():
valid_projects.append(project_path)
# Add current project if it has .leann but not in registry
current_path = Path.cwd()
if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects:
valid_projects.append(current_path)
if not valid_projects:
print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
return
total_indexes = 0
current_dir = Path.cwd()
for project_path in valid_projects:
indexes_dir = project_path / ".leann" / "indexes"
if not indexes_dir.exists():
continue
index_dirs = [d for d in indexes_dir.iterdir() if d.is_dir()]
if not index_dirs:
continue
# Show project header
if project_path == current_dir:
print(f"\n📁 Current project ({project_path}):")
else:
print(f"\n📂 {project_path}:")
for index_dir in index_dirs:
total_indexes += 1
index_name = index_dir.name
meta_file = index_dir / "documents.leann.meta.json"
status = "" if meta_file.exists() else ""
print(f" {total_indexes}. {index_name} [{status}]")
if status == "":
size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (
1024 * 1024
)
print(f" Size: {size_mb:.1f} MB")
if total_indexes > 0:
print(f"\nTotal: {total_indexes} indexes across {len(valid_projects)} projects")
print("\nUsage (current project only):")
# Show example from current project
current_indexes_dir = current_dir / ".leann" / "indexes"
if current_indexes_dir.exists():
current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
if current_index_dirs:
example_name = current_index_dirs[0].name
print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive")
def load_documents(self, docs_dir: str):
print(f"Loading documents from {docs_dir}...")
# Try to use better PDF parsers first
documents = []
docs_path = Path(docs_dir)
for file_path in docs_path.rglob("*.pdf"):
print(f"Processing PDF: {file_path}")
# Try PyMuPDF first (best quality)
text = extract_pdf_text_with_pymupdf(str(file_path))
if text is None:
# Try pdfplumber
text = extract_pdf_text_with_pdfplumber(str(file_path))
if text:
# Create a simple document structure
from llama_index.core import Document
doc = Document(text=text, metadata={"source": str(file_path)})
documents.append(doc)
else:
# Fallback to default reader
print(f"Using default reader for {file_path}")
default_docs = SimpleDirectoryReader(
str(file_path.parent),
filename_as_id=True,
required_exts=[file_path.suffix],
).load_data()
documents.extend(default_docs)
# Load other file types with default reader
code_extensions = [
# Original document types
".txt",
".md",
".docx",
# Code files for Claude Code integration
".py",
".js",
".ts",
".jsx",
".tsx",
".java",
".cpp",
".c",
".h",
".hpp",
".cs",
".go",
".rs",
".rb",
".php",
".swift",
".kt",
".scala",
".r",
".sql",
".sh",
".bash",
".zsh",
".fish",
".ps1",
".bat",
# Config and markup files
".json",
".yaml",
".yml",
".xml",
".toml",
".ini",
".cfg",
".conf",
".html",
".css",
".scss",
".less",
".vue",
".svelte",
# Data science
".ipynb",
".R",
".py",
".jl",
]
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=code_extensions,
).load_data(show_progress=True)
documents.extend(other_docs)
all_texts = []
# Define code file extensions for intelligent chunking
code_file_exts = {
".py",
".js",
".ts",
".jsx",
".tsx",
".java",
".cpp",
".c",
".h",
".hpp",
".cs",
".go",
".rs",
".rb",
".php",
".swift",
".kt",
".scala",
".r",
".sql",
".sh",
".bash",
".zsh",
".fish",
".ps1",
".bat",
".json",
".yaml",
".yml",
".xml",
".toml",
".ini",
".cfg",
".conf",
".html",
".css",
".scss",
".less",
".vue",
".svelte",
".ipynb",
".R",
".jl",
}
for doc in documents:
# Check if this is a code file based on source path
source_path = doc.metadata.get("source", "")
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
# Use appropriate parser based on file type
parser = self.code_parser if is_code_file else self.node_parser
nodes = parser.get_nodes_from_documents([doc])
for node in nodes:
all_texts.append(node.get_content())
print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
return all_texts
async def build_index(self, args):
docs_dir = args.docs
index_name = args.index_name
index_dir = self.indexes_dir / index_name
index_path = self.get_index_path(index_name)
print(f"📂 Indexing: {Path(docs_dir).resolve()}")
if index_dir.exists() and not args.force:
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return
all_texts = self.load_documents(docs_dir)
if not all_texts:
print("No documents found")
return
index_dir.mkdir(parents=True, exist_ok=True)
print(f"Building index '{index_name}' with {args.backend} backend...")
builder = LeannBuilder(
backend_name=args.backend,
embedding_model=args.embedding_model,
graph_degree=args.graph_degree,
complexity=args.complexity,
is_compact=args.compact,
is_recompute=args.recompute,
num_threads=args.num_threads,
)
for chunk_text in all_texts:
builder.add_text(chunk_text)
builder.build_index(index_path)
print(f"Index built at {index_path}")
# Register this project directory in global registry
self.register_project_dir()
async def search_documents(self, args):
index_name = args.index_name
query = args.query
index_path = self.get_index_path(index_name)
if not self.index_exists(index_name):
print(
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
)
return
searcher = LeannSearcher(index_path=index_path)
results = searcher.search(
query,
top_k=args.top_k,
complexity=args.complexity,
beam_width=args.beam_width,
prune_ratio=args.prune_ratio,
recompute_embeddings=args.recompute_embeddings,
pruning_strategy=args.pruning_strategy,
)
print(f"Search results for '{query}' (top {len(results)}):")
for i, result in enumerate(results, 1):
print(f"{i}. Score: {result.score:.3f}")
print(f" {result.text[:200]}...")
print()
async def ask_questions(self, args):
index_name = args.index_name
index_path = self.get_index_path(index_name)
if not self.index_exists(index_name):
print(
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
)
return
print(f"Starting chat with index '{index_name}'...")
print(f"Using {args.model} ({args.llm})")
llm_config = {"type": args.llm, "model": args.model}
if args.llm == "ollama":
llm_config["host"] = args.host
chat = LeannChat(index_path=index_path, llm_config=llm_config)
if args.interactive:
print("LEANN Assistant ready! Type 'quit' to exit")
print("=" * 40)
while True:
user_input = input("\nYou: ").strip()
if user_input.lower() in ["quit", "exit", "q"]:
print("Goodbye!")
break
if not user_input:
continue
response = chat.ask(
user_input,
top_k=args.top_k,
complexity=args.complexity,
beam_width=args.beam_width,
prune_ratio=args.prune_ratio,
recompute_embeddings=args.recompute_embeddings,
pruning_strategy=args.pruning_strategy,
)
print(f"LEANN: {response}")
else:
query = input("Enter your question: ").strip()
if query:
response = chat.ask(
query,
top_k=args.top_k,
complexity=args.complexity,
beam_width=args.beam_width,
prune_ratio=args.prune_ratio,
recompute_embeddings=args.recompute_embeddings,
pruning_strategy=args.pruning_strategy,
)
print(f"LEANN: {response}")
async def run(self, args=None):
parser = self.create_parser()
if args is None:
args = parser.parse_args()
if not args.command:
parser.print_help()
return
if args.command == "list":
self.list_indexes()
elif args.command == "build":
await self.build_index(args)
elif args.command == "search":
await self.search_documents(args)
elif args.command == "ask":
await self.ask_questions(args)
else:
parser.print_help()
def main():
import dotenv
dotenv.load_dotenv()
cli = LeannCLI()
asyncio.run(cli.run())
if __name__ == "__main__":
main()