From 71e5f1774cbf0c033e38fc7221ffab289676f4db Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 21 Jul 2025 23:48:40 -0700 Subject: [PATCH] docs: cli --- README.md | 65 ++++++++++ packages/leann-core/src/leann/cli.py | 174 ++++++++++++++++----------- 2 files changed, 166 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index fccd5aa..72243ff 100755 --- a/README.md +++ b/README.md @@ -294,6 +294,71 @@ Once the index is built, you can ask questions like: +## 🖥️ Command Line Interface + +LEANN includes a powerful CLI for document processing and search. Perfect for quick document indexing and interactive chat. + +```bash +# Build an index from documents +leann build my-docs --docs ./documents + +# Search your documents +leann search my-docs "machine learning concepts" + +# Interactive chat with your documents +leann ask my-docs --interactive + +# List all your indexes +leann list +``` + +**Key CLI features:** +- Auto-detects document formats (PDF, TXT, MD, DOCX) +- Smart text chunking with overlap +- Multiple LLM providers (Ollama, OpenAI, HuggingFace) +- Organized index storage in `~/.leann/indexes/` +- Support for advanced search parameters + +
<details> +<summary>📋 Click to expand: Complete CLI Reference</summary> + +**Build Command:** +```bash +leann build INDEX_NAME --docs DIRECTORY [OPTIONS] + +Options: + --backend {hnsw,diskann} Backend to use (default: hnsw) + --embedding-model MODEL Embedding model (default: facebook/contriever) + --graph-degree N Graph degree (default: 32) + --complexity N Build complexity (default: 64) + --force Force rebuild existing index + --compact Use compact storage (default: true) + --recompute Enable recomputation (default: true) +``` + +**Search Command:** +```bash +leann search INDEX_NAME QUERY [OPTIONS] + +Options: + --top-k N Number of results (default: 5) + --complexity N Search complexity (default: 64) + --recompute-embeddings Use recomputation for highest accuracy + --pruning-strategy {global,local,proportional} Pruning strategy (default: global) +``` + +**Ask Command:** +```bash +leann ask INDEX_NAME [OPTIONS] + +Options: + --llm {ollama,openai,hf} LLM provider (default: ollama) + --model MODEL Model name (default: qwen3:8b) + --interactive Interactive chat mode + --top-k N Retrieval count (default: 20) +``` + +</details>
## 🏗️ Architecture & How It Works diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 854265b..0b830b8 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -1,10 +1,6 @@ -#!/usr/bin/env python3 import argparse import asyncio -import sys from pathlib import Path -from typing import Optional -import os from llama_index.core import SimpleDirectoryReader from llama_index.core.node_parser import SentenceSplitter @@ -16,20 +12,20 @@ class LeannCLI: def __init__(self): self.indexes_dir = Path.home() / ".leann" / "indexes" self.indexes_dir.mkdir(parents=True, exist_ok=True) - + self.node_parser = SentenceSplitter( chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n" ) - + def get_index_path(self, index_name: str) -> str: index_dir = self.indexes_dir / index_name return str(index_dir / "documents.leann") - + def index_exists(self, index_name: str) -> bool: index_dir = self.indexes_dir / index_name meta_file = index_dir / "documents.leann.meta.json" return meta_file.exists() - + def create_parser(self) -> argparse.ArgumentParser: parser = argparse.ArgumentParser( prog="leann", @@ -41,24 +37,32 @@ Examples: leann search my-docs "query" # Search in my-docs index leann ask my-docs "question" # Ask my-docs index leann list # List all stored indexes - """ + """, ) - + subparsers = parser.add_subparsers(dest="command", help="Available commands") - + # Build command build_parser = subparsers.add_parser("build", help="Build document index") build_parser.add_argument("index_name", help="Index name") - build_parser.add_argument("--docs", type=str, required=True, help="Documents directory") - build_parser.add_argument("--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]) - build_parser.add_argument("--embedding-model", type=str, default="facebook/contriever") - build_parser.add_argument("--force", "-f", action="store_true", help="Force rebuild") + 
build_parser.add_argument( + "--docs", type=str, required=True, help="Documents directory" + ) + build_parser.add_argument( + "--backend", type=str, default="hnsw", choices=["hnsw", "diskann"] + ) + build_parser.add_argument( + "--embedding-model", type=str, default="facebook/contriever" + ) + build_parser.add_argument( + "--force", "-f", action="store_true", help="Force rebuild" + ) build_parser.add_argument("--graph-degree", type=int, default=32) build_parser.add_argument("--complexity", type=int, default=64) build_parser.add_argument("--num-threads", type=int, default=1) build_parser.add_argument("--compact", action="store_true", default=True) build_parser.add_argument("--recompute", action="store_true", default=True) - + # Search command search_parser = subparsers.add_parser("search", help="Search documents") search_parser.add_argument("index_name", help="Index name") @@ -68,12 +72,21 @@ Examples: search_parser.add_argument("--beam-width", type=int, default=1) search_parser.add_argument("--prune-ratio", type=float, default=0.0) search_parser.add_argument("--recompute-embeddings", action="store_true") - search_parser.add_argument("--pruning-strategy", choices=["global", "local", "proportional"], default="global") - + search_parser.add_argument( + "--pruning-strategy", + choices=["global", "local", "proportional"], + default="global", + ) + # Ask command ask_parser = subparsers.add_parser("ask", help="Ask questions") ask_parser.add_argument("index_name", help="Index name") - ask_parser.add_argument("--llm", type=str, default="ollama", choices=["simulated", "ollama", "hf", "openai"]) + ask_parser.add_argument( + "--llm", + type=str, + default="ollama", + choices=["simulated", "ollama", "hf", "openai"], + ) ask_parser.add_argument("--model", type=str, default="qwen3:8b") ask_parser.add_argument("--host", type=str, default="http://localhost:11434") ask_parser.add_argument("--interactive", "-i", action="store_true") @@ -82,81 +95,91 @@ Examples: 
ask_parser.add_argument("--beam-width", type=int, default=1) ask_parser.add_argument("--prune-ratio", type=float, default=0.0) ask_parser.add_argument("--recompute-embeddings", action="store_true") - ask_parser.add_argument("--pruning-strategy", choices=["global", "local", "proportional"], default="global") - + ask_parser.add_argument( + "--pruning-strategy", + choices=["global", "local", "proportional"], + default="global", + ) + # List command list_parser = subparsers.add_parser("list", help="List all indexes") - + return parser - + def list_indexes(self): print("Stored LEANN indexes:") - + if not self.indexes_dir.exists(): - print("No indexes found. Use 'leann build --docs ' to create one.") + print( + "No indexes found. Use 'leann build --docs ' to create one." + ) return - + index_dirs = [d for d in self.indexes_dir.iterdir() if d.is_dir()] - + if not index_dirs: - print("No indexes found. Use 'leann build --docs ' to create one.") + print( + "No indexes found. Use 'leann build --docs ' to create one." + ) return - + print(f"Found {len(index_dirs)} indexes:") for i, index_dir in enumerate(index_dirs, 1): index_name = index_dir.name status = "✓" if self.index_exists(index_name) else "✗" - + print(f" {i}. 
{index_name} [{status}]") if self.index_exists(index_name): meta_file = index_dir / "documents.leann.meta.json" - size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (1024 * 1024) + size_mb = sum( + f.stat().st_size for f in index_dir.iterdir() if f.is_file() + ) / (1024 * 1024) print(f" Size: {size_mb:.1f} MB") - + if index_dirs: example_name = index_dirs[0].name print(f"\nUsage:") - print(f" leann search {example_name} \"your query\"") + print(f' leann search {example_name} "your query"') print(f" leann ask {example_name} --interactive") - + def load_documents(self, docs_dir: str): print(f"Loading documents from {docs_dir}...") - + documents = SimpleDirectoryReader( docs_dir, recursive=True, encoding="utf-8", required_exts=[".pdf", ".txt", ".md", ".docx"], ).load_data(show_progress=True) - + all_texts = [] for doc in documents: nodes = self.node_parser.get_nodes_from_documents([doc]) for node in nodes: all_texts.append(node.get_content()) - + print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks") return all_texts - + async def build_index(self, args): docs_dir = args.docs index_name = args.index_name index_dir = self.indexes_dir / index_name index_path = self.get_index_path(index_name) - + if index_dir.exists() and not args.force: print(f"Index '{index_name}' already exists. 
Use --force to rebuild.") return - + all_texts = self.load_documents(docs_dir) if not all_texts: print("No documents found") return - + index_dir.mkdir(parents=True, exist_ok=True) - + print(f"Building index '{index_name}' with {args.backend} backend...") - + builder = LeannBuilder( backend_name=args.backend, embedding_model=args.embedding_model, @@ -166,103 +189,107 @@ Examples: is_recompute=args.recompute, num_threads=args.num_threads, ) - + for chunk_text in all_texts: builder.add_text(chunk_text) - + builder.build_index(index_path) print(f"Index built at {index_path}") - + async def search_documents(self, args): index_name = args.index_name query = args.query index_path = self.get_index_path(index_name) - + if not self.index_exists(index_name): - print(f"Index '{index_name}' not found. Use 'leann build {index_name} --docs ' to create it.") + print( + f"Index '{index_name}' not found. Use 'leann build {index_name} --docs ' to create it." + ) return - + searcher = LeannSearcher(index_path=index_path) results = searcher.search( - query, + query, top_k=args.top_k, complexity=args.complexity, beam_width=args.beam_width, prune_ratio=args.prune_ratio, recompute_embeddings=args.recompute_embeddings, - pruning_strategy=args.pruning_strategy + pruning_strategy=args.pruning_strategy, ) - + print(f"Search results for '{query}' (top {len(results)}):") for i, result in enumerate(results, 1): print(f"{i}. Score: {result.score:.3f}") print(f" {result.text[:200]}...") print() - + async def ask_questions(self, args): index_name = args.index_name index_path = self.get_index_path(index_name) - + if not self.index_exists(index_name): - print(f"Index '{index_name}' not found. Use 'leann build {index_name} --docs ' to create it.") + print( + f"Index '{index_name}' not found. Use 'leann build {index_name} --docs ' to create it." 
+ ) return - + print(f"Starting chat with index '{index_name}'...") print(f"Using {args.model} ({args.llm})") - + llm_config = {"type": args.llm, "model": args.model} if args.llm == "ollama": llm_config["host"] = args.host - + chat = LeannChat(index_path=index_path, llm_config=llm_config) - + if args.interactive: print("LEANN Assistant ready! Type 'quit' to exit") print("=" * 40) - + while True: user_input = input("\nYou: ").strip() - if user_input.lower() in ['quit', 'exit', 'q']: + if user_input.lower() in ["quit", "exit", "q"]: print("Goodbye!") break - + if not user_input: continue - + response = chat.ask( - user_input, + user_input, top_k=args.top_k, complexity=args.complexity, beam_width=args.beam_width, prune_ratio=args.prune_ratio, recompute_embeddings=args.recompute_embeddings, - pruning_strategy=args.pruning_strategy + pruning_strategy=args.pruning_strategy, ) print(f"LEANN: {response}") else: query = input("Enter your question: ").strip() if query: response = chat.ask( - query, + query, top_k=args.top_k, complexity=args.complexity, beam_width=args.beam_width, prune_ratio=args.prune_ratio, recompute_embeddings=args.recompute_embeddings, - pruning_strategy=args.pruning_strategy + pruning_strategy=args.pruning_strategy, ) print(f"LEANN: {response}") - + async def run(self, args=None): parser = self.create_parser() - + if args is None: args = parser.parse_args() - + if not args.command: parser.print_help() return - + if args.command == "list": self.list_indexes() elif args.command == "build": @@ -277,11 +304,12 @@ Examples: def main(): import dotenv + dotenv.load_dotenv() - + cli = LeannCLI() asyncio.run(cli.run()) if __name__ == "__main__": - main() \ No newline at end of file + main()