Compare commits

..

11 Commits

Author SHA1 Message Date
Andy Lee
b55eeeae5f Merge remote-tracking branch 'origin/main' into feature/claude-code-research 2025-08-05 23:02:00 -07:00
Andy Lee
e890b2311f feat: Add Claude Code integration with MCP server 2025-08-05 14:03:36 -07:00
Andy Lee
f3d99fd118 feat: Claude Code integration ready - LEANN CLI works out of the box
- Verified LEANN CLI works perfectly with Claude Code
- Added integration guide with working examples
- Documented simple workflow for immediate use

Key findings:
- No code changes needed
- Just need --recompute-embeddings flag
- Search, ask, and build all work
- Ready for Claude Code agents and workflows
2025-08-05 12:27:58 -07:00
Andy Lee
8eee90bf80 docs: add a link 2025-08-04 20:10:14 -07:00
Andy Lee
649d4ad03e docs: Address all configuration guide feedback
- Fix grammar: 'If time is not a constraint' instead of 'time expense is not large'
- Highlight Qwen3-Embedding-0.6B performance (nearly OpenAI API level)
- Add OpenAI quick start section with configuration example
- Fold Cloud vs Local trade-offs into collapsible section
- Update HNSW as 'default and recommended for extreme low storage'
- Add DiskANN beta warning and explain PQ+rerank architecture
- Expand Ollama models: add qwen3:0.6b, 4b, 7b variants
- Note OpenAI as the current default but recommend switching to Ollama
- Add 'need to install extra software' warning for Ollama
- Remove incorrect latency numbers from search-complexity recommendations
2025-08-04 20:01:23 -07:00
Andy Lee
d9b6f195c5 docs: Improve configuration guide based on feedback
- List specific files in default data/ directory (2 AI papers, literature, tech report)
- Update examples to use English and better RAG-suitable queries
- Change full dataset reference to use --max-items -1
- Adjust small model guidance about upgrading to larger models when time allows
- Update top-k defaults to reflect actual default of 20
- Ensure consistent use of full model name Qwen/Qwen3-Embedding-0.6B
- Reorder optimization steps, move MLX to third position
- Remove incorrect chunk size tuning guidance
- Change README from 'Having trouble' to 'Need best practices'
2025-08-04 19:29:17 -07:00
Andy Lee
00f506c0bd docs: Adjust DiskANN positioning in features and roadmap
- features.md: Put HNSW/FAISS first as default, DiskANN as optional
- roadmap.md: Reorder to show HNSW integration before DiskANN
- Consistent with positioning DiskANN as advanced option for large-scale use
2025-08-04 17:53:27 -07:00
Andy Lee
e872dd1d23 docs: Weaken DiskANN emphasis in README
- Change backend description to emphasize HNSW as default
- DiskANN positioned as optional for billion-scale datasets
- Simplify evaluation commands to be more generic
2025-08-04 17:51:21 -07:00
Andy Lee
063c687ff7 chore: move evaluation data .gitattributes to correct location 2025-08-04 17:46:17 -07:00
Andy Lee
bb8ecd54d7 feat: add comprehensive configuration guide and update README
- Create docs/configuration-guide.md with detailed guidance on:
  - Embedding model selection (small/medium/large)
  - Index selection (HNSW vs DiskANN)
  - LLM engine and model comparison
  - Parameter tuning (build/search complexity, top-k)
  - Performance optimization tips
  - Deep dive into LEANN's recomputation feature
- Update README.md to link to the configuration guide
- Include latest 2025 model recommendations (Qwen3, DeepSeek-R1, O3-mini)
2025-08-04 17:41:27 -07:00
Andy Lee
716217ae24 docs: config guidance 2025-08-04 16:21:13 -07:00
6 changed files with 75 additions and 108 deletions

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-diskann" name = "leann-backend-diskann"
version = "0.2.2" version = "0.2.1"
dependencies = ["leann-core==0.2.2", "numpy", "protobuf>=3.19.0"] dependencies = ["leann-core==0.2.1", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build] [tool.scikit-build]
# Key: simplified CMake path # Key: simplified CMake path

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-hnsw" name = "leann-backend-hnsw"
version = "0.2.2" version = "0.2.1"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [ dependencies = [
"leann-core==0.2.2", "leann-core==0.2.1",
"numpy", "numpy",
"pyzmq>=23.0.0", "pyzmq>=23.0.0",
"msgpack>=1.0.0", "msgpack>=1.0.0",

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann-core" name = "leann-core"
version = "0.2.2" version = "0.2.1"
description = "Core API and plugin system for LEANN" description = "Core API and plugin system for LEANN"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -74,11 +74,10 @@ class LeannCLI:
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
leann build my-docs --docs ./documents # Build index named my-docs leann build my-docs --docs ./documents # Build index named my-docs
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files leann search my-docs "query" # Search in my-docs index
leann search my-docs "query" # Search in my-docs index leann ask my-docs "question" # Ask my-docs index
leann ask my-docs "question" # Ask my-docs index leann list # List all stored indexes
leann list # List all stored indexes
""", """,
) )
@@ -100,11 +99,6 @@ Examples:
build_parser.add_argument("--num-threads", type=int, default=1) build_parser.add_argument("--num-threads", type=int, default=1)
build_parser.add_argument("--compact", action="store_true", default=True) build_parser.add_argument("--compact", action="store_true", default=True)
build_parser.add_argument("--recompute", action="store_true", default=True) build_parser.add_argument("--recompute", action="store_true", default=True)
build_parser.add_argument(
"--file-types",
type=str,
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
)
# Search command # Search command
search_parser = subparsers.add_parser("search", help="Search documents") search_parser = subparsers.add_parser("search", help="Search documents")
@@ -114,12 +108,7 @@ Examples:
search_parser.add_argument("--complexity", type=int, default=64) search_parser.add_argument("--complexity", type=int, default=64)
search_parser.add_argument("--beam-width", type=int, default=1) search_parser.add_argument("--beam-width", type=int, default=1)
search_parser.add_argument("--prune-ratio", type=float, default=0.0) search_parser.add_argument("--prune-ratio", type=float, default=0.0)
search_parser.add_argument( search_parser.add_argument("--recompute-embeddings", action="store_true")
"--recompute-embeddings",
action="store_true",
default=True,
help="Recompute embeddings (default: True)",
)
search_parser.add_argument( search_parser.add_argument(
"--pruning-strategy", "--pruning-strategy",
choices=["global", "local", "proportional"], choices=["global", "local", "proportional"],
@@ -142,12 +131,7 @@ Examples:
ask_parser.add_argument("--complexity", type=int, default=32) ask_parser.add_argument("--complexity", type=int, default=32)
ask_parser.add_argument("--beam-width", type=int, default=1) ask_parser.add_argument("--beam-width", type=int, default=1)
ask_parser.add_argument("--prune-ratio", type=float, default=0.0) ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
ask_parser.add_argument( ask_parser.add_argument("--recompute-embeddings", action="store_true")
"--recompute-embeddings",
action="store_true",
default=True,
help="Recompute embeddings (default: True)",
)
ask_parser.add_argument( ask_parser.add_argument(
"--pruning-strategy", "--pruning-strategy",
choices=["global", "local", "proportional"], choices=["global", "local", "proportional"],
@@ -270,10 +254,8 @@ Examples:
print(f' leann search {example_name} "your query"') print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive") print(f" leann ask {example_name} --interactive")
def load_documents(self, docs_dir: str, custom_file_types: str | None = None): def load_documents(self, docs_dir: str):
print(f"Loading documents from {docs_dir}...") print(f"Loading documents from {docs_dir}...")
if custom_file_types:
print(f"Using custom file types: {custom_file_types}")
# Try to use better PDF parsers first # Try to use better PDF parsers first
documents = [] documents = []
@@ -305,81 +287,66 @@ Examples:
documents.extend(default_docs) documents.extend(default_docs)
# Load other file types with default reader # Load other file types with default reader
if custom_file_types: code_extensions = [
# Parse custom file types from comma-separated string # Original document types
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()] ".txt",
# Ensure extensions start with a dot ".md",
code_extensions = [ext if ext.startswith(".") else f".{ext}" for ext in code_extensions] ".docx",
else: # Code files for Claude Code integration
# Use default supported file types ".py",
code_extensions = [ ".js",
# Original document types ".ts",
".txt", ".jsx",
".md", ".tsx",
".docx", ".java",
".pptx", ".cpp",
# Code files for Claude Code integration ".c",
".py", ".h",
".js", ".hpp",
".ts", ".cs",
".jsx", ".go",
".tsx", ".rs",
".java", ".rb",
".cpp", ".php",
".c", ".swift",
".h", ".kt",
".hpp", ".scala",
".cs", ".r",
".go", ".sql",
".rs", ".sh",
".rb", ".bash",
".php", ".zsh",
".swift", ".fish",
".kt", ".ps1",
".scala", ".bat",
".r", # Config and markup files
".sql", ".json",
".sh", ".yaml",
".bash", ".yml",
".zsh", ".xml",
".fish", ".toml",
".ps1", ".ini",
".bat", ".cfg",
# Config and markup files ".conf",
".json", ".html",
".yaml", ".css",
".yml", ".scss",
".xml", ".less",
".toml", ".vue",
".ini", ".svelte",
".cfg", # Data science
".conf", ".ipynb",
".html", ".R",
".css", ".py",
".scss", ".jl",
".less", ]
".vue", other_docs = SimpleDirectoryReader(
".svelte", docs_dir,
# Data science recursive=True,
".ipynb", encoding="utf-8",
".R", required_exts=code_extensions,
".py", ).load_data(show_progress=True)
".jl", documents.extend(other_docs)
]
# Try to load other file types, but don't fail if none are found
try:
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=code_extensions,
).load_data(show_progress=True)
documents.extend(other_docs)
except ValueError as e:
if "No files found" in str(e):
print("No additional files found for other supported types.")
else:
raise e
all_texts = [] all_texts = []
@@ -457,7 +424,7 @@ Examples:
print(f"Index '{index_name}' already exists. Use --force to rebuild.") print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return return
all_texts = self.load_documents(docs_dir, args.file_types) all_texts = self.load_documents(docs_dir)
if not all_texts: if not all_texts:
print("No documents found") print("No documents found")
return return

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann" name = "leann"
version = "0.2.2" version = "0.2.1"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"