Compare commits

..

5 Commits

Author SHA1 Message Date
GitHub Actions
075d4bd167 chore: release v0.2.2 2025-08-08 01:58:40 +00:00
yichuan520030910320
e4bcc76f88 fix cli & make recompute default true 2025-08-07 18:58:04 -07:00
yichuan520030910320
710e83b1fd fix cli if there is no other type of doc to make it robust 2025-08-07 18:46:05 -07:00
yichuan520030910320
c96d653072 more support for type of docs in cli 2025-08-07 18:14:03 -07:00
Andy Lee
8b22d2b5d3 Merge pull request #19 from yichuan-w/feature/claude-code-research
Feature/claude code research
2025-08-05 23:02:34 -07:00
6 changed files with 108 additions and 75 deletions

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-diskann"
version = "0.2.1"
dependencies = ["leann-core==0.2.1", "numpy", "protobuf>=3.19.0"]
version = "0.2.2"
dependencies = ["leann-core==0.2.2", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build]
# Key: simplified CMake path

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-hnsw"
version = "0.2.1"
version = "0.2.2"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
"leann-core==0.2.1",
"leann-core==0.2.2",
"numpy",
"pyzmq>=23.0.0",
"msgpack>=1.0.0",

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann-core"
version = "0.2.1"
version = "0.2.2"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.9"

View File

@@ -74,10 +74,11 @@ class LeannCLI:
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
leann build my-docs --docs ./documents # Build index named my-docs
leann search my-docs "query" # Search in my-docs index
leann ask my-docs "question" # Ask my-docs index
leann list # List all stored indexes
leann build my-docs --docs ./documents # Build index named my-docs
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
leann search my-docs "query" # Search in my-docs index
leann ask my-docs "question" # Ask my-docs index
leann list # List all stored indexes
""",
)
@@ -99,6 +100,11 @@ Examples:
build_parser.add_argument("--num-threads", type=int, default=1)
build_parser.add_argument("--compact", action="store_true", default=True)
build_parser.add_argument("--recompute", action="store_true", default=True)
build_parser.add_argument(
"--file-types",
type=str,
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
)
# Search command
search_parser = subparsers.add_parser("search", help="Search documents")
@@ -108,7 +114,12 @@ Examples:
search_parser.add_argument("--complexity", type=int, default=64)
search_parser.add_argument("--beam-width", type=int, default=1)
search_parser.add_argument("--prune-ratio", type=float, default=0.0)
search_parser.add_argument("--recompute-embeddings", action="store_true")
search_parser.add_argument(
"--recompute-embeddings",
action="store_true",
default=True,
help="Recompute embeddings (default: True)",
)
search_parser.add_argument(
"--pruning-strategy",
choices=["global", "local", "proportional"],
@@ -131,7 +142,12 @@ Examples:
ask_parser.add_argument("--complexity", type=int, default=32)
ask_parser.add_argument("--beam-width", type=int, default=1)
ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
ask_parser.add_argument("--recompute-embeddings", action="store_true")
ask_parser.add_argument(
"--recompute-embeddings",
action="store_true",
default=True,
help="Recompute embeddings (default: True)",
)
ask_parser.add_argument(
"--pruning-strategy",
choices=["global", "local", "proportional"],
@@ -254,8 +270,10 @@ Examples:
print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive")
def load_documents(self, docs_dir: str):
def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
print(f"Loading documents from {docs_dir}...")
if custom_file_types:
print(f"Using custom file types: {custom_file_types}")
# Try to use better PDF parsers first
documents = []
@@ -287,66 +305,81 @@ Examples:
documents.extend(default_docs)
# Load other file types with default reader
code_extensions = [
# Original document types
".txt",
".md",
".docx",
# Code files for Claude Code integration
".py",
".js",
".ts",
".jsx",
".tsx",
".java",
".cpp",
".c",
".h",
".hpp",
".cs",
".go",
".rs",
".rb",
".php",
".swift",
".kt",
".scala",
".r",
".sql",
".sh",
".bash",
".zsh",
".fish",
".ps1",
".bat",
# Config and markup files
".json",
".yaml",
".yml",
".xml",
".toml",
".ini",
".cfg",
".conf",
".html",
".css",
".scss",
".less",
".vue",
".svelte",
# Data science
".ipynb",
".R",
".py",
".jl",
]
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=code_extensions,
).load_data(show_progress=True)
documents.extend(other_docs)
if custom_file_types:
# Parse custom file types from comma-separated string
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
# Ensure extensions start with a dot
code_extensions = [ext if ext.startswith(".") else f".{ext}" for ext in code_extensions]
else:
# Use default supported file types
code_extensions = [
# Original document types
".txt",
".md",
".docx",
".pptx",
# Code files for Claude Code integration
".py",
".js",
".ts",
".jsx",
".tsx",
".java",
".cpp",
".c",
".h",
".hpp",
".cs",
".go",
".rs",
".rb",
".php",
".swift",
".kt",
".scala",
".r",
".sql",
".sh",
".bash",
".zsh",
".fish",
".ps1",
".bat",
# Config and markup files
".json",
".yaml",
".yml",
".xml",
".toml",
".ini",
".cfg",
".conf",
".html",
".css",
".scss",
".less",
".vue",
".svelte",
# Data science
".ipynb",
".R",
".py",
".jl",
]
# Try to load other file types, but don't fail if none are found
try:
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=code_extensions,
).load_data(show_progress=True)
documents.extend(other_docs)
except ValueError as e:
if "No files found" in str(e):
print("No additional files found for other supported types.")
else:
raise e
all_texts = []
@@ -424,7 +457,7 @@ Examples:
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return
all_texts = self.load_documents(docs_dir)
all_texts = self.load_documents(docs_dir, args.file_types)
if not all_texts:
print("No documents found")
return

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann"
version = "0.2.1"
version = "0.2.2"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"