Add support for more document types in the CLI (new --file-types option)

This commit is contained in:
yichuan520030910320
2025-08-07 18:14:03 -07:00
parent 8b22d2b5d3
commit c96d653072
2 changed files with 76 additions and 60 deletions

View File

@@ -74,10 +74,11 @@ class LeannCLI:
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
leann build my-docs --docs ./documents # Build index named my-docs leann build my-docs --docs ./documents # Build index named my-docs
leann search my-docs "query" # Search in my-docs index leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
leann ask my-docs "question" # Ask my-docs index leann search my-docs "query" # Search in my-docs index
leann list # List all stored indexes leann ask my-docs "question" # Ask my-docs index
leann list # List all stored indexes
""", """,
) )
@@ -99,6 +100,11 @@ Examples:
build_parser.add_argument("--num-threads", type=int, default=1) build_parser.add_argument("--num-threads", type=int, default=1)
build_parser.add_argument("--compact", action="store_true", default=True) build_parser.add_argument("--compact", action="store_true", default=True)
build_parser.add_argument("--recompute", action="store_true", default=True) build_parser.add_argument("--recompute", action="store_true", default=True)
build_parser.add_argument(
"--file-types",
type=str,
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
)
# Search command # Search command
search_parser = subparsers.add_parser("search", help="Search documents") search_parser = subparsers.add_parser("search", help="Search documents")
@@ -254,8 +260,10 @@ Examples:
print(f' leann search {example_name} "your query"') print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive") print(f" leann ask {example_name} --interactive")
def load_documents(self, docs_dir: str): def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
print(f"Loading documents from {docs_dir}...") print(f"Loading documents from {docs_dir}...")
if custom_file_types:
print(f"Using custom file types: {custom_file_types}")
# Try to use better PDF parsers first # Try to use better PDF parsers first
documents = [] documents = []
@@ -287,59 +295,67 @@ Examples:
documents.extend(default_docs) documents.extend(default_docs)
# Load other file types with default reader # Load other file types with default reader
code_extensions = [ if custom_file_types:
# Original document types # Parse custom file types from comma-separated string
".txt", code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
".md", # Ensure extensions start with a dot
".docx", code_extensions = [ext if ext.startswith(".") else f".{ext}" for ext in code_extensions]
# Code files for Claude Code integration else:
".py", # Use default supported file types
".js", code_extensions = [
".ts", # Original document types
".jsx", ".txt",
".tsx", ".md",
".java", ".docx",
".cpp", ".pptx",
".c", # Code files for Claude Code integration
".h", ".py",
".hpp", ".js",
".cs", ".ts",
".go", ".jsx",
".rs", ".tsx",
".rb", ".java",
".php", ".cpp",
".swift", ".c",
".kt", ".h",
".scala", ".hpp",
".r", ".cs",
".sql", ".go",
".sh", ".rs",
".bash", ".rb",
".zsh", ".php",
".fish", ".swift",
".ps1", ".kt",
".bat", ".scala",
# Config and markup files ".r",
".json", ".sql",
".yaml", ".sh",
".yml", ".bash",
".xml", ".zsh",
".toml", ".fish",
".ini", ".ps1",
".cfg", ".bat",
".conf", # Config and markup files
".html", ".json",
".css", ".yaml",
".scss", ".yml",
".less", ".xml",
".vue", ".toml",
".svelte", ".ini",
# Data science ".cfg",
".ipynb", ".conf",
".R", ".html",
".py", ".css",
".jl", ".scss",
] ".less",
".vue",
".svelte",
# Data science
".ipynb",
".R",
".py",
".jl",
]
other_docs = SimpleDirectoryReader( other_docs = SimpleDirectoryReader(
docs_dir, docs_dir,
recursive=True, recursive=True,
@@ -424,7 +440,7 @@ Examples:
print(f"Index '{index_name}' already exists. Use --force to rebuild.") print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return return
all_texts = self.load_documents(docs_dir) all_texts = self.load_documents(docs_dir, args.file_types)
if not all_texts: if not all_texts:
print("No documents found") print("No documents found")
return return