Add support for more document types in the CLI
This commit is contained in:
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: 67a2611ad1...b2dc4ea2c7
@@ -74,10 +74,11 @@ class LeannCLI:
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
leann build my-docs --docs ./documents # Build index named my-docs
|
leann build my-docs --docs ./documents # Build index named my-docs
|
||||||
leann search my-docs "query" # Search in my-docs index
|
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
|
||||||
leann ask my-docs "question" # Ask my-docs index
|
leann search my-docs "query" # Search in my-docs index
|
||||||
leann list # List all stored indexes
|
leann ask my-docs "question" # Ask my-docs index
|
||||||
|
leann list # List all stored indexes
|
||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -99,6 +100,11 @@ Examples:
|
|||||||
build_parser.add_argument("--num-threads", type=int, default=1)
|
build_parser.add_argument("--num-threads", type=int, default=1)
|
||||||
build_parser.add_argument("--compact", action="store_true", default=True)
|
build_parser.add_argument("--compact", action="store_true", default=True)
|
||||||
build_parser.add_argument("--recompute", action="store_true", default=True)
|
build_parser.add_argument("--recompute", action="store_true", default=True)
|
||||||
|
build_parser.add_argument(
|
||||||
|
"--file-types",
|
||||||
|
type=str,
|
||||||
|
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
|
||||||
|
)
|
||||||
|
|
||||||
# Search command
|
# Search command
|
||||||
search_parser = subparsers.add_parser("search", help="Search documents")
|
search_parser = subparsers.add_parser("search", help="Search documents")
|
||||||
@@ -254,8 +260,10 @@ Examples:
|
|||||||
print(f' leann search {example_name} "your query"')
|
print(f' leann search {example_name} "your query"')
|
||||||
print(f" leann ask {example_name} --interactive")
|
print(f" leann ask {example_name} --interactive")
|
||||||
|
|
||||||
def load_documents(self, docs_dir: str):
|
def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
|
||||||
print(f"Loading documents from {docs_dir}...")
|
print(f"Loading documents from {docs_dir}...")
|
||||||
|
if custom_file_types:
|
||||||
|
print(f"Using custom file types: {custom_file_types}")
|
||||||
|
|
||||||
# Try to use better PDF parsers first
|
# Try to use better PDF parsers first
|
||||||
documents = []
|
documents = []
|
||||||
@@ -287,59 +295,67 @@ Examples:
|
|||||||
documents.extend(default_docs)
|
documents.extend(default_docs)
|
||||||
|
|
||||||
# Load other file types with default reader
|
# Load other file types with default reader
|
||||||
code_extensions = [
|
if custom_file_types:
|
||||||
# Original document types
|
# Parse custom file types from comma-separated string
|
||||||
".txt",
|
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
|
||||||
".md",
|
# Ensure extensions start with a dot
|
||||||
".docx",
|
code_extensions = [ext if ext.startswith(".") else f".{ext}" for ext in code_extensions]
|
||||||
# Code files for Claude Code integration
|
else:
|
||||||
".py",
|
# Use default supported file types
|
||||||
".js",
|
code_extensions = [
|
||||||
".ts",
|
# Original document types
|
||||||
".jsx",
|
".txt",
|
||||||
".tsx",
|
".md",
|
||||||
".java",
|
".docx",
|
||||||
".cpp",
|
".pptx",
|
||||||
".c",
|
# Code files for Claude Code integration
|
||||||
".h",
|
".py",
|
||||||
".hpp",
|
".js",
|
||||||
".cs",
|
".ts",
|
||||||
".go",
|
".jsx",
|
||||||
".rs",
|
".tsx",
|
||||||
".rb",
|
".java",
|
||||||
".php",
|
".cpp",
|
||||||
".swift",
|
".c",
|
||||||
".kt",
|
".h",
|
||||||
".scala",
|
".hpp",
|
||||||
".r",
|
".cs",
|
||||||
".sql",
|
".go",
|
||||||
".sh",
|
".rs",
|
||||||
".bash",
|
".rb",
|
||||||
".zsh",
|
".php",
|
||||||
".fish",
|
".swift",
|
||||||
".ps1",
|
".kt",
|
||||||
".bat",
|
".scala",
|
||||||
# Config and markup files
|
".r",
|
||||||
".json",
|
".sql",
|
||||||
".yaml",
|
".sh",
|
||||||
".yml",
|
".bash",
|
||||||
".xml",
|
".zsh",
|
||||||
".toml",
|
".fish",
|
||||||
".ini",
|
".ps1",
|
||||||
".cfg",
|
".bat",
|
||||||
".conf",
|
# Config and markup files
|
||||||
".html",
|
".json",
|
||||||
".css",
|
".yaml",
|
||||||
".scss",
|
".yml",
|
||||||
".less",
|
".xml",
|
||||||
".vue",
|
".toml",
|
||||||
".svelte",
|
".ini",
|
||||||
# Data science
|
".cfg",
|
||||||
".ipynb",
|
".conf",
|
||||||
".R",
|
".html",
|
||||||
".py",
|
".css",
|
||||||
".jl",
|
".scss",
|
||||||
]
|
".less",
|
||||||
|
".vue",
|
||||||
|
".svelte",
|
||||||
|
# Data science
|
||||||
|
".ipynb",
|
||||||
|
".R",
|
||||||
|
".py",
|
||||||
|
".jl",
|
||||||
|
]
|
||||||
other_docs = SimpleDirectoryReader(
|
other_docs = SimpleDirectoryReader(
|
||||||
docs_dir,
|
docs_dir,
|
||||||
recursive=True,
|
recursive=True,
|
||||||
@@ -424,7 +440,7 @@ Examples:
|
|||||||
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
|
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
|
||||||
return
|
return
|
||||||
|
|
||||||
all_texts = self.load_documents(docs_dir)
|
all_texts = self.load_documents(docs_dir, args.file_types)
|
||||||
if not all_texts:
|
if not all_texts:
|
||||||
print("No documents found")
|
print("No documents found")
|
||||||
return
|
return
|
||||||
|
|||||||
Reference in New Issue
Block a user