add chunk size in leann build& fix batch size in oai& docs

This commit is contained in:
yichuan520030910320
2025-08-14 13:14:14 -07:00
parent fafdf8fcbe
commit 42c8370709
3 changed files with 70 additions and 3 deletions

View File

@@ -123,6 +123,30 @@ Examples:
type=str,
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
)
build_parser.add_argument(
"--doc-chunk-size",
type=int,
default=256,
help="Document chunk size in tokens/characters (default: 256)",
)
build_parser.add_argument(
"--doc-chunk-overlap",
type=int,
default=128,
help="Document chunk overlap (default: 128)",
)
build_parser.add_argument(
"--code-chunk-size",
type=int,
default=512,
help="Code chunk size in tokens/lines (default: 512)",
)
build_parser.add_argument(
"--code-chunk-overlap",
type=int,
default=50,
help="Code chunk overlap (default: 50)",
)
# Search command
search_parser = subparsers.add_parser("search", help="Search documents")
@@ -687,6 +711,37 @@ Examples:
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return
# Configure chunking based on CLI args before loading documents
# Guard against invalid configurations
doc_chunk_size = max(1, int(args.doc_chunk_size))
doc_chunk_overlap = max(0, int(args.doc_chunk_overlap))
if doc_chunk_overlap >= doc_chunk_size:
print(
f"⚠️ Adjusting doc chunk overlap from {doc_chunk_overlap} to {doc_chunk_size - 1} (must be < chunk size)"
)
doc_chunk_overlap = doc_chunk_size - 1
code_chunk_size = max(1, int(args.code_chunk_size))
code_chunk_overlap = max(0, int(args.code_chunk_overlap))
if code_chunk_overlap >= code_chunk_size:
print(
f"⚠️ Adjusting code chunk overlap from {code_chunk_overlap} to {code_chunk_size - 1} (must be < chunk size)"
)
code_chunk_overlap = code_chunk_size - 1
self.node_parser = SentenceSplitter(
chunk_size=doc_chunk_size,
chunk_overlap=doc_chunk_overlap,
separator=" ",
paragraph_separator="\n\n",
)
self.code_parser = SentenceSplitter(
chunk_size=code_chunk_size,
chunk_overlap=code_chunk_overlap,
separator="\n",
paragraph_separator="\n\n",
)
all_texts = self.load_documents(docs_paths, args.file_types)
if not all_texts:
print("No documents found")

View File

@@ -263,8 +263,16 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
print(f"len of texts: {len(texts)}")
# OpenAI has limits on batch size and input length
max_batch_size = 1000 # Conservative batch size
max_batch_size = 800 # Conservative batch size because the token limit is 300K
all_embeddings = []
# get the avg len of texts
avg_len = sum(len(text) for text in texts) / len(texts)
print(f"avg len of texts: {avg_len}")
# if avg len is less than 1000, use the max batch size
if avg_len > 300:
max_batch_size = 500
# if avg len is less than 1000, use the max batch size
try:
from tqdm import tqdm

View File

@@ -41,11 +41,15 @@ claude
### Index Entire Git Repository
```bash
# Index all tracked files in your git repository, note right now we will skip submodules, but we can add it back easily if you want
# Index all tracked files in your Git repository.
# Note: submodules are currently skipped; we can add them back if needed.
leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
# Index only specific file types from git
# Index only tracked Python files from Git.
leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
# If you encounter empty requests caused by empty files (e.g., __init__.py), exclude zero-byte files. thanks @ww2283 for pointing (that)[https://github.com/yichuan-w/LEANN/issues/48] out
leann build leann-prospec-lig --docs $(find ./src -name "*.py" -not -empty) --embedding-mode openai --embedding-model text-embedding-3-small
```
### Multiple Directories and Files