diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 9f6911f..6de52c2 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -123,6 +123,30 @@ Examples:
             type=str,
             help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
         )
+        build_parser.add_argument(
+            "--doc-chunk-size",
+            type=int,
+            default=256,
+            help="Document chunk size in tokens/characters (default: 256)",
+        )
+        build_parser.add_argument(
+            "--doc-chunk-overlap",
+            type=int,
+            default=128,
+            help="Document chunk overlap (default: 128)",
+        )
+        build_parser.add_argument(
+            "--code-chunk-size",
+            type=int,
+            default=512,
+            help="Code chunk size in tokens/lines (default: 512)",
+        )
+        build_parser.add_argument(
+            "--code-chunk-overlap",
+            type=int,
+            default=50,
+            help="Code chunk overlap (default: 50)",
+        )
 
         # Search command
         search_parser = subparsers.add_parser("search", help="Search documents")
@@ -687,6 +711,37 @@ Examples:
             print(f"Index '{index_name}' already exists. Use --force to rebuild.")
             return
 
+        # Configure chunking based on CLI args before loading documents
+        # Guard against invalid configurations
+        doc_chunk_size = max(1, int(args.doc_chunk_size))
+        doc_chunk_overlap = max(0, int(args.doc_chunk_overlap))
+        if doc_chunk_overlap >= doc_chunk_size:
+            print(
+                f"⚠️ Adjusting doc chunk overlap from {doc_chunk_overlap} to {doc_chunk_size - 1} (must be < chunk size)"
+            )
+            doc_chunk_overlap = doc_chunk_size - 1
+
+        code_chunk_size = max(1, int(args.code_chunk_size))
+        code_chunk_overlap = max(0, int(args.code_chunk_overlap))
+        if code_chunk_overlap >= code_chunk_size:
+            print(
+                f"⚠️ Adjusting code chunk overlap from {code_chunk_overlap} to {code_chunk_size - 1} (must be < chunk size)"
+            )
+            code_chunk_overlap = code_chunk_size - 1
+
+        self.node_parser = SentenceSplitter(
+            chunk_size=doc_chunk_size,
+            chunk_overlap=doc_chunk_overlap,
+            separator=" ",
+            paragraph_separator="\n\n",
+        )
+        self.code_parser = SentenceSplitter(
+            chunk_size=code_chunk_size,
+            chunk_overlap=code_chunk_overlap,
+            separator="\n",
+            paragraph_separator="\n\n",
+        )
+
         all_texts = self.load_documents(docs_paths, args.file_types)
         if not all_texts:
             print("No documents found")
diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py
index 1a19835..9cce58c 100644
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -263,8 +263,14 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
     print(f"len of texts: {len(texts)}")
 
     # OpenAI has limits on batch size and input length
-    max_batch_size = 1000  # Conservative batch size
+    max_batch_size = 800  # Conservative batch size; the OpenAI per-request token limit is 300K
     all_embeddings = []
+    # Compute the average text length so the batch size can adapt to longer inputs
+    avg_len = sum(len(text) for text in texts) / len(texts)
+    print(f"avg len of texts: {avg_len}")
+    # For longer texts (avg > 300 chars), shrink the batch to stay under the token limit
+    if avg_len > 300:
+        max_batch_size = 500
 
     try:
         from tqdm import tqdm
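For reviewers, a quick usage sketch of the new chunking flags added to `leann build` above. The index name and paths are illustrative; the flag names and defaults come from the `build_parser` additions, and an overlap that is not smaller than its chunk size is clamped to `chunk_size - 1` with the warning emitted by the guard code.

```bash
# Illustrative invocation of the new flags (index name and paths are made up)
leann build my-notes --docs ./docs \
  --doc-chunk-size 256 --doc-chunk-overlap 64 \
  --code-chunk-size 512 --code-chunk-overlap 50

# An overlap >= chunk size is clamped rather than rejected, e.g.:
#   ⚠️ Adjusting doc chunk overlap from 300 to 255 (must be < chunk size)
leann build my-notes --docs ./docs --doc-chunk-size 256 --doc-chunk-overlap 300
```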
diff --git a/packages/leann-mcp/README.md b/packages/leann-mcp/README.md
index c7909d6..78615fa 100644
--- a/packages/leann-mcp/README.md
+++ b/packages/leann-mcp/README.md
@@ -41,11 +41,16 @@ claude
 
 ### Index Entire Git Repository
 ```bash
-# Index all tracked files in your git repository, note right now we will skip submodules, but we can add it back easily if you want
+# Index all tracked files in your Git repository.
+# Note: submodules are currently skipped; we can add them back if needed.
 leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 
-# Index only specific file types from git
+# Index only tracked Python files from Git.
 leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# If you encounter empty requests caused by empty files (e.g., __init__.py), exclude zero-byte files.
+# Thanks to @ww2283 for pointing [that](https://github.com/yichuan-w/LEANN/issues/48) out.
+leann build leann-prospec-lig --docs $(find ./src -name "*.py" -not -empty) --embedding-mode openai --embedding-model text-embedding-3-small
 ```
 
 ### Multiple Directories and Files
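If the zero-byte filter should also respect the Git index (rather than scanning `./src` with `find`), a shell-only variant is sketched below. It assumes a POSIX shell and simple filenames; paths containing spaces would need extra quoting or null-delimited handling.

```bash
# Sketch: pass leann only non-empty, git-tracked Python files
# ([ -s FILE ] is true when FILE exists and has a size greater than zero)
leann build my-python-code \
  --docs $(git ls-files "*.py" | while read -r f; do [ -s "$f" ] && printf '%s\n' "$f"; done) \
  --embedding-mode openai --embedding-model text-embedding-3-small
```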