Add chunk size options to leann build, fix OpenAI batch size, and update docs

2025-08-14 13:14:14 -07:00
parent fafdf8fcbe
commit 42c8370709
3 changed files with 70 additions and 3 deletions
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -123,6 +123,30 @@ Examples:
            type=str,
            help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
        )
+        build_parser.add_argument(
+            "--doc-chunk-size",
+            type=int,
+            default=256,
+            help="Document chunk size in tokens/characters (default: 256)",
+        )
+        build_parser.add_argument(
+            "--doc-chunk-overlap",
+            type=int,
+            default=128,
+            help="Document chunk overlap (default: 128)",
+        )
+        build_parser.add_argument(
+            "--code-chunk-size",
+            type=int,
+            default=512,
+            help="Code chunk size in tokens/lines (default: 512)",
+        )
+        build_parser.add_argument(
+            "--code-chunk-overlap",
+            type=int,
+            default=50,
+            help="Code chunk overlap (default: 50)",
+        )

        # Search command
        search_parser = subparsers.add_parser("search", help="Search documents")
@@ -687,6 +711,37 @@ Examples:
            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
            return

+        # Configure chunking based on CLI args before loading documents
+        # Guard against invalid configurations
+        doc_chunk_size = max(1, int(args.doc_chunk_size))
+        doc_chunk_overlap = max(0, int(args.doc_chunk_overlap))
+        if doc_chunk_overlap >= doc_chunk_size:
+            print(
+                f"⚠️  Adjusting doc chunk overlap from {doc_chunk_overlap} to {doc_chunk_size - 1} (must be < chunk size)"
+            )
+            doc_chunk_overlap = doc_chunk_size - 1
+
+        code_chunk_size = max(1, int(args.code_chunk_size))
+        code_chunk_overlap = max(0, int(args.code_chunk_overlap))
+        if code_chunk_overlap >= code_chunk_size:
+            print(
+                f"⚠️  Adjusting code chunk overlap from {code_chunk_overlap} to {code_chunk_size - 1} (must be < chunk size)"
+            )
+            code_chunk_overlap = code_chunk_size - 1
+
+        self.node_parser = SentenceSplitter(
+            chunk_size=doc_chunk_size,
+            chunk_overlap=doc_chunk_overlap,
+            separator=" ",
+            paragraph_separator="\n\n",
+        )
+        self.code_parser = SentenceSplitter(
+            chunk_size=code_chunk_size,
+            chunk_overlap=code_chunk_overlap,
+            separator="\n",
+            paragraph_separator="\n\n",
+        )
+
        all_texts = self.load_documents(docs_paths, args.file_types)
        if not all_texts:
            print("No documents found")
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -263,8 +263,16 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
    print(f"len of texts: {len(texts)}")

    # OpenAI has limits on batch size and input length
-    max_batch_size = 1000  # Conservative batch size
+    max_batch_size = 800  # Conservative batch size because the token limit is 300K
    all_embeddings = []
+    # get the avg len of texts
+    avg_len = sum(len(text) for text in texts) / len(texts)
+    print(f"avg len of texts: {avg_len}")
+    # If the average text length exceeds 300 characters, shrink the batch size to 500
+    if avg_len > 300:
+        max_batch_size = 500
+
+    # Otherwise (average length <= 300 characters), the default max batch size is kept

    try:
        from tqdm import tqdm
--- a/packages/leann-mcp/README.md
+++ b/packages/leann-mcp/README.md
@@ -41,11 +41,15 @@ claude

 ### Index Entire Git Repository
 ```bash
-# Index all tracked files in your git repository, note right now we will skip submodules, but we can add it back easily if you want
+# Index all tracked files in your Git repository.
+# Note: submodules are currently skipped; we can add them back if needed.
 leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

-# Index only specific file types from git
+# Index only tracked Python files from Git.
 leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# If you encounter empty requests caused by empty files (e.g., __init__.py), exclude zero-byte files. Thanks to @ww2283 for reporting this in https://github.com/yichuan-w/LEANN/issues/48
+leann build leann-prospec-lig --docs $(find ./src -name "*.py" -not -empty) --embedding-mode openai --embedding-model text-embedding-3-small
 ```

 ### Multiple Directories and Files