Fix chunking so that chunks stay within the token limit of embedding models

This commit is contained in:
aakash
2025-10-26 18:53:53 -07:00
parent a85d0ad4a7
commit 64b92a04a7
4 changed files with 270 additions and 24 deletions

View File

@@ -180,14 +180,14 @@ class BaseRAGExample(ABC):
ast_group.add_argument(
"--ast-chunk-size",
type=int,
-default=512,
-help="Maximum characters per AST chunk (default: 512)",
+default=300,
+help="Maximum CHARACTERS per AST chunk (default: 300). Final chunks may be larger due to overlap. For 512 token models: recommended 300 chars",
)
ast_group.add_argument(
"--ast-chunk-overlap",
type=int,
default=64,
-help="Overlap between AST chunks (default: 64)",
+help="Overlap between AST chunks in CHARACTERS (default: 64). Added to chunk size, not included in it",
)
ast_group.add_argument(
"--code-file-extensions",