Feature/optimize ollama batching (#152)

* feat: add metadata output to search results

- Add --show-metadata flag to display file paths in search results
- Preserve document metadata (file_path, file_name, timestamps) during chunking
- Update MCP tool schema to support show_metadata parameter
- Enhance CLI search output to display metadata when requested
- Fix pre-existing bug: args.backend -> args.backend_name

Resolves yichuan-w/LEANN#144

* fix: resolve ZMQ linking issues in Python extension

- Use pkg_check_modules IMPORTED_TARGET to create PkgConfig::ZMQ
- Set PKG_CONFIG_PATH to prioritize ARM64 Homebrew on Apple Silicon
- Override macOS -undefined dynamic_lookup to force proper symbol resolution
- Use PUBLIC linkage for ZMQ in faiss library for transitive linking
- Mark cppzmq includes as SYSTEM to suppress warnings

Fixes editable install ZMQ symbol errors while maintaining compatibility
across Linux, macOS Intel, and macOS ARM64 platforms.

* style: apply ruff formatting

* chore: update faiss submodule to use ww2283 fork

Use ww2283/faiss fork with fix/zmq-linking branch to resolve CI checkout
failures. The ZMQ linking fixes are not yet merged upstream.

* feat: implement true batch processing for Ollama embeddings

Migrate from deprecated /api/embeddings to modern /api/embed endpoint
which supports batch inputs. This reduces HTTP overhead by sending
32 texts per request instead of making individual API calls.

Changes:
- Update endpoint from /api/embeddings to /api/embed
- Change parameter from 'prompt' (single) to 'input' (array)
- Update response parsing for batch embeddings array
- Increase timeout to 60s for batch processing
- Improve error handling for batch requests

Performance:
- Reduces API calls by 32x (batch size)
- Eliminates HTTP connection overhead per text
- Note: Ollama still processes batch items sequentially internally

Related: #151

* fall back to original faiss as i merge the PR

---------

Co-authored-by: yichuan520030910320 <yichuan_wang@berkeley.edu>
This commit is contained in:
ww26
2025-10-30 19:39:14 -04:00
committed by GitHub
parent dbb5f4d352
commit a85d0ad4a7
4 changed files with 127 additions and 65 deletions

View File

@@ -255,6 +255,11 @@ Examples:
action="store_true",
help="Non-interactive mode: automatically select index without prompting",
)
search_parser.add_argument(
"--show-metadata",
action="store_true",
help="Display file paths and metadata in search results",
)
# Ask command
ask_parser = subparsers.add_parser("ask", help="Ask questions")
@@ -1263,7 +1268,7 @@ Examples:
from .chunking_utils import create_text_chunks
# Use enhanced chunking with AST support
all_texts = create_text_chunks(
chunk_texts = create_text_chunks(
documents,
chunk_size=self.node_parser.chunk_size,
chunk_overlap=self.node_parser.chunk_overlap,
@@ -1274,6 +1279,14 @@ Examples:
ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True),
)
# Note: AST chunking currently returns plain text chunks without metadata
# We preserve basic file info by associating chunks with their source documents
# For better metadata preservation, documents list order should be maintained
for chunk_text in chunk_texts:
# TODO: Enhance create_text_chunks to return metadata alongside text
# For now, we store chunks with empty metadata
all_texts.append({"text": chunk_text, "metadata": {}})
except ImportError as e:
print(
f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
@@ -1285,17 +1298,27 @@ Examples:
for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
# Check if this is a code file based on source path
source_path = doc.metadata.get("source", "")
file_path = doc.metadata.get("file_path", "")
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
# Extract metadata to preserve with chunks
chunk_metadata = {
"file_path": file_path or source_path,
"file_name": doc.metadata.get("file_name", ""),
}
# Add optional metadata if available
if "creation_date" in doc.metadata:
chunk_metadata["creation_date"] = doc.metadata["creation_date"]
if "last_modified_date" in doc.metadata:
chunk_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
# Use appropriate parser based on file type
parser = self.code_parser if is_code_file else self.node_parser
nodes = parser.get_nodes_from_documents([doc])
for node in nodes:
text_with_source = (
"Chunk source:" + source_path + "\n" + node.get_content().replace("\n", " ")
)
all_texts.append(text_with_source)
all_texts.append({"text": node.get_content(), "metadata": chunk_metadata})
print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
return all_texts
@@ -1370,7 +1393,7 @@ Examples:
index_dir.mkdir(parents=True, exist_ok=True)
print(f"Building index '{index_name}' with {args.backend} backend...")
print(f"Building index '{index_name}' with {args.backend_name} backend...")
embedding_options: dict[str, Any] = {}
if args.embedding_mode == "ollama":
@@ -1382,7 +1405,7 @@ Examples:
embedding_options["api_key"] = resolved_embedding_key
builder = LeannBuilder(
backend_name=args.backend,
backend_name=args.backend_name,
embedding_model=args.embedding_model,
embedding_mode=args.embedding_mode,
embedding_options=embedding_options or None,
@@ -1393,10 +1416,8 @@ Examples:
num_threads=args.num_threads,
)
for chunk_text_with_source in all_texts:
chunk_source = chunk_text_with_source.split("\n")[0].split(":")[1]
chunk_text = chunk_text_with_source.split("\n")[1]
builder.add_text(chunk_text, {"source": chunk_source})
for chunk in all_texts:
builder.add_text(chunk["text"], metadata=chunk["metadata"])
builder.build_index(index_path)
print(f"Index built at {index_path}")
@@ -1517,6 +1538,23 @@ Examples:
print(f"Search results for '{query}' (top {len(results)}):")
for i, result in enumerate(results, 1):
print(f"{i}. Score: {result.score:.3f}")
# Display metadata if flag is set
if args.show_metadata and result.metadata:
file_path = result.metadata.get("file_path", "")
if file_path:
print(f" 📄 File: {file_path}")
file_name = result.metadata.get("file_name", "")
if file_name and file_name != file_path:
print(f" 📝 Name: {file_name}")
# Show timestamps if available
if "creation_date" in result.metadata:
print(f" 🕐 Created: {result.metadata['creation_date']}")
if "last_modified_date" in result.metadata:
print(f" 🕑 Modified: {result.metadata['last_modified_date']}")
print(f" {result.text[:200]}...")
print(f" Source: {result.metadata.get('source', '')}")
print()