[EXP] Update the benchmark code (#71)
* chore(hnsw): reorder imports to satisfy ruff I001 * chore: sync changes; fix Ruff import order; update examples, benchmarks, and dependencies - Fix import order in packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py (Ruff I001) - Update benchmarks/run_evaluation.py - Update apps/base_rag_example.py and leann-core API usage - Add benchmarks/data/README.md - Update uv.lock - Misc cleanup - Note: added paru-bin as an embedded git repo; consider making it a submodule (git rm --cached paru-bin) if unintended * chore: remove unintended embedded repo paru-bin and ignore it Fix CI: avoid missing .gitmodules entry by removing gitlink and adding to .gitignore. * ci: retrigger after removing unintended gitlink (paru-bin) * feat(benchmarks): add --batch-size option and plumb through to HNSW search (default 0) * feat(hnsw): add batch_size to LeannSearcher.search and LeannChat.ask; forward only for HNSW backend * chore(logging): surface recompute and batching params; enable INFO logging in benchmark * feat(embeddings): add optional manual tokenization path (HF tokenizer+model) with mean pooling; default remains SentenceTransformer.encode * fix micro bench and fix pre commit * update readme --------- Co-authored-by: yichuan-w <yichuan-w@users.noreply.github.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
@@ -236,6 +237,7 @@ class HNSWSearcher(BaseSearcher):
|
||||
distances = np.empty((batch_size_query, top_k), dtype=np.float32)
|
||||
labels = np.empty((batch_size_query, top_k), dtype=np.int64)
|
||||
|
||||
search_time = time.time()
|
||||
self._index.search(
|
||||
query.shape[0],
|
||||
faiss.swig_ptr(query),
|
||||
@@ -244,7 +246,8 @@ class HNSWSearcher(BaseSearcher):
|
||||
faiss.swig_ptr(labels),
|
||||
params,
|
||||
)
|
||||
|
||||
search_time = time.time() - search_time
|
||||
logger.info(f" Search time in HNSWSearcher.search() backend: {search_time} seconds")
|
||||
string_labels = [[str(int_label) for int_label in batch_labels] for batch_labels in labels]
|
||||
|
||||
return {"labels": string_labels, "distances": distances}
|
||||
|
||||
@@ -578,6 +578,8 @@ class LeannSearcher:
|
||||
self.passage_manager = PassageManager(
|
||||
self.meta_data.get("passage_sources", []), metadata_file_path=self.meta_path_str
|
||||
)
|
||||
# Preserve backend name for conditional parameter forwarding
|
||||
self.backend_name = backend_name
|
||||
backend_factory = BACKEND_REGISTRY.get(backend_name)
|
||||
if backend_factory is None:
|
||||
raise ValueError(f"Backend '{backend_name}' not found.")
|
||||
@@ -597,6 +599,7 @@ class LeannSearcher:
|
||||
recompute_embeddings: bool = True,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
expected_zmq_port: int = 5557,
|
||||
batch_size: int = 0,
|
||||
**kwargs,
|
||||
) -> list[SearchResult]:
|
||||
logger.info("🔍 LeannSearcher.search() called:")
|
||||
@@ -636,23 +639,33 @@ class LeannSearcher:
|
||||
use_server_if_available=recompute_embeddings,
|
||||
zmq_port=zmq_port,
|
||||
)
|
||||
# logger.info(f" Generated embedding shape: {query_embedding.shape}")
|
||||
# time.time() - start_time
|
||||
# logger.info(f" Embedding time: {embedding_time} seconds")
|
||||
logger.info(f" Generated embedding shape: {query_embedding.shape}")
|
||||
embedding_time = time.time() - start_time
|
||||
logger.info(f" Embedding time: {embedding_time} seconds")
|
||||
|
||||
start_time = time.time()
|
||||
backend_search_kwargs: dict[str, Any] = {
|
||||
"complexity": complexity,
|
||||
"beam_width": beam_width,
|
||||
"prune_ratio": prune_ratio,
|
||||
"recompute_embeddings": recompute_embeddings,
|
||||
"pruning_strategy": pruning_strategy,
|
||||
"zmq_port": zmq_port,
|
||||
}
|
||||
# Only HNSW supports batching; forward conditionally
|
||||
if self.backend_name == "hnsw":
|
||||
backend_search_kwargs["batch_size"] = batch_size
|
||||
|
||||
# Merge any extra kwargs last
|
||||
backend_search_kwargs.update(kwargs)
|
||||
|
||||
results = self.backend_impl.search(
|
||||
query_embedding,
|
||||
top_k,
|
||||
complexity=complexity,
|
||||
beam_width=beam_width,
|
||||
prune_ratio=prune_ratio,
|
||||
recompute_embeddings=recompute_embeddings,
|
||||
pruning_strategy=pruning_strategy,
|
||||
zmq_port=zmq_port,
|
||||
**kwargs,
|
||||
**backend_search_kwargs,
|
||||
)
|
||||
# logger.info(f" Search time: {search_time} seconds")
|
||||
search_time = time.time() - start_time
|
||||
logger.info(f" Search time in search() LEANN searcher: {search_time} seconds")
|
||||
logger.info(f" Backend returned: labels={len(results.get('labels', [[]])[0])} results")
|
||||
|
||||
enriched_results = []
|
||||
@@ -731,9 +744,15 @@ class LeannChat:
|
||||
index_path: str,
|
||||
llm_config: Optional[dict[str, Any]] = None,
|
||||
enable_warmup: bool = False,
|
||||
searcher: Optional[LeannSearcher] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.searcher = LeannSearcher(index_path, enable_warmup=enable_warmup, **kwargs)
|
||||
if searcher is None:
|
||||
self.searcher = LeannSearcher(index_path, enable_warmup=enable_warmup, **kwargs)
|
||||
self._owns_searcher = True
|
||||
else:
|
||||
self.searcher = searcher
|
||||
self._owns_searcher = False
|
||||
self.llm = get_llm(llm_config)
|
||||
|
||||
def ask(
|
||||
@@ -747,6 +766,7 @@ class LeannChat:
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
llm_kwargs: Optional[dict[str, Any]] = None,
|
||||
expected_zmq_port: int = 5557,
|
||||
batch_size: int = 0,
|
||||
**search_kwargs,
|
||||
):
|
||||
if llm_kwargs is None:
|
||||
@@ -761,10 +781,11 @@ class LeannChat:
|
||||
recompute_embeddings=recompute_embeddings,
|
||||
pruning_strategy=pruning_strategy,
|
||||
expected_zmq_port=expected_zmq_port,
|
||||
batch_size=batch_size,
|
||||
**search_kwargs,
|
||||
)
|
||||
search_time = time.time() - search_time
|
||||
# logger.info(f" Search time: {search_time} seconds")
|
||||
logger.info(f" Search time: {search_time} seconds")
|
||||
context = "\n\n".join([r.text for r in results])
|
||||
prompt = (
|
||||
"Here is some retrieved context that might help answer your question:\n\n"
|
||||
@@ -800,7 +821,9 @@ class LeannChat:
|
||||
This method should be called after you're done using the chat interface,
|
||||
especially in test environments or batch processing scenarios.
|
||||
"""
|
||||
if hasattr(self.searcher, "cleanup"):
|
||||
# Only stop the embedding server if this LeannChat instance created the searcher.
|
||||
# When a shared searcher is passed in, avoid shutting down the server to enable reuse.
|
||||
if getattr(self, "_owns_searcher", False) and hasattr(self.searcher, "cleanup"):
|
||||
self.searcher.cleanup()
|
||||
|
||||
# Enable automatic cleanup patterns
|
||||
|
||||
@@ -6,6 +6,7 @@ Preserves all optimization parameters to ensure performance
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -28,6 +29,8 @@ def compute_embeddings(
|
||||
is_build: bool = False,
|
||||
batch_size: int = 32,
|
||||
adaptive_optimization: bool = True,
|
||||
manual_tokenize: bool = False,
|
||||
max_length: int = 512,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Unified embedding computation entry point
|
||||
@@ -50,6 +53,8 @@ def compute_embeddings(
|
||||
is_build=is_build,
|
||||
batch_size=batch_size,
|
||||
adaptive_optimization=adaptive_optimization,
|
||||
manual_tokenize=manual_tokenize,
|
||||
max_length=max_length,
|
||||
)
|
||||
elif mode == "openai":
|
||||
return compute_embeddings_openai(texts, model_name)
|
||||
@@ -71,6 +76,8 @@ def compute_embeddings_sentence_transformers(
|
||||
batch_size: int = 32,
|
||||
is_build: bool = False,
|
||||
adaptive_optimization: bool = True,
|
||||
manual_tokenize: bool = False,
|
||||
max_length: int = 512,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Compute embeddings using SentenceTransformer with model caching and adaptive optimization
|
||||
@@ -214,20 +221,130 @@ def compute_embeddings_sentence_transformers(
|
||||
logger.info(f"Model cached: {cache_key}")
|
||||
|
||||
# Compute embeddings with optimized inference mode
|
||||
logger.info(f"Starting embedding computation... (batch_size: {batch_size})")
|
||||
logger.info(
|
||||
f"Starting embedding computation... (batch_size: {batch_size}, manual_tokenize={manual_tokenize})"
|
||||
)
|
||||
|
||||
# Use torch.inference_mode for optimal performance
|
||||
with torch.inference_mode():
|
||||
embeddings = model.encode(
|
||||
texts,
|
||||
batch_size=batch_size,
|
||||
show_progress_bar=is_build, # Don't show progress bar in server environment
|
||||
convert_to_numpy=True,
|
||||
normalize_embeddings=False,
|
||||
device=device,
|
||||
)
|
||||
start_time = time.time()
|
||||
if not manual_tokenize:
|
||||
# Use SentenceTransformer's optimized encode path (default)
|
||||
with torch.inference_mode():
|
||||
embeddings = model.encode(
|
||||
texts,
|
||||
batch_size=batch_size,
|
||||
show_progress_bar=is_build, # Don't show progress bar in server environment
|
||||
convert_to_numpy=True,
|
||||
normalize_embeddings=False,
|
||||
device=device,
|
||||
)
|
||||
# Synchronize if CUDA to measure accurate wall time
|
||||
try:
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
# Manual tokenization + forward pass using HF AutoTokenizer/AutoModel
|
||||
try:
|
||||
from transformers import AutoModel, AutoTokenizer # type: ignore
|
||||
except Exception as e:
|
||||
raise ImportError(f"transformers is required for manual_tokenize=True: {e}")
|
||||
|
||||
# Cache tokenizer and model
|
||||
tok_cache_key = f"hf_tokenizer_{model_name}"
|
||||
mdl_cache_key = f"hf_model_{model_name}_{device}_{use_fp16}"
|
||||
if tok_cache_key in _model_cache and mdl_cache_key in _model_cache:
|
||||
hf_tokenizer = _model_cache[tok_cache_key]
|
||||
hf_model = _model_cache[mdl_cache_key]
|
||||
logger.info("Using cached HF tokenizer/model for manual path")
|
||||
else:
|
||||
logger.info("Loading HF tokenizer/model for manual tokenization path")
|
||||
hf_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
|
||||
torch_dtype = torch.float16 if (use_fp16 and device == "cuda") else torch.float32
|
||||
hf_model = AutoModel.from_pretrained(model_name, torch_dtype=torch_dtype)
|
||||
hf_model.to(device)
|
||||
hf_model.eval()
|
||||
# Optional compile on supported devices
|
||||
if device in ["cuda", "mps"]:
|
||||
try:
|
||||
hf_model = torch.compile(hf_model, mode="reduce-overhead", dynamic=True) # type: ignore
|
||||
except Exception:
|
||||
pass
|
||||
_model_cache[tok_cache_key] = hf_tokenizer
|
||||
_model_cache[mdl_cache_key] = hf_model
|
||||
|
||||
all_embeddings: list[np.ndarray] = []
|
||||
# Progress bar when building or for large inputs
|
||||
show_progress = is_build or len(texts) > 32
|
||||
try:
|
||||
if show_progress:
|
||||
from tqdm import tqdm # type: ignore
|
||||
|
||||
batch_iter = tqdm(
|
||||
range(0, len(texts), batch_size),
|
||||
desc="Embedding (manual)",
|
||||
unit="batch",
|
||||
)
|
||||
else:
|
||||
batch_iter = range(0, len(texts), batch_size)
|
||||
except Exception:
|
||||
batch_iter = range(0, len(texts), batch_size)
|
||||
|
||||
start_time_manual = time.time()
|
||||
with torch.inference_mode():
|
||||
for start_index in batch_iter:
|
||||
end_index = min(start_index + batch_size, len(texts))
|
||||
batch_texts = texts[start_index:end_index]
|
||||
tokenize_start_time = time.time()
|
||||
inputs = hf_tokenizer(
|
||||
batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=max_length,
|
||||
return_tensors="pt",
|
||||
)
|
||||
tokenize_end_time = time.time()
|
||||
logger.info(
|
||||
f"Tokenize time taken: {tokenize_end_time - tokenize_start_time} seconds"
|
||||
)
|
||||
# Print shapes of all input tensors for debugging
|
||||
for k, v in inputs.items():
|
||||
print(f"inputs[{k!r}] shape: {getattr(v, 'shape', type(v))}")
|
||||
to_device_start_time = time.time()
|
||||
inputs = {k: v.to(device) for k, v in inputs.items()}
|
||||
to_device_end_time = time.time()
|
||||
logger.info(
|
||||
f"To device time taken: {to_device_end_time - to_device_start_time} seconds"
|
||||
)
|
||||
forward_start_time = time.time()
|
||||
outputs = hf_model(**inputs)
|
||||
forward_end_time = time.time()
|
||||
logger.info(f"Forward time taken: {forward_end_time - forward_start_time} seconds")
|
||||
last_hidden_state = outputs.last_hidden_state # (B, L, H)
|
||||
attention_mask = inputs.get("attention_mask")
|
||||
if attention_mask is None:
|
||||
# Fallback: assume all tokens are valid
|
||||
pooled = last_hidden_state.mean(dim=1)
|
||||
else:
|
||||
mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
|
||||
masked = last_hidden_state * mask
|
||||
lengths = mask.sum(dim=1).clamp(min=1)
|
||||
pooled = masked.sum(dim=1) / lengths
|
||||
# Move to CPU float32
|
||||
batch_embeddings = pooled.detach().to("cpu").float().numpy()
|
||||
all_embeddings.append(batch_embeddings)
|
||||
|
||||
embeddings = np.vstack(all_embeddings).astype(np.float32, copy=False)
|
||||
try:
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
except Exception:
|
||||
pass
|
||||
end_time = time.time()
|
||||
logger.info(f"Manual tokenize time taken: {end_time - start_time_manual} seconds")
|
||||
end_time = time.time()
|
||||
logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
|
||||
logger.info(f"Time taken: {end_time - start_time} seconds")
|
||||
|
||||
# Validate results
|
||||
if np.isnan(embeddings).any() or np.isinf(embeddings).any():
|
||||
|
||||
Reference in New Issue
Block a user