style: format
This commit is contained in:
@@ -482,7 +482,7 @@ class FinanceBenchEvaluator:
|
|||||||
self, generated_answer: str, ground_truth: str, question: str
|
self, generated_answer: str, ground_truth: str, question: str
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Check if generated answer matches ground truth using LLM as judge"""
|
"""Check if generated answer matches ground truth using LLM as judge"""
|
||||||
judge_prompt = f"""You are an expert judge evaluating financial question answering.
|
judge_prompt = f"""You are an expert judge evaluating financial question answering.
|
||||||
|
|
||||||
Question: {question}
|
Question: {question}
|
||||||
|
|
||||||
|
|||||||
2
benchmarks/laion/.gitignore
vendored
2
benchmarks/laion/.gitignore
vendored
@@ -1 +1 @@
|
|||||||
data/
|
data/
|
||||||
|
|||||||
@@ -166,4 +166,4 @@ benchmarks/laion/
|
|||||||
- For real LAION data, implement actual download logic in `setup_laion.py`
|
- For real LAION data, implement actual download logic in `setup_laion.py`
|
||||||
- CLIP embeddings are randomly generated - replace with real CLIP model for production
|
- CLIP embeddings are randomly generated - replace with real CLIP model for production
|
||||||
- Adjust `num_samples` and `num_queries` based on available resources
|
- Adjust `num_samples` and `num_queries` based on available resources
|
||||||
- Consider using `--num-samples` during evaluation for faster testing
|
- Consider using `--num-samples` during evaluation for faster testing
|
||||||
|
|||||||
@@ -323,7 +323,10 @@ class LAIONEvaluator:
|
|||||||
f" Storage saving by compact: {timing_metrics.get('storage_saving_percent', 0):.1f}%"
|
f" Storage saving by compact: {timing_metrics.get('storage_saving_percent', 0):.1f}%"
|
||||||
)
|
)
|
||||||
# Show excluded components for reference if available
|
# Show excluded components for reference if available
|
||||||
if any(k in non_compact for k in ("passages_text_mb", "passages_index_mb", "metadata_mb")):
|
if any(
|
||||||
|
k in non_compact
|
||||||
|
for k in ("passages_text_mb", "passages_index_mb", "metadata_mb")
|
||||||
|
):
|
||||||
print(" (passages excluded in totals, shown for reference):")
|
print(" (passages excluded in totals, shown for reference):")
|
||||||
print(
|
print(
|
||||||
f" - Passages text: {non_compact.get('passages_text_mb', 0):.1f} MB, "
|
f" - Passages text: {non_compact.get('passages_text_mb', 0):.1f} MB, "
|
||||||
@@ -333,7 +336,9 @@ class LAIONEvaluator:
|
|||||||
else:
|
else:
|
||||||
# Fallback to legacy totals if running with older metrics
|
# Fallback to legacy totals if running with older metrics
|
||||||
print("\n📏 Index Comparison Analysis:")
|
print("\n📏 Index Comparison Analysis:")
|
||||||
print(f" Compact index (current): {current.get('total_with_embeddings', 0):.1f} MB")
|
print(
|
||||||
|
f" Compact index (current): {current.get('total_with_embeddings', 0):.1f} MB"
|
||||||
|
)
|
||||||
print(
|
print(
|
||||||
f" Non-compact index (with embeddings): {non_compact.get('total_with_embeddings', 0):.1f} MB"
|
f" Non-compact index (with embeddings): {non_compact.get('total_with_embeddings', 0):.1f} MB"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -118,16 +118,12 @@ class HNSWBuilder(LeannBackendBuilderInterface):
|
|||||||
# index_file_old = index_file.with_suffix(".old")
|
# index_file_old = index_file.with_suffix(".old")
|
||||||
# shutil.move(str(index_file), str(index_file_old))
|
# shutil.move(str(index_file), str(index_file_old))
|
||||||
shutil.move(str(csr_temp_file), str(index_file))
|
shutil.move(str(csr_temp_file), str(index_file))
|
||||||
logger.info(
|
logger.info(f"INFO: Replaced original index with {mode_str} version at '{index_file}'")
|
||||||
f"INFO: Replaced original index with {mode_str} version at '{index_file}'"
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# Clean up and fail fast
|
# Clean up and fail fast
|
||||||
if csr_temp_file.exists():
|
if csr_temp_file.exists():
|
||||||
os.remove(csr_temp_file)
|
os.remove(csr_temp_file)
|
||||||
raise RuntimeError(
|
raise RuntimeError("CSR conversion failed - cannot proceed with compact format")
|
||||||
"CSR conversion failed - cannot proceed with compact format"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class HNSWSearcher(BaseSearcher):
|
class HNSWSearcher(BaseSearcher):
|
||||||
@@ -216,9 +212,7 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
)
|
)
|
||||||
if recompute_embeddings:
|
if recompute_embeddings:
|
||||||
if zmq_port is None:
|
if zmq_port is None:
|
||||||
raise ValueError(
|
raise ValueError("zmq_port must be provided if recompute_embeddings is True")
|
||||||
"zmq_port must be provided if recompute_embeddings is True"
|
|
||||||
)
|
|
||||||
|
|
||||||
if query.dtype != np.float32:
|
if query.dtype != np.float32:
|
||||||
query = query.astype(np.float32)
|
query = query.astype(np.float32)
|
||||||
@@ -227,9 +221,7 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
|
|
||||||
params = faiss.SearchParametersHNSW()
|
params = faiss.SearchParametersHNSW()
|
||||||
if zmq_port is not None:
|
if zmq_port is not None:
|
||||||
params.zmq_port = (
|
params.zmq_port = zmq_port # C++ code won't use this if recompute_embeddings is False
|
||||||
zmq_port # C++ code won't use this if recompute_embeddings is False
|
|
||||||
)
|
|
||||||
params.efSearch = complexity
|
params.efSearch = complexity
|
||||||
params.beam_size = beam_width
|
params.beam_size = beam_width
|
||||||
|
|
||||||
@@ -237,8 +229,7 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
# This prevents early termination when all scores are in a narrow range
|
# This prevents early termination when all scores are in a narrow range
|
||||||
embedding_model = self.meta.get("embedding_model", "").lower()
|
embedding_model = self.meta.get("embedding_model", "").lower()
|
||||||
if self.distance_metric == "cosine" and any(
|
if self.distance_metric == "cosine" and any(
|
||||||
openai_model in embedding_model
|
openai_model in embedding_model for openai_model in ["text-embedding", "openai"]
|
||||||
for openai_model in ["text-embedding", "openai"]
|
|
||||||
):
|
):
|
||||||
params.check_relative_distance = False
|
params.check_relative_distance = False
|
||||||
else:
|
else:
|
||||||
@@ -253,9 +244,7 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
params.send_neigh_times_ratio = 0.0
|
params.send_neigh_times_ratio = 0.0
|
||||||
elif pruning_strategy == "proportional":
|
elif pruning_strategy == "proportional":
|
||||||
params.local_prune = False
|
params.local_prune = False
|
||||||
params.send_neigh_times_ratio = (
|
params.send_neigh_times_ratio = 1.0 # Any value > 1e-6 triggers proportional mode
|
||||||
1.0 # Any value > 1e-6 triggers proportional mode
|
|
||||||
)
|
|
||||||
else: # "global"
|
else: # "global"
|
||||||
params.local_prune = False
|
params.local_prune = False
|
||||||
params.send_neigh_times_ratio = 0.0
|
params.send_neigh_times_ratio = 0.0
|
||||||
@@ -277,9 +266,7 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
params,
|
params,
|
||||||
)
|
)
|
||||||
search_time = time.time() - search_time
|
search_time = time.time() - search_time
|
||||||
logger.info(
|
logger.info(f" Search time in HNSWSearcher.search() backend: {search_time} seconds")
|
||||||
f" Search time in HNSWSearcher.search() backend: {search_time} seconds"
|
|
||||||
)
|
|
||||||
if self._id_map:
|
if self._id_map:
|
||||||
|
|
||||||
def map_label(x: int) -> str:
|
def map_label(x: int) -> str:
|
||||||
@@ -287,13 +274,10 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
return self._id_map[x]
|
return self._id_map[x]
|
||||||
return str(x)
|
return str(x)
|
||||||
|
|
||||||
string_labels = [
|
string_labels = [[map_label(int(l)) for l in batch_labels] for batch_labels in labels]
|
||||||
[map_label(int(l)) for l in batch_labels] for batch_labels in labels
|
|
||||||
]
|
|
||||||
else:
|
else:
|
||||||
string_labels = [
|
string_labels = [
|
||||||
[str(int_label) for int_label in batch_labels]
|
[str(int_label) for int_label in batch_labels] for batch_labels in labels
|
||||||
for batch_labels in labels
|
|
||||||
]
|
]
|
||||||
|
|
||||||
return {"labels": string_labels, "distances": distances}
|
return {"labels": string_labels, "distances": distances}
|
||||||
|
|||||||
@@ -447,7 +447,10 @@ class LeannBuilder:
|
|||||||
string_ids = [chunk["id"] for chunk in self.chunks]
|
string_ids = [chunk["id"] for chunk in self.chunks]
|
||||||
# Persist ID map alongside index so backends that return integer labels can remap to passage IDs
|
# Persist ID map alongside index so backends that return integer labels can remap to passage IDs
|
||||||
try:
|
try:
|
||||||
idmap_file = index_dir / f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt"
|
idmap_file = (
|
||||||
|
index_dir
|
||||||
|
/ f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt"
|
||||||
|
)
|
||||||
with open(idmap_file, "w", encoding="utf-8") as f:
|
with open(idmap_file, "w", encoding="utf-8") as f:
|
||||||
for sid in string_ids:
|
for sid in string_ids:
|
||||||
f.write(str(sid) + "\n")
|
f.write(str(sid) + "\n")
|
||||||
@@ -573,7 +576,10 @@ class LeannBuilder:
|
|||||||
string_ids = [str(id_val) for id_val in ids]
|
string_ids = [str(id_val) for id_val in ids]
|
||||||
# Persist ID map (order == embeddings order)
|
# Persist ID map (order == embeddings order)
|
||||||
try:
|
try:
|
||||||
idmap_file = index_dir / f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt"
|
idmap_file = (
|
||||||
|
index_dir
|
||||||
|
/ f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt"
|
||||||
|
)
|
||||||
with open(idmap_file, "w", encoding="utf-8") as f:
|
with open(idmap_file, "w", encoding="utf-8") as f:
|
||||||
for sid in string_ids:
|
for sid in string_ids:
|
||||||
f.write(str(sid) + "\n")
|
f.write(str(sid) + "\n")
|
||||||
|
|||||||
1
paru-bin
Submodule
1
paru-bin
Submodule
Submodule paru-bin added at 92a55429af
Reference in New Issue
Block a user