style: format

This commit is contained in:
Andy Lee
2025-08-22 13:50:57 -07:00
parent 26d961bfc5
commit ed72232bab
7 changed files with 28 additions and 32 deletions

View File

@@ -323,7 +323,10 @@ class LAIONEvaluator:
f" Storage saving by compact: {timing_metrics.get('storage_saving_percent', 0):.1f}%" f" Storage saving by compact: {timing_metrics.get('storage_saving_percent', 0):.1f}%"
) )
# Show excluded components for reference if available # Show excluded components for reference if available
if any(k in non_compact for k in ("passages_text_mb", "passages_index_mb", "metadata_mb")): if any(
k in non_compact
for k in ("passages_text_mb", "passages_index_mb", "metadata_mb")
):
print(" (passages excluded in totals, shown for reference):") print(" (passages excluded in totals, shown for reference):")
print( print(
f" - Passages text: {non_compact.get('passages_text_mb', 0):.1f} MB, " f" - Passages text: {non_compact.get('passages_text_mb', 0):.1f} MB, "
@@ -333,7 +336,9 @@ class LAIONEvaluator:
else: else:
# Fallback to legacy totals if running with older metrics # Fallback to legacy totals if running with older metrics
print("\n📏 Index Comparison Analysis:") print("\n📏 Index Comparison Analysis:")
print(f" Compact index (current): {current.get('total_with_embeddings', 0):.1f} MB") print(
f" Compact index (current): {current.get('total_with_embeddings', 0):.1f} MB"
)
print( print(
f" Non-compact index (with embeddings): {non_compact.get('total_with_embeddings', 0):.1f} MB" f" Non-compact index (with embeddings): {non_compact.get('total_with_embeddings', 0):.1f} MB"
) )

View File

@@ -118,16 +118,12 @@ class HNSWBuilder(LeannBackendBuilderInterface):
# index_file_old = index_file.with_suffix(".old") # index_file_old = index_file.with_suffix(".old")
# shutil.move(str(index_file), str(index_file_old)) # shutil.move(str(index_file), str(index_file_old))
shutil.move(str(csr_temp_file), str(index_file)) shutil.move(str(csr_temp_file), str(index_file))
logger.info( logger.info(f"INFO: Replaced original index with {mode_str} version at '{index_file}'")
f"INFO: Replaced original index with {mode_str} version at '{index_file}'"
)
else: else:
# Clean up and fail fast # Clean up and fail fast
if csr_temp_file.exists(): if csr_temp_file.exists():
os.remove(csr_temp_file) os.remove(csr_temp_file)
raise RuntimeError( raise RuntimeError("CSR conversion failed - cannot proceed with compact format")
"CSR conversion failed - cannot proceed with compact format"
)
class HNSWSearcher(BaseSearcher): class HNSWSearcher(BaseSearcher):
@@ -216,9 +212,7 @@ class HNSWSearcher(BaseSearcher):
) )
if recompute_embeddings: if recompute_embeddings:
if zmq_port is None: if zmq_port is None:
raise ValueError( raise ValueError("zmq_port must be provided if recompute_embeddings is True")
"zmq_port must be provided if recompute_embeddings is True"
)
if query.dtype != np.float32: if query.dtype != np.float32:
query = query.astype(np.float32) query = query.astype(np.float32)
@@ -227,9 +221,7 @@ class HNSWSearcher(BaseSearcher):
params = faiss.SearchParametersHNSW() params = faiss.SearchParametersHNSW()
if zmq_port is not None: if zmq_port is not None:
params.zmq_port = ( params.zmq_port = zmq_port # C++ code won't use this if recompute_embeddings is False
zmq_port # C++ code won't use this if recompute_embeddings is False
)
params.efSearch = complexity params.efSearch = complexity
params.beam_size = beam_width params.beam_size = beam_width
@@ -237,8 +229,7 @@ class HNSWSearcher(BaseSearcher):
# This prevents early termination when all scores are in a narrow range # This prevents early termination when all scores are in a narrow range
embedding_model = self.meta.get("embedding_model", "").lower() embedding_model = self.meta.get("embedding_model", "").lower()
if self.distance_metric == "cosine" and any( if self.distance_metric == "cosine" and any(
openai_model in embedding_model openai_model in embedding_model for openai_model in ["text-embedding", "openai"]
for openai_model in ["text-embedding", "openai"]
): ):
params.check_relative_distance = False params.check_relative_distance = False
else: else:
@@ -253,9 +244,7 @@ class HNSWSearcher(BaseSearcher):
params.send_neigh_times_ratio = 0.0 params.send_neigh_times_ratio = 0.0
elif pruning_strategy == "proportional": elif pruning_strategy == "proportional":
params.local_prune = False params.local_prune = False
params.send_neigh_times_ratio = ( params.send_neigh_times_ratio = 1.0 # Any value > 1e-6 triggers proportional mode
1.0 # Any value > 1e-6 triggers proportional mode
)
else: # "global" else: # "global"
params.local_prune = False params.local_prune = False
params.send_neigh_times_ratio = 0.0 params.send_neigh_times_ratio = 0.0
@@ -277,9 +266,7 @@ class HNSWSearcher(BaseSearcher):
params, params,
) )
search_time = time.time() - search_time search_time = time.time() - search_time
logger.info( logger.info(f" Search time in HNSWSearcher.search() backend: {search_time} seconds")
f" Search time in HNSWSearcher.search() backend: {search_time} seconds"
)
if self._id_map: if self._id_map:
def map_label(x: int) -> str: def map_label(x: int) -> str:
@@ -287,13 +274,10 @@ class HNSWSearcher(BaseSearcher):
return self._id_map[x] return self._id_map[x]
return str(x) return str(x)
string_labels = [ string_labels = [[map_label(int(l)) for l in batch_labels] for batch_labels in labels]
[map_label(int(l)) for l in batch_labels] for batch_labels in labels
]
else: else:
string_labels = [ string_labels = [
[str(int_label) for int_label in batch_labels] [str(int_label) for int_label in batch_labels] for batch_labels in labels
for batch_labels in labels
] ]
return {"labels": string_labels, "distances": distances} return {"labels": string_labels, "distances": distances}

View File

@@ -447,7 +447,10 @@ class LeannBuilder:
string_ids = [chunk["id"] for chunk in self.chunks] string_ids = [chunk["id"] for chunk in self.chunks]
# Persist ID map alongside index so backends that return integer labels can remap to passage IDs # Persist ID map alongside index so backends that return integer labels can remap to passage IDs
try: try:
idmap_file = index_dir / f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt" idmap_file = (
index_dir
/ f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt"
)
with open(idmap_file, "w", encoding="utf-8") as f: with open(idmap_file, "w", encoding="utf-8") as f:
for sid in string_ids: for sid in string_ids:
f.write(str(sid) + "\n") f.write(str(sid) + "\n")
@@ -573,7 +576,10 @@ class LeannBuilder:
string_ids = [str(id_val) for id_val in ids] string_ids = [str(id_val) for id_val in ids]
# Persist ID map (order == embeddings order) # Persist ID map (order == embeddings order)
try: try:
idmap_file = index_dir / f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt" idmap_file = (
index_dir
/ f"{index_name[: -len('.leann')] if index_name.endswith('.leann') else index_name}.ids.txt"
)
with open(idmap_file, "w", encoding="utf-8") as f: with open(idmap_file, "w", encoding="utf-8") as f:
for sid in string_ids: for sid in string_ids:
f.write(str(sid) + "\n") f.write(str(sid) + "\n")

1
paru-bin Submodule

Submodule paru-bin added at 92a55429af