reader: non-destructive portability (relative hints + fallback); fix comments; sky: refine yaml

This commit is contained in:
Andy Lee
2025-08-14 01:05:01 -07:00
parent 3f81861cba
commit 0361725323
4 changed files with 213 additions and 14 deletions

View File

@@ -82,17 +82,35 @@ def create_hnsw_embedding_server(
with open(passages_file) as f:
meta = json.load(f)
# Convert relative paths to absolute paths based on metadata file location
metadata_dir = Path(passages_file).parent.parent # Go up one level from the metadata file
# Resolve passage files for cross-machine portability
metadata_dir = Path(passages_file).parent # Same directory as meta.json
passage_sources = []
for source in meta["passage_sources"]:
source_copy = source.copy()
# Convert relative paths to absolute paths
if not Path(source_copy["path"]).is_absolute():
source_copy["path"] = str(metadata_dir / source_copy["path"])
if not Path(source_copy["index_path"]).is_absolute():
source_copy["index_path"] = str(metadata_dir / source_copy["index_path"])
passage_sources.append(source_copy)
src = dict(source)
# Absolute candidates from meta
cand_path = Path(src.get("path", ""))
cand_idx = Path(src.get("index_path", ""))
# Relative hints if provided
rel_path = src.get("path_relative")
rel_idx = src.get("index_path_relative")
# Defaults (siblings of meta)
default_path = metadata_dir / "documents.leann.passages.jsonl"
default_idx = metadata_dir / "documents.leann.passages.idx"
# Normalize path
if not cand_path.exists():
if rel_path and (metadata_dir / rel_path).exists():
src["path"] = str(metadata_dir / rel_path)
elif default_path.exists():
src["path"] = str(default_path)
# Normalize index_path
if not cand_idx.exists():
if rel_idx and (metadata_dir / rel_idx).exists():
src["index_path"] = str(metadata_dir / rel_idx)
elif default_idx.exists():
src["index_path"] = str(default_idx)
passage_sources.append(src)
passages = PassageManager(passage_sources)
logger.info(

View File

@@ -328,6 +328,9 @@ class LeannBuilder:
"type": "jsonl",
"path": str(passages_file),
"index_path": str(offset_file),
# Relative hints for cross-machine portability (non-breaking addition)
"path_relative": f"{index_name}.passages.jsonl",
"index_path_relative": f"{index_name}.passages.idx",
}
],
}
@@ -444,6 +447,9 @@ class LeannBuilder:
"type": "jsonl",
"path": str(passages_file),
"index_path": str(offset_file),
# Relative hints for cross-machine portability (non-breaking addition)
"path_relative": f"{index_name}.passages.jsonl",
"index_path_relative": f"{index_name}.passages.idx",
}
],
"built_from_precomputed_embeddings": True,
@@ -485,6 +491,42 @@ class LeannSearcher:
self.embedding_model = self.meta_data["embedding_model"]
# Support both old and new format
self.embedding_mode = self.meta_data.get("embedding_mode", "sentence-transformers")
# Best-effort portability: if meta contains absolute paths from another machine,
# and those paths do not exist locally, try relative hints or fallback sibling filenames.
try:
idx_path_obj = Path(self.meta_path_str).with_suffix("").with_suffix("")
index_dir = idx_path_obj.parent
index_name = idx_path_obj.name
default_passages = index_dir / f"{index_name}.passages.jsonl"
default_offsets = index_dir / f"{index_name}.passages.idx"
sources = self.meta_data.get("passage_sources", [])
normalized_sources: list[dict[str, Any]] = []
for src in sources:
new_src = dict(src)
raw_path = Path(new_src.get("path", ""))
raw_idx = Path(new_src.get("index_path", ""))
rel_path = new_src.get("path_relative")
rel_idx = new_src.get("index_path_relative")
# Normalize path
if not raw_path.exists():
cand = index_dir / rel_path if rel_path else default_passages
if cand.exists():
new_src["path"] = str(cand)
# Normalize idx
if not raw_idx.exists():
cand = index_dir / rel_idx if rel_idx else default_offsets
if cand.exists():
new_src["index_path"] = str(cand)
normalized_sources.append(new_src)
# Only override in-memory view; do not rewrite meta file (non-destructive)
self.meta_data["passage_sources"] = normalized_sources
except Exception:
pass
self.passage_manager = PassageManager(self.meta_data.get("passage_sources", []))
backend_factory = BACKEND_REGISTRY.get(backend_name)
if backend_factory is None: