Compare commits: add-gh-pat ... fix/openai (6 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 4b4b825fec |  |
|  | 34ef0db42f |  |
|  | 41812c7d22 |  |
|  | 2047a1a128 |  |
|  | 402e8f97ad |  |
|  | 9a5c197acd |  |
@@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles(
     profile_dirs: list[Path],
     index_path: str = "chrome_history_index.leann",
     max_count: int = -1,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from multiple Chrome profile data sources.
@@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles(
         profile_dirs: List of Path objects pointing to Chrome profile directories
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process per profile
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from multiple Chrome profile data sources...")

@@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles(
     print("\n[PHASE 1] Building Leann index...")

     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
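Taken at face value, the two new parameters let a caller pick the embedding backend per build instead of always getting the previously hard-coded contriever model. A minimal usage sketch, assuming the function is importable from the history-indexing script; the Chrome profile path and the OpenAI model name below are placeholder example values, not taken from this diff:

```python
from pathlib import Path

# Placeholder profile location; the real script discovers these via ChromeHistoryReader.
profile_dirs = [Path.home() / "Library/Application Support/Google/Chrome/Default"]

# Build the index with OpenAI embeddings instead of the sentence-transformers default.
index_path = create_leann_index_from_multiple_chrome_profiles(
    profile_dirs,
    index_path="chrome_history_index.leann",
    max_count=1000,
    embedding_model="text-embedding-3-small",
    embedding_mode="openai",
)
```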
@@ -132,6 +138,8 @@ def create_leann_index(
     profile_path: str | None = None,
     index_path: str = "chrome_history_index.leann",
     max_count: int = 1000,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from Chrome history data.
@@ -140,6 +148,8 @@ def create_leann_index(
         profile_path: Path to the Chrome profile directory (optional, uses default if None)
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from Chrome history data...")
     INDEX_DIR = Path(index_path).parent
@@ -187,9 +197,11 @@ def create_leann_index(
     print("\n[PHASE 1] Building Leann index...")

     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -273,6 +285,24 @@ async def main():
         default=True,
         help="Automatically find all Chrome profiles (default: True)",
     )
+    parser.add_argument(
+        "--embedding-model",
+        type=str,
+        default="facebook/contriever",
+        help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')",
+    )
+    parser.add_argument(
+        "--embedding-mode",
+        type=str,
+        default="sentence-transformers",
+        choices=["sentence-transformers", "openai", "mlx"],
+        help="The embedding backend mode",
+    )
+    parser.add_argument(
+        "--use-existing-index",
+        action="store_true",
+        help="Use existing index without rebuilding",
+    )

     args = parser.parse_args()

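To show how the three new flags parse, here is a self-contained stand-in parser that declares only those options (the real script defines many more arguments); the OpenAI model name is just an example value:

```python
import argparse

# Stand-in parser containing only the flags added in this diff.
parser = argparse.ArgumentParser()
parser.add_argument("--embedding-model", type=str, default="facebook/contriever")
parser.add_argument(
    "--embedding-mode",
    type=str,
    default="sentence-transformers",
    choices=["sentence-transformers", "openai", "mlx"],
)
parser.add_argument("--use-existing-index", action="store_true")

args = parser.parse_args(
    ["--embedding-model", "text-embedding-3-small", "--embedding-mode", "openai", "--use-existing-index"]
)
assert (args.embedding_model, args.embedding_mode, args.use_existing_index) == (
    "text-embedding-3-small",
    "openai",
    True,
)
```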
@@ -283,26 +313,34 @@ async def main():
     print(f"Index directory: {INDEX_DIR}")
     print(f"Max entries: {args.max_entries}")

-    # Find Chrome profile directories
-    from history_data.history import ChromeHistoryReader
-
-    if args.auto_find_profiles:
-        profile_dirs = ChromeHistoryReader.find_chrome_profiles()
-        if not profile_dirs:
-            print("No Chrome profiles found automatically. Exiting.")
+    if args.use_existing_index:
+        # Use existing index without rebuilding
+        if not Path(INDEX_PATH).exists():
+            print(f"Error: Index file not found at {INDEX_PATH}")
             return
+        print(f"Using existing index at {INDEX_PATH}")
+        index_path = INDEX_PATH
     else:
-        # Use single specified profile
-        profile_path = Path(args.chrome_profile)
-        if not profile_path.exists():
-            print(f"Chrome profile not found: {profile_path}")
-            return
-        profile_dirs = [profile_path]
+        # Find Chrome profile directories
+        from history_data.history import ChromeHistoryReader

-    # Create or load the LEANN index from all sources
-    index_path = create_leann_index_from_multiple_chrome_profiles(
-        profile_dirs, INDEX_PATH, args.max_entries
-    )
+        if args.auto_find_profiles:
+            profile_dirs = ChromeHistoryReader.find_chrome_profiles()
+            if not profile_dirs:
+                print("No Chrome profiles found automatically. Exiting.")
+                return
+        else:
+            # Use single specified profile
+            profile_path = Path(args.chrome_profile)
+            if not profile_path.exists():
+                print(f"Chrome profile not found: {profile_path}")
+                return
+            profile_dirs = [profile_path]

+        # Create or load the LEANN index from all sources
+        index_path = create_leann_index_from_multiple_chrome_profiles(
+            profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode
+        )
+
     if index_path:
         if args.query:
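The reshuffled main() above now short-circuits when --use-existing-index is passed and the index file is present. A reduced sketch of just that guard, with INDEX_PATH hard-coded as a placeholder and the early returns rewritten as an if/else since there is no enclosing function here:

```python
from pathlib import Path

INDEX_PATH = "chrome_history_index.leann"  # placeholder; the script derives this elsewhere
use_existing_index = True  # stands in for args.use_existing_index

if use_existing_index:
    if not Path(INDEX_PATH).exists():
        print(f"Error: Index file not found at {INDEX_PATH}")
    else:
        print(f"Using existing index at {INDEX_PATH}")
        index_path = INDEX_PATH
```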
@@ -124,7 +124,9 @@ class HNSWSearcher(BaseSearcher):
         )
         from . import faiss  # type: ignore

-        self.distance_metric = self.meta.get("distance_metric", "mips").lower()
+        self.distance_metric = (
+            self.meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower()
+        )
         metric_enum = get_metric_map().get(self.distance_metric)
         if metric_enum is None:
             raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.")
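The metric is now read from backend_kwargs in meta.json rather than from the top level, with "mips" still the fallback. A quick stand-alone check of that lookup on a hand-written meta dict:

```python
# New location: backend_kwargs.distance_metric.
meta = {"backend_kwargs": {"distance_metric": "Cosine"}}
assert meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower() == "cosine"

# Older meta.json files without backend_kwargs quietly fall back to "mips".
old_meta = {}
assert old_meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower() == "mips"
```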
@@ -200,6 +202,16 @@ class HNSWSearcher(BaseSearcher):
         params.efSearch = complexity
         params.beam_size = beam_width

+        # For OpenAI embeddings with cosine distance, disable relative distance check
+        # This prevents early termination when all scores are in a narrow range
+        embedding_model = self.meta.get("embedding_model", "").lower()
+        if self.distance_metric == "cosine" and any(
+            openai_model in embedding_model for openai_model in ["text-embedding", "openai"]
+        ):
+            params.check_relative_distance = False
+        else:
+            params.check_relative_distance = True
+
         # PQ pruning: direct mapping to HNSW's pq_pruning_ratio
         params.pq_pruning_ratio = prune_ratio

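The new guard only fires when cosine distance is combined with an OpenAI-style model name. The helper below is not part of the diff; it just restates that condition so it can be exercised in isolation:

```python
def should_check_relative_distance(distance_metric: str, embedding_model: str) -> bool:
    """Mirror of the condition above: disable the relative-distance check for
    cosine search over OpenAI-style embedding models, keep it on otherwise."""
    model = embedding_model.lower()
    is_openai_style = any(token in model for token in ["text-embedding", "openai"])
    return not (distance_metric == "cosine" and is_openai_style)

assert should_check_relative_distance("cosine", "text-embedding-3-small") is False
assert should_check_relative_distance("cosine", "facebook/contriever") is True
assert should_check_relative_distance("mips", "text-embedding-3-small") is True
```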
@@ -293,6 +293,8 @@ class EmbeddingServerManager:
             command.extend(["--passages-file", str(passages_file)])
         if embedding_mode != "sentence-transformers":
             command.extend(["--embedding-mode", embedding_mode])
+        if kwargs.get("distance_metric"):
+            command.extend(["--distance-metric", kwargs["distance_metric"]])

         return command

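Effect on the launch command, shown on a toy prefix; the earlier parts of the command are not visible in this diff, so the list below starts from a placeholder rather than the real server entry point:

```python
command = ["python", "-m", "embedding_server"]  # placeholder prefix, not taken from the diff
embedding_mode = "openai"
kwargs = {"distance_metric": "cosine"}

if embedding_mode != "sentence-transformers":
    command.extend(["--embedding-mode", embedding_mode])
if kwargs.get("distance_metric"):
    command.extend(["--distance-metric", kwargs["distance_metric"]])

print(command)
# ['python', '-m', 'embedding_server', '--embedding-mode', 'openai', '--distance-metric', 'cosine']
```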
@@ -63,12 +63,19 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
         if not self.embedding_model:
             raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.")

+        # Get distance_metric from meta if not provided in kwargs
+        distance_metric = (
+            kwargs.get("distance_metric")
+            or self.meta.get("backend_kwargs", {}).get("distance_metric")
+            or "mips"
+        )
+
         server_started, actual_port = self.embedding_server_manager.start_server(
             port=port,
             model_name=self.embedding_model,
             embedding_mode=self.embedding_mode,
             passages_file=passages_source_file,
-            distance_metric=kwargs.get("distance_metric"),
+            distance_metric=distance_metric,
             enable_warmup=kwargs.get("enable_warmup", False),
         )
         if not server_started:
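The resolution order introduced here is: explicit kwarg, then the value recorded in meta's backend_kwargs, then the old "mips" default. A small stand-alone restatement (the helper name is made up for illustration, not part of the codebase):

```python
def resolve_distance_metric(kwargs: dict, meta: dict) -> str:
    """Replicates the precedence used above: kwargs > meta backend_kwargs > 'mips'."""
    return (
        kwargs.get("distance_metric")
        or meta.get("backend_kwargs", {}).get("distance_metric")
        or "mips"
    )

assert resolve_distance_metric({"distance_metric": "cosine"}, {}) == "cosine"
assert resolve_distance_metric({}, {"backend_kwargs": {"distance_metric": "cosine"}}) == "cosine"
assert resolve_distance_metric({}, {}) == "mips"
```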