From 2047a1a1283670495786b9ec1b99ec252cd0cdb3 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 27 Jul 2025 23:09:56 -0700
Subject: [PATCH] feat: add OpenAI embeddings support to google_history_reader_leann.py

- Add --embedding-model and --embedding-mode arguments
- Support automatic detection of normalized embeddings
- Works correctly with cosine distance for OpenAI embeddings
---
 examples/google_history_reader_leann.py | 31 ++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py
index 82d78b1..570cb5e 100644
--- a/examples/google_history_reader_leann.py
+++ b/examples/google_history_reader_leann.py
@@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles(
     profile_dirs: list[Path],
     index_path: str = "chrome_history_index.leann",
     max_count: int = -1,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from multiple Chrome profile data sources.
@@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles(
         profile_dirs: List of Path objects pointing to Chrome profile directories
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process per profile
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from multiple Chrome profile data sources...")
 
@@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles(
     print("\n[PHASE 1] Building Leann index...")
 
     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -132,6 +138,8 @@ def create_leann_index(
     profile_path: str | None = None,
     index_path: str = "chrome_history_index.leann",
     max_count: int = 1000,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from Chrome history data.
@@ -140,6 +148,8 @@ def create_leann_index(
         profile_path: Path to the Chrome profile directory (optional, uses default if None)
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from Chrome history data...")
     INDEX_DIR = Path(index_path).parent
@@ -187,9 +197,11 @@ def create_leann_index(
     print("\n[PHASE 1] Building Leann index...")
 
     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -273,6 +285,19 @@ async def main():
         default=True,
         help="Automatically find all Chrome profiles (default: True)",
     )
+    parser.add_argument(
+        "--embedding-model",
+        type=str,
+        default="facebook/contriever",
+        help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')",
+    )
+    parser.add_argument(
+        "--embedding-mode",
+        type=str,
+        default="sentence-transformers",
+        choices=["sentence-transformers", "openai", "mlx"],
+        help="The embedding backend mode",
+    )
 
     args = parser.parse_args()
 
@@ -301,7 +326,7 @@ async def main():
 
     # Create or load the LEANN index from all sources
     index_path = create_leann_index_from_multiple_chrome_profiles(
-        profile_dirs, INDEX_PATH, args.max_entries
+        profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode
    )
 
     if index_path:
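
Note (not part of the patch): a minimal sketch of the builder configuration this change enables when the OpenAI mode is selected. The import path is an assumption based on how the example script uses LeannBuilder, and "text-embedding-3-small" is simply the model named in the --embedding-model help text; adjust both to your setup.

# Sketch only, assuming LeannBuilder is importable from the leann package.
from leann import LeannBuilder

builder = LeannBuilder(
    backend_name="hnsw",                       # HNSW backend, as in the patched example
    embedding_model="text-embedding-3-small",  # OpenAI model mentioned in the --embedding-model help text
    embedding_mode="openai",                   # one of the --embedding-mode choices added by this patch
    graph_degree=32,
    complexity=64,
    is_compact=True,
)
# Per the comment added in the patch, the builder detects that these embeddings are
# normalized and chooses an appropriate (cosine-style) distance metric on its own,
# so no extra distance configuration is passed here.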