From 2047a1a1283670495786b9ec1b99ec252cd0cdb3 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 27 Jul 2025 23:09:56 -0700
Subject: [PATCH] feat: add OpenAI embeddings support to google_history_reader_leann.py

- Add --embedding-model and --embedding-mode arguments
- Support automatic detection of normalized embeddings
- Works correctly with cosine distance for OpenAI embeddings
---
 examples/google_history_reader_leann.py | 31 ++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py
index 82d78b1..570cb5e 100644
--- a/examples/google_history_reader_leann.py
+++ b/examples/google_history_reader_leann.py
@@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles(
     profile_dirs: list[Path],
     index_path: str = "chrome_history_index.leann",
     max_count: int = -1,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from multiple Chrome profile data sources.
@@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles(
         profile_dirs: List of Path objects pointing to Chrome profile directories
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process per profile
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from multiple Chrome profile data sources...")
 
@@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles(
     print("\n[PHASE 1] Building Leann index...")
 
     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -132,6 +138,8 @@ def create_leann_index(
     profile_path: str | None = None,
     index_path: str = "chrome_history_index.leann",
     max_count: int = 1000,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from Chrome history data.
@@ -140,6 +148,8 @@ def create_leann_index(
         profile_path: Path to the Chrome profile directory (optional, uses default if None)
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from Chrome history data...")
     INDEX_DIR = Path(index_path).parent
@@ -187,9 +197,11 @@ def create_leann_index(
     print("\n[PHASE 1] Building Leann index...")
 
     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -273,6 +285,19 @@ async def main():
         default=True,
         help="Automatically find all Chrome profiles (default: True)",
     )
+    parser.add_argument(
+        "--embedding-model",
+        type=str,
+        default="facebook/contriever",
+        help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')",
+    )
+    parser.add_argument(
+        "--embedding-mode",
+        type=str,
+        default="sentence-transformers",
+        choices=["sentence-transformers", "openai", "mlx"],
+        help="The embedding backend mode",
+    )
 
     args = parser.parse_args()
 
@@ -301,7 +326,7 @@ async def main():
 
     # Create or load the LEANN index from all sources
     index_path = create_leann_index_from_multiple_chrome_profiles(
-        profile_dirs, INDEX_PATH, args.max_entries
+        profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode
    )
 
     if index_path:
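
Note (not part of the patch): a minimal sketch of the builder configuration this change enables when the OpenAI mode is selected. The import path is an assumption based on how the example script uses LeannBuilder, and "text-embedding-3-small" is simply the model named in the --embedding-model help text; adjust both to your setup.

# Sketch only, assuming LeannBuilder is importable from the leann package.
from leann import LeannBuilder

builder = LeannBuilder(
    backend_name="hnsw",                       # HNSW backend, as in the patched example
    embedding_model="text-embedding-3-small",  # OpenAI model mentioned in the --embedding-model help text
    embedding_mode="openai",                   # one of the --embedding-mode choices added by this patch
    graph_degree=32,
    complexity=64,
    is_compact=True,
)
# Per the comment added in the patch, the builder detects that these embeddings are
# normalized and chooses an appropriate (cosine-style) distance metric on its own,
# so no extra distance configuration is passed here.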