feat: add OpenAI embeddings support to google_history_reader_leann.py
- Add --embedding-model and --embedding-mode arguments
- Support automatic detection of normalized embeddings
- Work correctly with cosine distance for OpenAI embeddings
This commit is contained in:
@@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles(
|
|||||||
profile_dirs: list[Path],
|
profile_dirs: list[Path],
|
||||||
index_path: str = "chrome_history_index.leann",
|
index_path: str = "chrome_history_index.leann",
|
||||||
max_count: int = -1,
|
max_count: int = -1,
|
||||||
|
embedding_model: str = "facebook/contriever",
|
||||||
|
embedding_mode: str = "sentence-transformers",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create LEANN index from multiple Chrome profile data sources.
|
Create LEANN index from multiple Chrome profile data sources.
|
||||||
@@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles(
|
|||||||
profile_dirs: List of Path objects pointing to Chrome profile directories
|
profile_dirs: List of Path objects pointing to Chrome profile directories
|
||||||
index_path: Path to save the LEANN index
|
index_path: Path to save the LEANN index
|
||||||
max_count: Maximum number of history entries to process per profile
|
max_count: Maximum number of history entries to process per profile
|
||||||
|
embedding_model: The embedding model to use
|
||||||
|
embedding_mode: The embedding backend mode
|
||||||
"""
|
"""
|
||||||
print("Creating LEANN index from multiple Chrome profile data sources...")
|
print("Creating LEANN index from multiple Chrome profile data sources...")
|
||||||
|
|
||||||
@@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles(
|
|||||||
print("\n[PHASE 1] Building Leann index...")
|
print("\n[PHASE 1] Building Leann index...")
|
||||||
|
|
||||||
# Use HNSW backend for better macOS compatibility
|
# Use HNSW backend for better macOS compatibility
|
||||||
|
# LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
|
||||||
builder = LeannBuilder(
|
builder = LeannBuilder(
|
||||||
backend_name="hnsw",
|
backend_name="hnsw",
|
||||||
embedding_model="facebook/contriever",
|
embedding_model=embedding_model,
|
||||||
|
embedding_mode=embedding_mode,
|
||||||
graph_degree=32,
|
graph_degree=32,
|
||||||
complexity=64,
|
complexity=64,
|
||||||
is_compact=True,
|
is_compact=True,
|
||||||
@@ -132,6 +138,8 @@ def create_leann_index(
|
|||||||
profile_path: str | None = None,
|
profile_path: str | None = None,
|
||||||
index_path: str = "chrome_history_index.leann",
|
index_path: str = "chrome_history_index.leann",
|
||||||
max_count: int = 1000,
|
max_count: int = 1000,
|
||||||
|
embedding_model: str = "facebook/contriever",
|
||||||
|
embedding_mode: str = "sentence-transformers",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create LEANN index from Chrome history data.
|
Create LEANN index from Chrome history data.
|
||||||
@@ -140,6 +148,8 @@ def create_leann_index(
|
|||||||
profile_path: Path to the Chrome profile directory (optional, uses default if None)
|
profile_path: Path to the Chrome profile directory (optional, uses default if None)
|
||||||
index_path: Path to save the LEANN index
|
index_path: Path to save the LEANN index
|
||||||
max_count: Maximum number of history entries to process
|
max_count: Maximum number of history entries to process
|
||||||
|
embedding_model: The embedding model to use
|
||||||
|
embedding_mode: The embedding backend mode
|
||||||
"""
|
"""
|
||||||
print("Creating LEANN index from Chrome history data...")
|
print("Creating LEANN index from Chrome history data...")
|
||||||
INDEX_DIR = Path(index_path).parent
|
INDEX_DIR = Path(index_path).parent
|
||||||
@@ -187,9 +197,11 @@ def create_leann_index(
|
|||||||
print("\n[PHASE 1] Building Leann index...")
|
print("\n[PHASE 1] Building Leann index...")
|
||||||
|
|
||||||
# Use HNSW backend for better macOS compatibility
|
# Use HNSW backend for better macOS compatibility
|
||||||
|
# LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
|
||||||
builder = LeannBuilder(
|
builder = LeannBuilder(
|
||||||
backend_name="hnsw",
|
backend_name="hnsw",
|
||||||
embedding_model="facebook/contriever",
|
embedding_model=embedding_model,
|
||||||
|
embedding_mode=embedding_mode,
|
||||||
graph_degree=32,
|
graph_degree=32,
|
||||||
complexity=64,
|
complexity=64,
|
||||||
is_compact=True,
|
is_compact=True,
|
||||||
@@ -273,6 +285,19 @@ async def main():
|
|||||||
default=True,
|
default=True,
|
||||||
help="Automatically find all Chrome profiles (default: True)",
|
help="Automatically find all Chrome profiles (default: True)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--embedding-model",
|
||||||
|
type=str,
|
||||||
|
default="facebook/contriever",
|
||||||
|
help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--embedding-mode",
|
||||||
|
type=str,
|
||||||
|
default="sentence-transformers",
|
||||||
|
choices=["sentence-transformers", "openai", "mlx"],
|
||||||
|
help="The embedding backend mode",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -301,7 +326,7 @@ async def main():
|
|||||||
|
|
||||||
# Create or load the LEANN index from all sources
|
# Create or load the LEANN index from all sources
|
||||||
index_path = create_leann_index_from_multiple_chrome_profiles(
|
index_path = create_leann_index_from_multiple_chrome_profiles(
|
||||||
profile_dirs, INDEX_PATH, args.max_entries
|
profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode
|
||||||
)
|
)
|
||||||
|
|
||||||
if index_path:
|
if index_path:
|
||||||
|
|||||||
Reference in New Issue
Block a user