Compare commits

...

7 Commits

Author SHA1 Message Date
Andy Lee
4b4b825fec Merge remote-tracking branch 'origin/main' into fix/openai-embeddings-cosine-distance 2025-07-28 10:17:55 -07:00
Andy Lee
34ef0db42f fix: Improve OpenAI embeddings handling in HNSW backend 2025-07-28 10:15:56 -07:00
Andy Lee
41812c7d22 feat: add --use-existing-index option to google_history_reader_leann.py
- Allow using existing index without rebuilding
- Useful for testing pre-built indices
2025-07-28 00:36:57 -07:00
Andy Lee
2047a1a128 feat: add OpenAI embeddings support to google_history_reader_leann.py
- Add --embedding-model and --embedding-mode arguments
- Support automatic detection of normalized embeddings
- Works correctly with cosine distance for OpenAI embeddings
2025-07-27 23:10:20 -07:00
Andy Lee
261006c36a docs: revert 2025-07-27 22:07:36 -07:00
Andy Lee
402e8f97ad style: format 2025-07-27 20:25:40 -07:00
Andy Lee
9a5c197acd fix: auto-detect normalized embeddings and use cosine distance
- Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
- Automatically set distance_metric='cosine' for normalized embeddings
- Add warnings when using non-optimal distance metrics
- Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2)
- Fix DiskANN zmq_port compatibility with lazy loading strategy
- Add documentation for normalized embeddings feature

This fixes the low accuracy issue when using OpenAI text-embedding-3-small model with default MIPS metric.
2025-07-27 20:21:05 -07:00
5 changed files with 86 additions and 27 deletions

View File

@@ -94,14 +94,14 @@ uv sync
-## Quick Star
+## Quick Start
 Our declarative API makes RAG as easy as writing a config file.
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) [Try in this ipynb file →](demo.ipynb)
 ```python
-from leann import LeannBuilder, LeannSearcher, LeannCha
+from leann import LeannBuilder, LeannSearcher, LeannChat
 from pathlib import Path
 INDEX_PATH = str(Path("./").resolve() / "demo.leann")
@@ -268,7 +268,7 @@ The default Chrome profile path is configured for a typical macOS setup. If you
 1. Open Terminal
 2. Run: `ls ~/Library/Application\ Support/Google/Chrome/`
 3. Look for folders like "Default", "Profile 1", "Profile 2", etc.
-4. Use the full path as your `--chrome-profile` argumen
+4. Use the full path as your `--chrome-profile` argument
 **Common Chrome profile locations:**
 - macOS: `~/Library/Application Support/Google/Chrome/Default`
@@ -311,7 +311,7 @@ sudo packages/wechat-exporter/wechattweak-cli install
 **Troubleshooting:**
 - **Installation issues**: Check the [WeChatTweak-CLI issues page](https://github.com/sunnyyoung/WeChatTweak-CLI/issues/41)
-- **Export errors**: If you encounter the error below, try restarting WeCha
+- **Export errors**: If you encounter the error below, try restarting WeChat
 ```
 Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.
 Failed to find or export WeChat data. Exiting.
@@ -366,7 +366,7 @@ leann search my-docs "machine learning concepts"
 leann ask my-docs --interactive
 # List all your indexes
-leann lis
+leann list
 ```
 **Key CLI features:**
@@ -451,7 +451,7 @@ Options:
 ```bash
 uv pip install -e ".[dev]"  # Install dev dependencies
-python examples/run_evaluation.py data/indices/dpr/dpr_diskann  # DPR datase
+python examples/run_evaluation.py data/indices/dpr/dpr_diskann  # DPR dataset
 python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index  # Wikipedia
 ```

View File

@@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles(
     profile_dirs: list[Path],
     index_path: str = "chrome_history_index.leann",
     max_count: int = -1,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from multiple Chrome profile data sources.
@@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles(
         profile_dirs: List of Path objects pointing to Chrome profile directories
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process per profile
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from multiple Chrome profile data sources...")
@@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles(
     print("\n[PHASE 1] Building Leann index...")
     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -132,6 +138,8 @@ def create_leann_index(
     profile_path: str | None = None,
     index_path: str = "chrome_history_index.leann",
    max_count: int = 1000,
+    embedding_model: str = "facebook/contriever",
+    embedding_mode: str = "sentence-transformers",
 ):
     """
     Create LEANN index from Chrome history data.
@@ -140,6 +148,8 @@ def create_leann_index(
         profile_path: Path to the Chrome profile directory (optional, uses default if None)
         index_path: Path to save the LEANN index
         max_count: Maximum number of history entries to process
+        embedding_model: The embedding model to use
+        embedding_mode: The embedding backend mode
     """
     print("Creating LEANN index from Chrome history data...")
     INDEX_DIR = Path(index_path).parent
@@ -187,9 +197,11 @@ def create_leann_index(
     print("\n[PHASE 1] Building Leann index...")
     # Use HNSW backend for better macOS compatibility
+    # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
     builder = LeannBuilder(
         backend_name="hnsw",
-        embedding_model="facebook/contriever",
+        embedding_model=embedding_model,
+        embedding_mode=embedding_mode,
         graph_degree=32,
         complexity=64,
         is_compact=True,
@@ -273,6 +285,24 @@ async def main():
         default=True,
         help="Automatically find all Chrome profiles (default: True)",
     )
+    parser.add_argument(
+        "--embedding-model",
+        type=str,
+        default="facebook/contriever",
+        help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')",
+    )
+    parser.add_argument(
+        "--embedding-mode",
+        type=str,
+        default="sentence-transformers",
+        choices=["sentence-transformers", "openai", "mlx"],
+        help="The embedding backend mode",
+    )
+    parser.add_argument(
+        "--use-existing-index",
+        action="store_true",
+        help="Use existing index without rebuilding",
+    )
     args = parser.parse_args()
@@ -283,26 +313,34 @@ async def main():
     print(f"Index directory: {INDEX_DIR}")
     print(f"Max entries: {args.max_entries}")
-    # Find Chrome profile directories
-    from history_data.history import ChromeHistoryReader
-
-    if args.auto_find_profiles:
-        profile_dirs = ChromeHistoryReader.find_chrome_profiles()
-        if not profile_dirs:
-            print("No Chrome profiles found automatically. Exiting.")
-            return
+    if args.use_existing_index:
+        # Use existing index without rebuilding
+        if not Path(INDEX_PATH).exists():
+            print(f"Error: Index file not found at {INDEX_PATH}")
+            return
+        print(f"Using existing index at {INDEX_PATH}")
+        index_path = INDEX_PATH
     else:
-        # Use single specified profile
-        profile_path = Path(args.chrome_profile)
-        if not profile_path.exists():
-            print(f"Chrome profile not found: {profile_path}")
-            return
-        profile_dirs = [profile_path]
-
-    # Create or load the LEANN index from all sources
-    index_path = create_leann_index_from_multiple_chrome_profiles(
-        profile_dirs, INDEX_PATH, args.max_entries
-    )
+        # Find Chrome profile directories
+        from history_data.history import ChromeHistoryReader
+
+        if args.auto_find_profiles:
+            profile_dirs = ChromeHistoryReader.find_chrome_profiles()
+            if not profile_dirs:
+                print("No Chrome profiles found automatically. Exiting.")
+                return
+        else:
+            # Use single specified profile
+            profile_path = Path(args.chrome_profile)
+            if not profile_path.exists():
+                print(f"Chrome profile not found: {profile_path}")
+                return
+            profile_dirs = [profile_path]
+
+        # Create or load the LEANN index from all sources
+        index_path = create_leann_index_from_multiple_chrome_profiles(
+            profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode
+        )
     if index_path:
         if args.query:

View File

@@ -124,7 +124,9 @@ class HNSWSearcher(BaseSearcher):
         )
         from . import faiss  # type: ignore
-        self.distance_metric = self.meta.get("distance_metric", "mips").lower()
+        self.distance_metric = (
+            self.meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower()
+        )
         metric_enum = get_metric_map().get(self.distance_metric)
         if metric_enum is None:
             raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.")
@@ -200,6 +202,16 @@ class HNSWSearcher(BaseSearcher):
         params.efSearch = complexity
         params.beam_size = beam_width
+        # For OpenAI embeddings with cosine distance, disable relative distance check
+        # This prevents early termination when all scores are in a narrow range
+        embedding_model = self.meta.get("embedding_model", "").lower()
+        if self.distance_metric == "cosine" and any(
+            openai_model in embedding_model for openai_model in ["text-embedding", "openai"]
+        ):
+            params.check_relative_distance = False
+        else:
+            params.check_relative_distance = True
         # PQ pruning: direct mapping to HNSW's pq_pruning_ratio
         params.pq_pruning_ratio = prune_ratio

View File

@@ -293,6 +293,8 @@ class EmbeddingServerManager:
             command.extend(["--passages-file", str(passages_file)])
         if embedding_mode != "sentence-transformers":
             command.extend(["--embedding-mode", embedding_mode])
+        if kwargs.get("distance_metric"):
+            command.extend(["--distance-metric", kwargs["distance_metric"]])
         return command

View File

@@ -63,12 +63,19 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
         if not self.embedding_model:
             raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.")
+        # Get distance_metric from meta if not provided in kwargs
+        distance_metric = (
+            kwargs.get("distance_metric")
+            or self.meta.get("backend_kwargs", {}).get("distance_metric")
+            or "mips"
+        )
         server_started, actual_port = self.embedding_server_manager.start_server(
             port=port,
             model_name=self.embedding_model,
             embedding_mode=self.embedding_mode,
             passages_file=passages_source_file,
-            distance_metric=kwargs.get("distance_metric"),
+            distance_metric=distance_metric,
             enable_warmup=kwargs.get("enable_warmup", False),
         )
         if not server_started: