Compare commits
7 Commits
v0.1.15
...
fix/openai
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4b4b825fec | ||
|
|
34ef0db42f | ||
|
|
41812c7d22 | ||
|
|
2047a1a128 | ||
|
|
261006c36a | ||
|
|
402e8f97ad | ||
|
|
9a5c197acd |
12
README.md
12
README.md
@@ -94,14 +94,14 @@ uv sync
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Quick Star
|
## Quick Start
|
||||||
|
|
||||||
Our declarative API makes RAG as easy as writing a config file.
|
Our declarative API makes RAG as easy as writing a config file.
|
||||||
|
|
||||||
[Open in Colab](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) [Try in this ipynb file →](demo.ipynb)
|
[Open in Colab](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) [Try in this ipynb file →](demo.ipynb)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from leann import LeannBuilder, LeannSearcher, LeannCha
|
from leann import LeannBuilder, LeannSearcher, LeannChat
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
INDEX_PATH = str(Path("./").resolve() / "demo.leann")
|
INDEX_PATH = str(Path("./").resolve() / "demo.leann")
|
||||||
|
|
||||||
@@ -268,7 +268,7 @@ The default Chrome profile path is configured for a typical macOS setup. If you
|
|||||||
1. Open Terminal
|
1. Open Terminal
|
||||||
2. Run: `ls ~/Library/Application\ Support/Google/Chrome/`
|
2. Run: `ls ~/Library/Application\ Support/Google/Chrome/`
|
||||||
3. Look for folders like "Default", "Profile 1", "Profile 2", etc.
|
3. Look for folders like "Default", "Profile 1", "Profile 2", etc.
|
||||||
4. Use the full path as your `--chrome-profile` argumen
|
4. Use the full path as your `--chrome-profile` argument
|
||||||
|
|
||||||
**Common Chrome profile locations:**
|
**Common Chrome profile locations:**
|
||||||
- macOS: `~/Library/Application Support/Google/Chrome/Default`
|
- macOS: `~/Library/Application Support/Google/Chrome/Default`
|
||||||
@@ -311,7 +311,7 @@ sudo packages/wechat-exporter/wechattweak-cli install
|
|||||||
|
|
||||||
**Troubleshooting:**
|
**Troubleshooting:**
|
||||||
- **Installation issues**: Check the [WeChatTweak-CLI issues page](https://github.com/sunnyyoung/WeChatTweak-CLI/issues/41)
|
- **Installation issues**: Check the [WeChatTweak-CLI issues page](https://github.com/sunnyyoung/WeChatTweak-CLI/issues/41)
|
||||||
- **Export errors**: If you encounter the error below, try restarting WeCha
|
- **Export errors**: If you encounter the error below, try restarting WeChat
|
||||||
```
|
```
|
||||||
Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.
|
Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.
|
||||||
Failed to find or export WeChat data. Exiting.
|
Failed to find or export WeChat data. Exiting.
|
||||||
@@ -366,7 +366,7 @@ leann search my-docs "machine learning concepts"
|
|||||||
leann ask my-docs --interactive
|
leann ask my-docs --interactive
|
||||||
|
|
||||||
# List all your indexes
|
# List all your indexes
|
||||||
leann lis
|
leann list
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key CLI features:**
|
**Key CLI features:**
|
||||||
@@ -451,7 +451,7 @@ Options:
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv pip install -e ".[dev]" # Install dev dependencies
|
uv pip install -e ".[dev]" # Install dev dependencies
|
||||||
python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR datase
|
python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR dataset
|
||||||
python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index # Wikipedia
|
python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index # Wikipedia
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles(
|
|||||||
profile_dirs: list[Path],
|
profile_dirs: list[Path],
|
||||||
index_path: str = "chrome_history_index.leann",
|
index_path: str = "chrome_history_index.leann",
|
||||||
max_count: int = -1,
|
max_count: int = -1,
|
||||||
|
embedding_model: str = "facebook/contriever",
|
||||||
|
embedding_mode: str = "sentence-transformers",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create LEANN index from multiple Chrome profile data sources.
|
Create LEANN index from multiple Chrome profile data sources.
|
||||||
@@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles(
|
|||||||
profile_dirs: List of Path objects pointing to Chrome profile directories
|
profile_dirs: List of Path objects pointing to Chrome profile directories
|
||||||
index_path: Path to save the LEANN index
|
index_path: Path to save the LEANN index
|
||||||
max_count: Maximum number of history entries to process per profile
|
max_count: Maximum number of history entries to process per profile
|
||||||
|
embedding_model: The embedding model to use
|
||||||
|
embedding_mode: The embedding backend mode
|
||||||
"""
|
"""
|
||||||
print("Creating LEANN index from multiple Chrome profile data sources...")
|
print("Creating LEANN index from multiple Chrome profile data sources...")
|
||||||
|
|
||||||
@@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles(
|
|||||||
print("\n[PHASE 1] Building Leann index...")
|
print("\n[PHASE 1] Building Leann index...")
|
||||||
|
|
||||||
# Use HNSW backend for better macOS compatibility
|
# Use HNSW backend for better macOS compatibility
|
||||||
|
# LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
|
||||||
builder = LeannBuilder(
|
builder = LeannBuilder(
|
||||||
backend_name="hnsw",
|
backend_name="hnsw",
|
||||||
embedding_model="facebook/contriever",
|
embedding_model=embedding_model,
|
||||||
|
embedding_mode=embedding_mode,
|
||||||
graph_degree=32,
|
graph_degree=32,
|
||||||
complexity=64,
|
complexity=64,
|
||||||
is_compact=True,
|
is_compact=True,
|
||||||
@@ -132,6 +138,8 @@ def create_leann_index(
|
|||||||
profile_path: str | None = None,
|
profile_path: str | None = None,
|
||||||
index_path: str = "chrome_history_index.leann",
|
index_path: str = "chrome_history_index.leann",
|
||||||
max_count: int = 1000,
|
max_count: int = 1000,
|
||||||
|
embedding_model: str = "facebook/contriever",
|
||||||
|
embedding_mode: str = "sentence-transformers",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create LEANN index from Chrome history data.
|
Create LEANN index from Chrome history data.
|
||||||
@@ -140,6 +148,8 @@ def create_leann_index(
|
|||||||
profile_path: Path to the Chrome profile directory (optional, uses default if None)
|
profile_path: Path to the Chrome profile directory (optional, uses default if None)
|
||||||
index_path: Path to save the LEANN index
|
index_path: Path to save the LEANN index
|
||||||
max_count: Maximum number of history entries to process
|
max_count: Maximum number of history entries to process
|
||||||
|
embedding_model: The embedding model to use
|
||||||
|
embedding_mode: The embedding backend mode
|
||||||
"""
|
"""
|
||||||
print("Creating LEANN index from Chrome history data...")
|
print("Creating LEANN index from Chrome history data...")
|
||||||
INDEX_DIR = Path(index_path).parent
|
INDEX_DIR = Path(index_path).parent
|
||||||
@@ -187,9 +197,11 @@ def create_leann_index(
|
|||||||
print("\n[PHASE 1] Building Leann index...")
|
print("\n[PHASE 1] Building Leann index...")
|
||||||
|
|
||||||
# Use HNSW backend for better macOS compatibility
|
# Use HNSW backend for better macOS compatibility
|
||||||
|
# LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric
|
||||||
builder = LeannBuilder(
|
builder = LeannBuilder(
|
||||||
backend_name="hnsw",
|
backend_name="hnsw",
|
||||||
embedding_model="facebook/contriever",
|
embedding_model=embedding_model,
|
||||||
|
embedding_mode=embedding_mode,
|
||||||
graph_degree=32,
|
graph_degree=32,
|
||||||
complexity=64,
|
complexity=64,
|
||||||
is_compact=True,
|
is_compact=True,
|
||||||
@@ -273,6 +285,24 @@ async def main():
|
|||||||
default=True,
|
default=True,
|
||||||
help="Automatically find all Chrome profiles (default: True)",
|
help="Automatically find all Chrome profiles (default: True)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--embedding-model",
|
||||||
|
type=str,
|
||||||
|
default="facebook/contriever",
|
||||||
|
help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--embedding-mode",
|
||||||
|
type=str,
|
||||||
|
default="sentence-transformers",
|
||||||
|
choices=["sentence-transformers", "openai", "mlx"],
|
||||||
|
help="The embedding backend mode",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-existing-index",
|
||||||
|
action="store_true",
|
||||||
|
help="Use existing index without rebuilding",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -283,26 +313,34 @@ async def main():
|
|||||||
print(f"Index directory: {INDEX_DIR}")
|
print(f"Index directory: {INDEX_DIR}")
|
||||||
print(f"Max entries: {args.max_entries}")
|
print(f"Max entries: {args.max_entries}")
|
||||||
|
|
||||||
# Find Chrome profile directories
|
if args.use_existing_index:
|
||||||
from history_data.history import ChromeHistoryReader
|
# Use existing index without rebuilding
|
||||||
|
if not Path(INDEX_PATH).exists():
|
||||||
if args.auto_find_profiles:
|
print(f"Error: Index file not found at {INDEX_PATH}")
|
||||||
profile_dirs = ChromeHistoryReader.find_chrome_profiles()
|
|
||||||
if not profile_dirs:
|
|
||||||
print("No Chrome profiles found automatically. Exiting.")
|
|
||||||
return
|
return
|
||||||
|
print(f"Using existing index at {INDEX_PATH}")
|
||||||
|
index_path = INDEX_PATH
|
||||||
else:
|
else:
|
||||||
# Use single specified profile
|
# Find Chrome profile directories
|
||||||
profile_path = Path(args.chrome_profile)
|
from history_data.history import ChromeHistoryReader
|
||||||
if not profile_path.exists():
|
|
||||||
print(f"Chrome profile not found: {profile_path}")
|
|
||||||
return
|
|
||||||
profile_dirs = [profile_path]
|
|
||||||
|
|
||||||
# Create or load the LEANN index from all sources
|
if args.auto_find_profiles:
|
||||||
index_path = create_leann_index_from_multiple_chrome_profiles(
|
profile_dirs = ChromeHistoryReader.find_chrome_profiles()
|
||||||
profile_dirs, INDEX_PATH, args.max_entries
|
if not profile_dirs:
|
||||||
)
|
print("No Chrome profiles found automatically. Exiting.")
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
# Use single specified profile
|
||||||
|
profile_path = Path(args.chrome_profile)
|
||||||
|
if not profile_path.exists():
|
||||||
|
print(f"Chrome profile not found: {profile_path}")
|
||||||
|
return
|
||||||
|
profile_dirs = [profile_path]
|
||||||
|
|
||||||
|
# Create or load the LEANN index from all sources
|
||||||
|
index_path = create_leann_index_from_multiple_chrome_profiles(
|
||||||
|
profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode
|
||||||
|
)
|
||||||
|
|
||||||
if index_path:
|
if index_path:
|
||||||
if args.query:
|
if args.query:
|
||||||
|
|||||||
@@ -124,7 +124,9 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
)
|
)
|
||||||
from . import faiss # type: ignore
|
from . import faiss # type: ignore
|
||||||
|
|
||||||
self.distance_metric = self.meta.get("distance_metric", "mips").lower()
|
self.distance_metric = (
|
||||||
|
self.meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower()
|
||||||
|
)
|
||||||
metric_enum = get_metric_map().get(self.distance_metric)
|
metric_enum = get_metric_map().get(self.distance_metric)
|
||||||
if metric_enum is None:
|
if metric_enum is None:
|
||||||
raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.")
|
raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.")
|
||||||
@@ -200,6 +202,16 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
params.efSearch = complexity
|
params.efSearch = complexity
|
||||||
params.beam_size = beam_width
|
params.beam_size = beam_width
|
||||||
|
|
||||||
|
# For OpenAI embeddings with cosine distance, disable relative distance check
|
||||||
|
# This prevents early termination when all scores are in a narrow range
|
||||||
|
embedding_model = self.meta.get("embedding_model", "").lower()
|
||||||
|
if self.distance_metric == "cosine" and any(
|
||||||
|
openai_model in embedding_model for openai_model in ["text-embedding", "openai"]
|
||||||
|
):
|
||||||
|
params.check_relative_distance = False
|
||||||
|
else:
|
||||||
|
params.check_relative_distance = True
|
||||||
|
|
||||||
# PQ pruning: direct mapping to HNSW's pq_pruning_ratio
|
# PQ pruning: direct mapping to HNSW's pq_pruning_ratio
|
||||||
params.pq_pruning_ratio = prune_ratio
|
params.pq_pruning_ratio = prune_ratio
|
||||||
|
|
||||||
|
|||||||
@@ -293,6 +293,8 @@ class EmbeddingServerManager:
|
|||||||
command.extend(["--passages-file", str(passages_file)])
|
command.extend(["--passages-file", str(passages_file)])
|
||||||
if embedding_mode != "sentence-transformers":
|
if embedding_mode != "sentence-transformers":
|
||||||
command.extend(["--embedding-mode", embedding_mode])
|
command.extend(["--embedding-mode", embedding_mode])
|
||||||
|
if kwargs.get("distance_metric"):
|
||||||
|
command.extend(["--distance-metric", kwargs["distance_metric"]])
|
||||||
|
|
||||||
return command
|
return command
|
||||||
|
|
||||||
|
|||||||
@@ -63,12 +63,19 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
|||||||
if not self.embedding_model:
|
if not self.embedding_model:
|
||||||
raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.")
|
raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.")
|
||||||
|
|
||||||
|
# Get distance_metric from meta if not provided in kwargs
|
||||||
|
distance_metric = (
|
||||||
|
kwargs.get("distance_metric")
|
||||||
|
or self.meta.get("backend_kwargs", {}).get("distance_metric")
|
||||||
|
or "mips"
|
||||||
|
)
|
||||||
|
|
||||||
server_started, actual_port = self.embedding_server_manager.start_server(
|
server_started, actual_port = self.embedding_server_manager.start_server(
|
||||||
port=port,
|
port=port,
|
||||||
model_name=self.embedding_model,
|
model_name=self.embedding_model,
|
||||||
embedding_mode=self.embedding_mode,
|
embedding_mode=self.embedding_mode,
|
||||||
passages_file=passages_source_file,
|
passages_file=passages_source_file,
|
||||||
distance_metric=kwargs.get("distance_metric"),
|
distance_metric=distance_metric,
|
||||||
enable_warmup=kwargs.get("enable_warmup", False),
|
enable_warmup=kwargs.get("enable_warmup", False),
|
||||||
)
|
)
|
||||||
if not server_started:
|
if not server_started:
|
||||||
|
|||||||
Reference in New Issue
Block a user