feat: support more embedders
This commit is contained in:
17
packages/leann-core/src/leann/__init__.py
Normal file
17
packages/leann-core/src/leann/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
# This file makes the 'leann' directory a Python package.

from .api import LeannBuilder, LeannSearcher, LeannChat, SearchResult

# Import backends to ensure they are registered.
# Each backend import is best-effort: a backend that is not installed raises
# ImportError, which is deliberately ignored so the package still imports
# with whichever backends are available.
try:
    import leann_backend_hnsw
except ImportError:
    pass

try:
    import leann_backend_diskann
except ImportError:
    pass


__all__ = ['LeannBuilder', 'LeannSearcher', 'LeannChat', 'SearchResult']
|
||||
@@ -6,22 +6,69 @@ import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import openai
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# --- Helper Functions for Embeddings ---
|
||||
|
||||
def _get_openai_client():
|
||||
"""Initializes and returns an OpenAI client, ensuring the API key is set."""
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY environment variable not set, which is required for OpenAI models.")
|
||||
return openai.OpenAI(api_key=api_key)
|
||||
|
||||
def _is_openai_model(model_name: str) -> bool:
|
||||
"""Checks if the model is likely an OpenAI embedding model."""
|
||||
# This is a simple check, can be improved with a more robust list.
|
||||
return "ada" in model_name or "babbage" in model_name or model_name.startswith("text-embedding-")
|
||||
|
||||
def _compute_embeddings(chunks: List[str], model_name: str) -> np.ndarray:
    """Computes embeddings for a list of text chunks using either SentenceTransformers or OpenAI.

    The backend is chosen by ``_is_openai_model(model_name)``. Exactly one
    backend runs per call.

    Args:
        chunks: The texts to embed.
        model_name: The embedding model identifier.

    Returns:
        A float32 NumPy array built from the per-chunk embeddings
        (presumably shape ``(len(chunks), dim)`` -- not verified here).

    Raises:
        ValueError: Propagated from ``_get_openai_client`` when an OpenAI
            model is requested and no API key is configured.
    """
    if _is_openai_model(model_name):
        print(f"INFO: Computing embeddings for {len(chunks)} chunks using OpenAI model '{model_name}'...")
        client = _get_openai_client()
        # All chunks are sent in a single request.
        response = client.embeddings.create(model=model_name, input=chunks)
        embeddings = [item.embedding for item in response.data]
    else:
        # Imported here rather than at module top, so the package is only
        # required when a local model is actually used.
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(model_name)
        print(f"INFO: Computing embeddings for {len(chunks)} chunks using SentenceTransformer model '{model_name}'...")
        embeddings = model.encode(chunks, show_progress_bar=True)

    return np.asarray(embeddings, dtype=np.float32)
|
||||
|
||||
def _get_embedding_dimensions(model_name: str) -> int:
    """Gets the embedding dimensions for a given model.

    For OpenAI models the dimension is measured by embedding a dummy string,
    which issues a real API request. For other models it is read from the
    loaded SentenceTransformer.

    Args:
        model_name: The embedding model identifier.

    Returns:
        The length of a single embedding vector.

    Raises:
        ValueError: If the SentenceTransformer model reports no dimension, or
            (propagated from ``_get_openai_client``) if no API key is set for
            an OpenAI model.
    """
    print(f"INFO: Calculating dimensions for model '{model_name}'...")

    if _is_openai_model(model_name):
        client = _get_openai_client()
        # Embed one throwaway input and measure the returned vector.
        response = client.embeddings.create(model=model_name, input=["dummy text"])
        return len(response.data[0].embedding)

    # Imported here rather than at module top, so the package is only
    # required when a local model is actually used.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(model_name)
    dimension = model.get_sentence_embedding_dimension()
    if dimension is None:
        raise ValueError(f"Model '{model_name}' does not have a valid embedding dimension.")
    return dimension
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """Represents a single search result."""

    # Label returned by the backend; the searcher also uses it as the index
    # into the stored chunk list.
    id: int
    # Value taken from the backend's 'distances' output. Whether larger or
    # smaller is better is backend-dependent and not established here.
    score: float
    # Text of the matched chunk.
    text: str
    # Per-chunk metadata; each instance gets its own empty dict by default.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
# --- Core Classes ---
|
||||
|
||||
class LeannBuilder:
|
||||
"""
|
||||
The builder is responsible for building the index, it will compute the embeddings and then build the index.
|
||||
It will also save the metadata of the index.
|
||||
"""
|
||||
def __init__(self, backend_name: str, embedding_model: str = "sentence-transformers/all-mpnet-base-v2", **backend_kwargs):
|
||||
def __init__(self, backend_name: str, embedding_model: str = "sentence-transformers/all-mpnet-base-v2", dimensions: Optional[int] = None, **backend_kwargs):
|
||||
self.backend_name = backend_name
|
||||
backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(backend_name)
|
||||
if backend_factory is None:
|
||||
@@ -29,6 +76,7 @@ class LeannBuilder:
|
||||
self.backend_factory = backend_factory
|
||||
|
||||
self.embedding_model = embedding_model
|
||||
self.dimensions = dimensions
|
||||
self.backend_kwargs = backend_kwargs
|
||||
self.chunks: List[Dict[str, Any]] = []
|
||||
print(f"INFO: LeannBuilder initialized with '{backend_name}' backend.")
|
||||
@@ -40,12 +88,18 @@ class LeannBuilder:
|
||||
if not self.chunks:
|
||||
raise ValueError("No chunks added. Use add_text() first.")
|
||||
|
||||
if self.dimensions is None:
|
||||
self.dimensions = _get_embedding_dimensions(self.embedding_model)
|
||||
print(f"INFO: Auto-detected dimensions for '{self.embedding_model}': {self.dimensions}")
|
||||
|
||||
texts_to_embed = [c["text"] for c in self.chunks]
|
||||
embeddings = _compute_embeddings(texts_to_embed, self.embedding_model)
|
||||
|
||||
builder_instance = self.backend_factory.builder(**self.backend_kwargs)
|
||||
# Pass chunks data for passages file generation
|
||||
build_kwargs = self.backend_kwargs.copy()
|
||||
current_backend_kwargs = self.backend_kwargs.copy()
|
||||
current_backend_kwargs['dimensions'] = self.dimensions
|
||||
builder_instance = self.backend_factory.builder(**current_backend_kwargs)
|
||||
|
||||
build_kwargs = current_backend_kwargs.copy()
|
||||
build_kwargs['chunks'] = self.chunks
|
||||
builder_instance.build(embeddings, index_path, **build_kwargs)
|
||||
|
||||
@@ -56,6 +110,7 @@ class LeannBuilder:
|
||||
"version": "0.1.0",
|
||||
"backend_name": self.backend_name,
|
||||
"embedding_model": self.embedding_model,
|
||||
"dimensions": self.dimensions,
|
||||
"backend_kwargs": self.backend_kwargs,
|
||||
"num_chunks": len(self.chunks),
|
||||
"chunks": self.chunks,
|
||||
@@ -87,6 +142,8 @@ class LeannSearcher:
|
||||
|
||||
final_kwargs = self.meta_data.get("backend_kwargs", {})
|
||||
final_kwargs.update(backend_kwargs)
|
||||
if 'dimensions' not in final_kwargs:
|
||||
final_kwargs['dimensions'] = self.meta_data.get('dimensions')
|
||||
|
||||
self.backend_impl = backend_factory.searcher(index_path, **final_kwargs)
|
||||
print(f"INFO: LeannSearcher initialized with '{backend_name}' backend using index '{index_path}'.")
|
||||
@@ -94,18 +151,19 @@ class LeannSearcher:
|
||||
def search(self, query: str, top_k: int = 5, **search_kwargs):
|
||||
query_embedding = _compute_embeddings([query], self.embedding_model)
|
||||
|
||||
search_kwargs['embedding_model'] = self.embedding_model
|
||||
results = self.backend_impl.search(query_embedding, top_k, **search_kwargs)
|
||||
|
||||
enriched_results = []
|
||||
for label, dist in zip(results['labels'][0], results['distances'][0]):
|
||||
if label < len(self.meta_data['chunks']):
|
||||
chunk_info = self.meta_data['chunks'][label]
|
||||
enriched_results.append({
|
||||
"id": label,
|
||||
"score": dist,
|
||||
"text": chunk_info['text'],
|
||||
"metadata": chunk_info['metadata']
|
||||
})
|
||||
enriched_results.append(SearchResult(
|
||||
id=label,
|
||||
score=dist,
|
||||
text=chunk_info['text'],
|
||||
metadata=chunk_info.get('metadata', {})
|
||||
))
|
||||
return enriched_results
|
||||
|
||||
|
||||
@@ -125,15 +183,6 @@ class LeannChat:
|
||||
|
||||
self.searcher = LeannSearcher(index_path, **kwargs)
|
||||
self.llm_model = llm_model
|
||||
self.openai_client = None # Lazy load
|
||||
|
||||
def _get_openai_client(self):
|
||||
if self.openai_client is None:
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY environment variable not set.")
|
||||
self.openai_client = openai.OpenAI(api_key=api_key)
|
||||
return self.openai_client
|
||||
|
||||
def ask(self, question: str, top_k=5, **kwargs):
|
||||
"""
|
||||
@@ -165,13 +214,13 @@ class LeannChat:
|
||||
"""
|
||||
|
||||
results = self.searcher.search(question, top_k=top_k, **kwargs)
|
||||
context = "\n\n".join([r['text'] for r in results])
|
||||
context = "\n\n".join([r.text for r in results])
|
||||
|
||||
prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
|
||||
|
||||
print(f"DEBUG: Calling LLM with prompt: {prompt}...")
|
||||
try:
|
||||
client = self._get_openai_client()
|
||||
client = _get_openai_client()
|
||||
response = client.chat.completions.create(
|
||||
model=self.llm_model,
|
||||
messages=[
|
||||
|
||||
Reference in New Issue
Block a user