feat: hnsw embedding server and csr format

Andy Lee
2025-07-05 23:04:41 +00:00
parent 368474d036
commit 0aa84e147b
9 changed files with 959 additions and 154 deletions

View File

@@ -5,56 +5,50 @@ import numpy as np
import os
import json
from pathlib import Path
import openai
# A helper function to compute embeddings on the fly
def _compute_embeddings(chunks: List[str], model_name: str) -> np.ndarray:
    from sentence_transformers import SentenceTransformer
    # TODO: use a better embedding model
    model = SentenceTransformer(model_name)
    print(f"INFO: Computing embeddings for {len(chunks)} chunks using '{model_name}'...")
    embeddings = model.encode(chunks, show_progress_bar=True)
    return np.asarray(embeddings, dtype=np.float32)
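For reference, a minimal sketch of calling this helper (assumes sentence-transformers is installed; the model name is just the builder's default, and the 768-dim output applies to all-mpnet-base-v2 specifically):

# Hypothetical usage sketch, not part of this commit
embs = _compute_embeddings(["hello world", "goodbye"], "sentence-transformers/all-mpnet-base-v2")
print(embs.shape)  # (2, 768) for all-mpnet-base-v2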
class LeannBuilder:
"""
负责构建 Leann 索引的上层 API。
它协调 embedding 计算和后端索引构建。
The builder is responsible for building the index, it will compute the embeddings and then build the index.
It will also save the metadata of the index.
"""
    def __init__(self, backend_name: str, embedding_model: str = "sentence-transformers/all-mpnet-base-v2", **backend_kwargs):
        self.backend_name = backend_name
        backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(backend_name)
        if backend_factory is None:
            raise ValueError(f"Backend '{backend_name}' not found or not registered.")
        self.backend_factory = backend_factory
        self.embedding_model = embedding_model
        self.backend_kwargs = backend_kwargs
        self.chunks: List[Dict[str, Any]] = []
        print(f"INFO: LeannBuilder initialized with '{backend_name}' backend.")
    def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
        # Simple chunking logic
        self.chunks.append({"text": text, "metadata": metadata or {}})
    def build_index(self, index_path: str):
        if not self.chunks:
            raise ValueError("No chunks added. Use add_text() first.")
        # 1. Compute embeddings (this is leann-core's responsibility)
        texts_to_embed = [c["text"] for c in self.chunks]
        embeddings = _compute_embeddings(texts_to_embed, self.embedding_model)
        # 2. Create a builder instance and build the index
        builder_instance = self.backend_factory.builder(**self.backend_kwargs)
        # Pass chunks data for passages file generation
        build_kwargs = self.backend_kwargs.copy()
        build_kwargs['chunks'] = self.chunks
        builder_instance.build(embeddings, index_path, **build_kwargs)
        # 3. Save Leann-specific metadata (the vectors are not included)
        index_dir = Path(index_path).parent
        leann_meta_path = index_dir / f"{Path(index_path).name}.meta.json"
@@ -62,6 +56,7 @@ class LeannBuilder:
"version": "0.1.0",
"backend_name": self.backend_name,
"embedding_model": self.embedding_model,
"backend_kwargs": self.backend_kwargs,
"num_chunks": len(self.chunks),
"chunks": self.chunks,
}
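For illustration, a minimal end-to-end sketch of the builder API as it appears in this diff (the backend key "hnsw" is an assumption based on the commit title; any registered backend name works):

# Hypothetical usage sketch, not part of this commit
builder = LeannBuilder("hnsw", embedding_model="sentence-transformers/all-mpnet-base-v2")
builder.add_text("LEANN stores the HNSW graph in CSR format.", metadata={"source": "notes.md"})
builder.add_text("The embedding server computes vectors on demand.")
builder.build_index("./indexes/demo.index")  # also writes demo.index.meta.json next to the index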
@@ -72,7 +67,8 @@ class LeannBuilder:
class LeannSearcher:
"""
负责加载索引并执行检索的上层 API。
The searcher is responsible for loading the index and performing the search.
It will also load the metadata of the index.
"""
    def __init__(self, index_path: str, **backend_kwargs):
        leann_meta_path = Path(index_path).parent / f"{Path(index_path).name}.meta.json"
@@ -89,17 +85,17 @@ class LeannSearcher:
        if backend_factory is None:
            raise ValueError(f"Backend '{backend_name}' (from index file) not found or not registered.")
        # Create the searcher instance
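        # Index-time kwargs stored in the metadata act as defaults; call-time kwargs override them.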
        final_kwargs = self.meta_data.get("backend_kwargs", {})
        final_kwargs.update(backend_kwargs)
        self.backend_impl = backend_factory.searcher(index_path, **final_kwargs)
print(f"INFO: LeannSearcher initialized with '{backend_name}' backend using index '{index_path}'.")
def search(self, query: str, top_k: int = 5, **search_kwargs):
query_embedding = _compute_embeddings([query], self.embedding_model)
# 委托给后端的 search 方法
results = self.backend_impl.search(query_embedding, top_k, **search_kwargs)
# 丰富返回结果,加入原始文本和元数据
enriched_results = []
for label, dist in zip(results['labels'][0], results['distances'][0]):
if label < len(self.meta_data['chunks']):
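A hedged usage sketch of the searcher (the index path is illustrative; each enriched hit is assumed to mirror the stored chunk dict, whose 'text' field is what LeannChat consumes below):

# Hypothetical usage sketch, not part of this commit
searcher = LeannSearcher("./indexes/demo.index")
for hit in searcher.search("how is the graph stored?", top_k=3):
    print(hit['text'], hit.get('metadata'))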
@@ -115,10 +111,10 @@ class LeannSearcher:
class LeannChat:
"""
封装了 Searcher 和 LLM 的对话式 RAG 接口。
The chat is responsible for the conversation with the LLM.
It will use the searcher to get the results and then use the LLM to generate the response.
"""
    def __init__(self, index_path: str, backend_name: Optional[str] = None, llm_model: str = "gpt-4o", **kwargs):
        # If the user did not specify a backend, try to read it from the index metadata
        if backend_name is None:
            leann_meta_path = Path(index_path).parent / f"{Path(index_path).name}.meta.json"
            if not leann_meta_path.exists():
@@ -171,10 +167,8 @@ class LeannChat:
        results = self.searcher.search(question, top_k=top_k, **kwargs)
        context = "\n\n".join([r['text'] for r in results])
        # 2. Build the prompt
        prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
        # 3. Call the LLM
        print(f"DEBUG: Calling LLM with prompt: {prompt}...")
        try:
            client = self._get_openai_client()
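Taken together, a sketch of the chat flow this hunk implements (the method name ask is an assumption, since the diff shows only the method body; the retrieve-prompt-generate steps follow the numbered comments above):

# Hypothetical usage sketch, not part of this commit
chat = LeannChat("./indexes/demo.index", llm_model="gpt-4o")
answer = chat.ask("What format does the index use?", top_k=3)  # search -> prompt -> OpenAI call
print(answer)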

View File

@@ -1,10 +1,13 @@
# packages/leann-core/src/leann/registry.py
from typing import Dict, TYPE_CHECKING

if TYPE_CHECKING:
    from leann.interface import LeannBackendFactoryInterface

# The global backend registry dict
BACKEND_REGISTRY: Dict[str, 'LeannBackendFactoryInterface'] = {}
def register_backend(name: str):
    """A decorator to register a new backend class."""
    def decorator(cls):
        print(f"INFO: Registering backend '{name}'")
        BACKEND_REGISTRY[name] = cls
        return cls
    return decorator
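For illustration, how a backend factory might register itself with this decorator (the class name and backend key are hypothetical; the builder/searcher methods mirror how api.py calls the factory):

# Hypothetical registration sketch, not part of this commit
@register_backend("hnsw")
class HNSWBackendFactory:
    @staticmethod
    def builder(**kwargs):
        ...  # return a backend builder instance

    @staticmethod
    def searcher(index_path: str, **kwargs):
        ...  # return a backend searcher instance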