feat: Add Google Gemini API support for chat and embeddings

- Add GeminiChat class with gemini-2.5-flash model support
- Add compute_embeddings_gemini function with text-embedding-004 model
- Update get_llm factory to support "gemini" type
- Update API documentation to include gemini embedding mode
- Support temperature, max_tokens, top_p parameters for Gemini chat
- Support batch embedding processing with progress bars
- Add proper error handling and API key validation
2025-08-15 17:52:37 -07:00
8 changed files with 12 additions and 65 deletions
@@ -5,7 +5,7 @@
 <p align="center">
  <img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions">
  <img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
-  <img src="https://img.shields.io/badge/Platform-Ubuntu%20%26%20Arch%20%26%20WSL%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
+  <img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
  <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
  <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration">
 </p>
@@ -94,9 +94,7 @@ CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync
 **Linux:**
 ```bash
-# Ubuntu/Debian (For Arch Linux: sudo pacman -S blas lapack openblas libaio boost protobuf abseil-cpp zeromq)
+sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
 sudo apt-get update && sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
 uv sync
 ```
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "leann-backend-diskann"
-version = "0.3.0"
+version = "0.2.9"
-dependencies = ["leann-core==0.3.0", "numpy", "protobuf>=3.19.0"]
+dependencies = ["leann-core==0.2.9", "numpy", "protobuf>=3.19.0"]
 [tool.scikit-build]
 # Key: simplified CMake path
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "leann-backend-hnsw"
-version = "0.3.0"
+version = "0.2.9"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.3.0",
+    "leann-core==0.2.9",
    "numpy",
    "pyzmq>=23.0.0",
    "msgpack>=1.0.0",
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann-core"
-version = "0.3.0"
+version = "0.2.9"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -307,23 +307,6 @@ class LeannBuilder:
    def build_index(self, index_path: str):
        if not self.chunks:
            raise ValueError("No chunks added.")
        # Filter out invalid/empty text chunks early to keep passage and embedding counts aligned
        valid_chunks: list[dict[str, Any]] = []
        skipped = 0
        for chunk in self.chunks:
            text = chunk.get("text", "")
            if isinstance(text, str) and text.strip():
                valid_chunks.append(chunk)
            else:
                skipped += 1
        if skipped > 0:
            print(
                f"Warning: Skipping {skipped} empty/invalid text chunk(s). Processing {len(valid_chunks)} valid chunks"
            )
            self.chunks = valid_chunks
            if not self.chunks:
                raise ValueError("All provided chunks are empty or invalid. Nothing to index.")
        if self.dimensions is None:
            self.dimensions = len(
                compute_embeddings(
@@ -148,12 +148,6 @@ Examples:
            type=str,
            help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
        )
        build_parser.add_argument(
            "--include-hidden",
            action=argparse.BooleanOptionalAction,
            default=False,
            help="Include hidden files and directories (paths starting with '.') during indexing (default: false)",
        )
        build_parser.add_argument(
            "--doc-chunk-size",
            type=int,
@@ -417,10 +411,7 @@ Examples:
                    print(f"  leann ask {example_name} --interactive")
    def load_documents(
-        self,
+        self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
        docs_paths: Union[str, list],
        custom_file_types: Union[str, None] = None,
        include_hidden: bool = False,
    ):
        # Handle both single path (string) and multiple paths (list) for backward compatibility
        if isinstance(docs_paths, str):
@@ -464,10 +455,6 @@ Examples:
        all_documents = []
        # Helper to detect hidden path components
        def _path_has_hidden_segment(p: Path) -> bool:
            return any(part.startswith(".") and part not in [".", ".."] for part in p.parts)
        # First, process individual files if any
        if files:
            print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
@@ -480,12 +467,8 @@ Examples:
                files_by_dir = defaultdict(list)
                for file_path in files:
-                    file_path_obj = Path(file_path)
+                    parent_dir = str(Path(file_path).parent)
-                    if not include_hidden and _path_has_hidden_segment(file_path_obj):
+                    files_by_dir[parent_dir].append(file_path)
                        print(f"  ⚠️  Skipping hidden file: {file_path}")
                        continue
                    parent_dir = str(file_path_obj.parent)
                    files_by_dir[parent_dir].append(str(file_path_obj))
                # Load files from each parent directory
                for parent_dir, file_list in files_by_dir.items():
@@ -496,7 +479,6 @@ Examples:
                        file_docs = SimpleDirectoryReader(
                            parent_dir,
                            input_files=file_list,
                            # exclude_hidden only affects directory scans; input_files are explicit
                            filename_as_id=True,
                        ).load_data()
                        all_documents.extend(file_docs)
@@ -595,8 +577,6 @@ Examples:
                    # Check if file matches any exclude pattern
                    try:
                        relative_path = file_path.relative_to(docs_path)
                        if not include_hidden and _path_has_hidden_segment(relative_path):
                            continue
                        if self._should_exclude_file(relative_path, gitignore_matches):
                            continue
                    except ValueError:
@@ -624,7 +604,6 @@ Examples:
                        try:
                            default_docs = SimpleDirectoryReader(
                                str(file_path.parent),
                                exclude_hidden=not include_hidden,
                                filename_as_id=True,
                                required_exts=[file_path.suffix],
                            ).load_data()
@@ -653,7 +632,6 @@ Examples:
                    encoding="utf-8",
                    required_exts=code_extensions,
                    file_extractor={},  # Use default extractors
                    exclude_hidden=not include_hidden,
                    filename_as_id=True,
                ).load_data(show_progress=True)
@@ -803,9 +781,7 @@ Examples:
            paragraph_separator="\n\n",
        )
-        all_texts = self.load_documents(
+        all_texts = self.load_documents(docs_paths, args.file_types)
            docs_paths, args.file_types, include_hidden=args.include_hidden
        )
        if not all_texts:
            print("No documents found")
            return
@@ -246,16 +246,6 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
    except ImportError as e:
        raise ImportError(f"OpenAI package not installed: {e}")
    # Validate input list
    if not texts:
        raise ValueError("Cannot compute embeddings for empty text list")
    # Extra validation: abort early if any item is empty/whitespace
    invalid_count = sum(1 for t in texts if not isinstance(t, str) or not t.strip())
    if invalid_count > 0:
        raise ValueError(
            f"Found {invalid_count} empty/invalid text(s) in input. Upstream should filter before calling OpenAI."
        )
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable not set")
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann"
-version = "0.3.0"
+version = "0.2.9"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"