From 838ade231ebc9f544b5539b39e872e297e0bdf72 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sat, 16 Aug 2025 11:50:25 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=97=20Auto-register=20apps:=20Universa?= =?UTF-8?q?l=20index=20discovery=20(#64)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Enhance CLI with improved list and smart remove commands ## ✨ New Features ### 🏠 Enhanced `leann list` command - **Better UX**: Current project shown first with clear separation - **Visual improvements**: Icons (🏠/📂), better formatting, size info - **Smart guidance**: Context-aware usage examples and getting started tips ### 🛡️ Smart `leann remove` command - **Safety first**: Always shows ALL matching indexes across projects - **Intelligent handling**: - Single match: Clear location display with cross-project warnings - Multiple matches: Interactive selection with final confirmation - **Prevents accidents**: No more deleting wrong indexes due to name conflicts - **User-friendly**: 'c' to cancel, clear visual hierarchy, detailed info ### 🔧 Technical improvements - **Clean logging**: Hide debug messages for better CLI experience - **Comprehensive search**: Always scan all projects for transparency - **Error handling**: Graceful handling of edge cases and user input ## 🎯 Impact - **Safer**: Eliminates risk of accidental index deletion - **Clearer**: Users always know what they're operating on - **Smarter**: Automatic detection and handling of common scenarios 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * chore: vscode ruff, and format --------- Co-authored-by: Claude --- .gitignore | 1 + .vscode/extensions.json | 5 + .vscode/settings.json | 22 +++ README.md | 7 +- apps/base_rag_example.py | 6 + packages/leann-core/src/leann/api.py | 7 +- packages/leann-core/src/leann/chat.py | 24 ++-- packages/leann-core/src/leann/cli.py | 160 ++++++++++++----------
packages/leann-core/src/leann/registry.py | 55 +++++++- pyproject.toml | 22 +-- 10 files changed, 200 insertions(+), 109 deletions(-) create mode 100644 .vscode/extensions.json create mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index 821f602..7563979 100755 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ demo/experiment_results/**/*.json *.eml *.emlx *.json +!.vscode/*.json *.sh *.txt !CMakeLists.txt diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..e6a7fad --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "charliermarsh.ruff", + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3e1a508 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,22 @@ +{ + "python.defaultInterpreterPath": ".venv/bin/python", + "python.terminal.activateEnvironment": true, + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit", + "source.fixAll": "explicit" + }, + "editor.insertSpaces": true, + "editor.tabSize": 4 + }, + "ruff.enable": true, + "files.watcherExclude": { + "**/.venv/**": true, + "**/__pycache__/**": true, + "**/*.egg-info/**": true, + "**/build/**": true, + "**/dist/**": true + } +} diff --git a/README.md b/README.md index e0c4c52..279a37a 100755 --- a/README.md +++ b/README.md @@ -542,8 +542,10 @@ Options: leann list # Lists all indexes across all projects with status indicators: -# ✓ - Index is complete and ready to use -# ✗ - Index is incomplete or corrupted +# ✅ - Index is complete and ready to use +# ❌ - Index is incomplete or corrupted +# 📁 - CLI-created index (in .leann/indexes/) +# 📄 - App-created index (*.leann.meta.json files) ``` **Remove Command:** @@ -557,6 +559,7 @@ Options: # - Shows all matching indexes across projects # - Requires confirmation for cross-project removal
# - Interactive selection when multiple matches found +# - Supports both CLI and app-created indexes ``` diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py index 9fbb7d2..8d539d3 100644 --- a/apps/base_rag_example.py +++ b/apps/base_rag_example.py @@ -10,6 +10,7 @@ from typing import Any import dotenv from leann.api import LeannBuilder, LeannChat +from leann.registry import register_project_directory from llama_index.core.node_parser import SentenceSplitter dotenv.load_dotenv() @@ -214,6 +215,11 @@ class BaseRAGExample(ABC): builder.build_index(index_path) print(f"Index saved to: {index_path}") + # Register project directory so leann list can discover this index + # The index is saved as args.index_dir/index_name.leann + # We want to register the current working directory where the app is run + register_project_directory(Path.cwd()) + return index_path async def run_interactive_chat(self, args, index_path: str): diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index a00b6f9..ec32569 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -614,7 +614,7 @@ class LeannSearcher: zmq_port=zmq_port, ) # logger.info(f" Generated embedding shape: {query_embedding.shape}") - time.time() - start_time + # time.time() - start_time # logger.info(f" Embedding time: {embedding_time} seconds") start_time = time.time() @@ -680,8 +680,9 @@ class LeannSearcher: This method should be called after you're done using the searcher, especially in test environments or batch processing scenarios. 
""" - if hasattr(self.backend_impl, "embedding_server_manager"): - self.backend_impl.embedding_server_manager.stop_server() + backend = getattr(self.backend_impl, "embedding_server_manager", None) + if backend is not None: + backend.stop_server() # Enable automatic cleanup patterns def __enter__(self): diff --git a/packages/leann-core/src/leann/chat.py b/packages/leann-core/src/leann/chat.py index a428462..391c59d 100644 --- a/packages/leann-core/src/leann/chat.py +++ b/packages/leann-core/src/leann/chat.py @@ -707,20 +707,28 @@ class GeminiChat(LLMInterface): logger.info(f"Sending request to Gemini with model {self.model}") try: - # Set generation configuration - generation_config = { - "temperature": kwargs.get("temperature", 0.7), - "max_output_tokens": kwargs.get("max_tokens", 1000), - } + from google.genai.types import GenerateContentConfig + + generation_config = GenerateContentConfig( + temperature=kwargs.get("temperature", 0.7), + max_output_tokens=kwargs.get("max_tokens", 1000), + ) # Handle top_p parameter if "top_p" in kwargs: - generation_config["top_p"] = kwargs["top_p"] + generation_config.top_p = kwargs["top_p"] response = self.client.models.generate_content( - model=self.model, contents=prompt, config=generation_config + model=self.model, + contents=prompt, + config=generation_config, ) - return response.text.strip() + # Handle potential None response text + response_text = response.text + if response_text is None: + logger.warning("Gemini returned None response text") + return "" + return response_text.strip() except Exception as e: logger.error(f"Error communicating with Gemini: {e}") return f"Error: Could not get a response from Gemini. 
Details: {e}" diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 7968926..36a03af 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -1,13 +1,14 @@ import argparse import asyncio from pathlib import Path -from typing import Union +from typing import Optional, Union from llama_index.core import SimpleDirectoryReader from llama_index.core.node_parser import SentenceSplitter from tqdm import tqdm from .api import LeannBuilder, LeannChat, LeannSearcher +from .registry import register_project_directory def extract_pdf_text_with_pymupdf(file_path: str) -> str: @@ -263,31 +264,7 @@ Examples: def register_project_dir(self): """Register current project directory in global registry""" - global_registry = Path.home() / ".leann" / "projects.json" - global_registry.parent.mkdir(exist_ok=True) - - current_dir = str(Path.cwd()) - - # Load existing registry - projects = [] - if global_registry.exists(): - try: - import json - - with open(global_registry) as f: - projects = json.load(f) - except Exception: - projects = [] - - # Add current directory if not already present - if current_dir not in projects: - projects.append(current_dir) - - # Save registry - import json - - with open(global_registry, "w") as f: - json.dump(projects, f, indent=2) + register_project_directory() def _build_gitignore_parser(self, docs_dir: str): """Build gitignore parser using gitignore-parser library.""" @@ -373,13 +350,10 @@ Examples: valid_projects.append(current_path) # Separate current and other projects - current_project = None other_projects = [] for project_path in valid_projects: - if project_path == current_path: - current_project = project_path - else: + if project_path != current_path: other_projects.append(project_path) print("šŸ“š LEANN Indexes") @@ -389,35 +363,20 @@ Examples: current_indexes_count = 0 # Show current project first (most important) - if current_project: - current_indexes_dir = 
current_project / ".leann" / "indexes" - if current_indexes_dir.exists(): - current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()] + print("\nšŸ  Current Project") + print(f" {current_path}") + print(" " + "─" * 45) - print("\nšŸ  Current Project") - print(f" {current_project}") - print(" " + "─" * 45) - - if current_index_dirs: - for index_dir in current_index_dirs: - total_indexes += 1 - current_indexes_count += 1 - index_name = index_dir.name - meta_file = index_dir / "documents.leann.meta.json" - status = "āœ…" if meta_file.exists() else "āŒ" - - print(f" {current_indexes_count}. {index_name} {status}") - if meta_file.exists(): - size_mb = sum( - f.stat().st_size for f in index_dir.iterdir() if f.is_file() - ) / (1024 * 1024) - print(f" šŸ“¦ Size: {size_mb:.1f} MB") - else: - print(" šŸ“­ No indexes in current project") + current_indexes = self._discover_indexes_in_project(current_path) + if current_indexes: + for idx in current_indexes: + total_indexes += 1 + current_indexes_count += 1 + type_icon = "šŸ“" if idx["type"] == "cli" else "šŸ“„" + print(f" {current_indexes_count}. 
{type_icon} {idx['name']} {idx['status']}") + if idx["size_mb"] > 0: + print(f" šŸ“¦ Size: {idx['size_mb']:.1f} MB") else: - print("\nšŸ  Current Project") - print(f" {current_path}") - print(" " + "─" * 45) print(" šŸ“­ No indexes in current project") # Show other projects (reference information) @@ -426,29 +385,19 @@ Examples: print(" " + "─" * 45) for project_path in other_projects: - indexes_dir = project_path / ".leann" / "indexes" - if not indexes_dir.exists(): - continue - - index_dirs = [d for d in indexes_dir.iterdir() if d.is_dir()] - if not index_dirs: + project_indexes = self._discover_indexes_in_project(project_path) + if not project_indexes: continue print(f"\n šŸ“‚ {project_path.name}") print(f" {project_path}") - for index_dir in index_dirs: + for idx in project_indexes: total_indexes += 1 - index_name = index_dir.name - meta_file = index_dir / "documents.leann.meta.json" - status = "āœ…" if meta_file.exists() else "āŒ" - - print(f" • {index_name} {status}") - if meta_file.exists(): - size_mb = sum( - f.stat().st_size for f in index_dir.iterdir() if f.is_file() - ) / (1024 * 1024) - print(f" šŸ“¦ {size_mb:.1f} MB") + type_icon = "šŸ“" if idx["type"] == "cli" else "šŸ“„" + print(f" • {type_icon} {idx['name']} {idx['status']}") + if idx["size_mb"] > 0: + print(f" šŸ“¦ {idx['size_mb']:.1f} MB") # Summary and usage info print("\n" + "=" * 50) @@ -480,6 +429,67 @@ Examples: print("\nšŸ’” Create your first index:") print(" leann build my-docs --docs ./documents") + def _discover_indexes_in_project(self, project_path: Path): + """Discover all indexes in a project directory (both CLI and apps formats)""" + indexes = [] + + # 1. 
CLI format: .leann/indexes/index_name/ + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists(): + for index_dir in cli_indexes_dir.iterdir(): + if index_dir.is_dir(): + meta_file = index_dir / "documents.leann.meta.json" + status = "āœ…" if meta_file.exists() else "āŒ" + + size_mb = 0 + if meta_file.exists(): + try: + size_mb = sum( + f.stat().st_size for f in index_dir.iterdir() if f.is_file() + ) / (1024 * 1024) + except (OSError, PermissionError): + pass + + indexes.append( + { + "name": index_dir.name, + "type": "cli", + "status": status, + "size_mb": size_mb, + "path": index_dir, + } + ) + + # 2. Apps format: *.leann.meta.json files anywhere in the project + for meta_file in project_path.rglob("*.leann.meta.json"): + if meta_file.is_file(): + # Extract index name from filename (remove .leann.meta.json extension) + index_name = meta_file.name.replace(".leann.meta.json", "") + + # Apps indexes are considered complete if the .leann.meta.json file exists + status = "āœ…" + + # Calculate total size of all related files + size_mb = 0 + try: + index_dir = meta_file.parent + for related_file in index_dir.glob(f"{index_name}.leann*"): + size_mb += related_file.stat().st_size / (1024 * 1024) + except (OSError, PermissionError): + pass + + indexes.append( + { + "name": index_name, + "type": "app", + "status": status, + "size_mb": size_mb, + "path": meta_file, + } + ) + + return indexes + def remove_index(self, index_name: str, force: bool = False): """Safely remove an index - always show all matches for transparency""" @@ -637,7 +647,7 @@ Examples: return False def _delete_index_directory( - self, index_dir: Path, index_name: str, project_path: Path | None = None + self, index_dir: Path, index_name: str, project_path: Optional[Path] = None ): """Actually delete the index directory""" try: diff --git a/packages/leann-core/src/leann/registry.py b/packages/leann-core/src/leann/registry.py index dc6df68..6778745 100644 --- 
a/packages/leann-core/src/leann/registry.py +++ b/packages/leann-core/src/leann/registry.py @@ -2,8 +2,10 @@ import importlib import importlib.metadata +import json import logging -from typing import TYPE_CHECKING +from pathlib import Path +from typing import TYPE_CHECKING, Optional, Union if TYPE_CHECKING: from leann.interface import LeannBackendFactoryInterface @@ -43,3 +45,54 @@ def autodiscover_backends(): # print(f"WARN: Could not import backend module '{backend_module_name}': {e}") pass # print("INFO: Backend auto-discovery finished.") + + +def register_project_directory(project_dir: Optional[Union[str, Path]] = None): + """ + Register a project directory in the global LEANN registry. + + This allows `leann list` to discover indexes created by apps or other tools. + + Args: + project_dir: Directory to register. If None, uses current working directory. + """ + if project_dir is None: + project_dir = Path.cwd() + else: + project_dir = Path(project_dir) + + # Only register directories that have some kind of LEANN content + # Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format) + has_cli_indexes = (project_dir / ".leann" / "indexes").exists() + has_app_indexes = any(project_dir.rglob("*.leann.meta.json")) + + if not (has_cli_indexes or has_app_indexes): + # Don't register if there are no LEANN indexes + return + + global_registry = Path.home() / ".leann" / "projects.json" + global_registry.parent.mkdir(exist_ok=True) + + project_str = str(project_dir.resolve()) + + # Load existing registry + projects = [] + if global_registry.exists(): + try: + with open(global_registry) as f: + projects = json.load(f) + except Exception: + logger.debug("Could not load existing project registry") + projects = [] + + # Add project if not already present + if project_str not in projects: + projects.append(project_str) + + # Save updated registry + try: + with open(global_registry, "w") as f: + json.dump(projects, f, indent=2) + logger.debug(f"Registered 
project directory: {project_str}") + except Exception as e: + logger.warning(f"Could not save project registry: {e}") diff --git a/pyproject.toml b/pyproject.toml index 9aa3db3..3267332 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,8 +14,6 @@ dependencies = [ "numpy>=1.26.0", "torch", "tqdm", - "flask", - "flask_compress", "datasets>=2.15.0", "evaluate", "colorama", @@ -66,9 +64,7 @@ test = [ "pytest>=7.0", "pytest-timeout>=2.0", "llama-index-core>=0.12.0", - "llama-index-readers-file>=0.4.0", "python-dotenv>=1.0.0", - "sentence-transformers>=2.2.0", ] diskann = [ @@ -100,13 +96,8 @@ leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true } [tool.ruff] target-version = "py39" line-length = 100 -extend-exclude = [ - "third_party", - "*.egg-info", - "__pycache__", - ".git", - ".venv", -] +extend-exclude = ["third_party"] + [tool.ruff.lint] select = [ @@ -129,21 +120,12 @@ ignore = [ "RUF012", # mutable class attributes should be annotated with typing.ClassVar ] -[tool.ruff.lint.per-file-ignores] -"test/**/*.py" = ["E402"] # module level import not at top of file (common in tests) -"examples/**/*.py" = ["E402"] # module level import not at top of file (common in examples) - [tool.ruff.format] quote-style = "double" indent-style = "space" skip-magic-trailing-comma = false line-ending = "auto" -[dependency-groups] -dev = [ - "ruff>=0.12.4", -] - [tool.lychee] accept = ["200", "403", "429", "503"] timeout = 20