🔗 Auto-register apps: Universal index discovery (#64)

* feat: Enhance CLI with improved list and smart remove commands

##  New Features

### 🏠 Enhanced `leann list` command
- **Better UX**: Current project shown first with clear separation
- **Visual improvements**: Icons (🏠/📂), better formatting, size info
- **Smart guidance**: Context-aware usage examples and getting started tips

### 🛡️ Smart `leann remove` command
- **Safety first**: Always shows ALL matching indexes across projects
- **Intelligent handling**:
  - Single match: Clear location display with cross-project warnings
  - Multiple matches: Interactive selection with final confirmation
- **Prevents accidents**: No more deleting wrong indexes due to name conflicts
- **User-friendly**: 'c' to cancel, clear visual hierarchy, detailed info

### 🔧 Technical improvements
- **Clean logging**: Hide debug messages for better CLI experience
- **Comprehensive search**: Always scan all projects for transparency
- **Error handling**: Graceful handling of edge cases and user input

## 🎯 Impact
- **Safer**: Eliminates risk of accidental index deletion
- **Clearer**: Users always know what they're operating on
- **Smarter**: Automatic detection and handling of common scenarios

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

* chore: vscode ruff, and format

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Andy Lee
2025-08-16 11:50:25 -07:00
committed by GitHub
parent da6540decd
commit 838ade231e
10 changed files with 200 additions and 109 deletions

1
.gitignore vendored
View File

@@ -18,6 +18,7 @@ demo/experiment_results/**/*.json
*.eml *.eml
*.emlx *.emlx
*.json *.json
!.vscode/*.json
*.sh *.sh
*.txt *.txt
!CMakeLists.txt !CMakeLists.txt

5
.vscode/extensions.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"recommendations": [
"charliermarsh.ruff",
]
}

22
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,22 @@
{
"python.defaultInterpreterPath": ".venv/bin/python",
"python.terminal.activateEnvironment": true,
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
},
"editor.insertSpaces": true,
"editor.tabSize": 4
},
"ruff.enable": true,
"files.watcherExclude": {
"**/.venv/**": true,
"**/__pycache__/**": true,
"**/*.egg-info/**": true,
"**/build/**": true,
"**/dist/**": true
}
}

View File

@@ -542,8 +542,10 @@ Options:
leann list leann list
# Lists all indexes across all projects with status indicators: # Lists all indexes across all projects with status indicators:
# ✅ - Index is complete and ready to use # ✅ - Index is complete and ready to use
# ❌ - Index is incomplete or corrupted # ❌ - Index is incomplete or corrupted
# 📁 - CLI-created index (in .leann/indexes/)
# 📄 - App-created index (*.leann.meta.json files)
``` ```
**Remove Command:** **Remove Command:**
@@ -557,6 +559,7 @@ Options:
# - Shows all matching indexes across projects # - Shows all matching indexes across projects
# - Requires confirmation for cross-project removal # - Requires confirmation for cross-project removal
# - Interactive selection when multiple matches found # - Interactive selection when multiple matches found
# - Supports both CLI and app-created indexes
``` ```
</details> </details>

View File

@@ -10,6 +10,7 @@ from typing import Any
import dotenv import dotenv
from leann.api import LeannBuilder, LeannChat from leann.api import LeannBuilder, LeannChat
from leann.registry import register_project_directory
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
dotenv.load_dotenv() dotenv.load_dotenv()
@@ -214,6 +215,11 @@ class BaseRAGExample(ABC):
builder.build_index(index_path) builder.build_index(index_path)
print(f"Index saved to: {index_path}") print(f"Index saved to: {index_path}")
# Register project directory so leann list can discover this index
# The index is saved as args.index_dir/index_name.leann
# We want to register the current working directory where the app is run
register_project_directory(Path.cwd())
return index_path return index_path
async def run_interactive_chat(self, args, index_path: str): async def run_interactive_chat(self, args, index_path: str):

View File

@@ -614,7 +614,7 @@ class LeannSearcher:
zmq_port=zmq_port, zmq_port=zmq_port,
) )
# logger.info(f" Generated embedding shape: {query_embedding.shape}") # logger.info(f" Generated embedding shape: {query_embedding.shape}")
time.time() - start_time # time.time() - start_time
# logger.info(f" Embedding time: {embedding_time} seconds") # logger.info(f" Embedding time: {embedding_time} seconds")
start_time = time.time() start_time = time.time()
@@ -680,8 +680,9 @@ class LeannSearcher:
This method should be called after you're done using the searcher, This method should be called after you're done using the searcher,
especially in test environments or batch processing scenarios. especially in test environments or batch processing scenarios.
""" """
if hasattr(self.backend_impl, "embedding_server_manager"): backend = getattr(self.backend_impl, "embedding_server_manager", None)
self.backend_impl.embedding_server_manager.stop_server() if backend is not None:
backend.stop_server()
# Enable automatic cleanup patterns # Enable automatic cleanup patterns
def __enter__(self): def __enter__(self):

View File

@@ -707,20 +707,28 @@ class GeminiChat(LLMInterface):
logger.info(f"Sending request to Gemini with model {self.model}") logger.info(f"Sending request to Gemini with model {self.model}")
try: try:
# Set generation configuration from google.genai.types import GenerateContentConfig
generation_config = {
"temperature": kwargs.get("temperature", 0.7), generation_config = GenerateContentConfig(
"max_output_tokens": kwargs.get("max_tokens", 1000), temperature=kwargs.get("temperature", 0.7),
} max_output_tokens=kwargs.get("max_tokens", 1000),
)
# Handle top_p parameter # Handle top_p parameter
if "top_p" in kwargs: if "top_p" in kwargs:
generation_config["top_p"] = kwargs["top_p"] generation_config.top_p = kwargs["top_p"]
response = self.client.models.generate_content( response = self.client.models.generate_content(
model=self.model, contents=prompt, config=generation_config model=self.model,
contents=prompt,
config=generation_config,
) )
return response.text.strip() # Handle potential None response text
response_text = response.text
if response_text is None:
logger.warning("Gemini returned None response text")
return ""
return response_text.strip()
except Exception as e: except Exception as e:
logger.error(f"Error communicating with Gemini: {e}") logger.error(f"Error communicating with Gemini: {e}")
return f"Error: Could not get a response from Gemini. Details: {e}" return f"Error: Could not get a response from Gemini. Details: {e}"

View File

@@ -1,13 +1,14 @@
import argparse import argparse
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from typing import Union from typing import Optional, Union
from llama_index.core import SimpleDirectoryReader from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
from tqdm import tqdm from tqdm import tqdm
from .api import LeannBuilder, LeannChat, LeannSearcher from .api import LeannBuilder, LeannChat, LeannSearcher
from .registry import register_project_directory
def extract_pdf_text_with_pymupdf(file_path: str) -> str: def extract_pdf_text_with_pymupdf(file_path: str) -> str:
@@ -263,31 +264,7 @@ Examples:
def register_project_dir(self): def register_project_dir(self):
"""Register current project directory in global registry""" """Register current project directory in global registry"""
global_registry = Path.home() / ".leann" / "projects.json" register_project_directory()
global_registry.parent.mkdir(exist_ok=True)
current_dir = str(Path.cwd())
# Load existing registry
projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
projects = json.load(f)
except Exception:
projects = []
# Add current directory if not already present
if current_dir not in projects:
projects.append(current_dir)
# Save registry
import json
with open(global_registry, "w") as f:
json.dump(projects, f, indent=2)
def _build_gitignore_parser(self, docs_dir: str): def _build_gitignore_parser(self, docs_dir: str):
"""Build gitignore parser using gitignore-parser library.""" """Build gitignore parser using gitignore-parser library."""
@@ -373,13 +350,10 @@ Examples:
valid_projects.append(current_path) valid_projects.append(current_path)
# Separate current and other projects # Separate current and other projects
current_project = None
other_projects = [] other_projects = []
for project_path in valid_projects: for project_path in valid_projects:
if project_path == current_path: if project_path != current_path:
current_project = project_path
else:
other_projects.append(project_path) other_projects.append(project_path)
print("📚 LEANN Indexes") print("📚 LEANN Indexes")
@@ -389,35 +363,20 @@ Examples:
current_indexes_count = 0 current_indexes_count = 0
# Show current project first (most important) # Show current project first (most important)
if current_project: print("\n🏠 Current Project")
current_indexes_dir = current_project / ".leann" / "indexes" print(f" {current_path}")
if current_indexes_dir.exists(): print(" " + "" * 45)
current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
print("\n🏠 Current Project") current_indexes = self._discover_indexes_in_project(current_path)
print(f" {current_project}") if current_indexes:
print(" " + "" * 45) for idx in current_indexes:
total_indexes += 1
if current_index_dirs: current_indexes_count += 1
for index_dir in current_index_dirs: type_icon = "📁" if idx["type"] == "cli" else "📄"
total_indexes += 1 print(f" {current_indexes_count}. {type_icon} {idx['name']} {idx['status']}")
current_indexes_count += 1 if idx["size_mb"] > 0:
index_name = index_dir.name print(f" 📦 Size: {idx['size_mb']:.1f} MB")
meta_file = index_dir / "documents.leann.meta.json"
status = "" if meta_file.exists() else ""
print(f" {current_indexes_count}. {index_name} {status}")
if meta_file.exists():
size_mb = sum(
f.stat().st_size for f in index_dir.iterdir() if f.is_file()
) / (1024 * 1024)
print(f" 📦 Size: {size_mb:.1f} MB")
else:
print(" 📭 No indexes in current project")
else: else:
print("\n🏠 Current Project")
print(f" {current_path}")
print(" " + "" * 45)
print(" 📭 No indexes in current project") print(" 📭 No indexes in current project")
# Show other projects (reference information) # Show other projects (reference information)
@@ -426,29 +385,19 @@ Examples:
print(" " + "" * 45) print(" " + "" * 45)
for project_path in other_projects: for project_path in other_projects:
indexes_dir = project_path / ".leann" / "indexes" project_indexes = self._discover_indexes_in_project(project_path)
if not indexes_dir.exists(): if not project_indexes:
continue
index_dirs = [d for d in indexes_dir.iterdir() if d.is_dir()]
if not index_dirs:
continue continue
print(f"\n 📂 {project_path.name}") print(f"\n 📂 {project_path.name}")
print(f" {project_path}") print(f" {project_path}")
for index_dir in index_dirs: for idx in project_indexes:
total_indexes += 1 total_indexes += 1
index_name = index_dir.name type_icon = "📁" if idx["type"] == "cli" else "📄"
meta_file = index_dir / "documents.leann.meta.json" print(f"{type_icon} {idx['name']} {idx['status']}")
status = "" if meta_file.exists() else "" if idx["size_mb"] > 0:
print(f" 📦 {idx['size_mb']:.1f} MB")
print(f"{index_name} {status}")
if meta_file.exists():
size_mb = sum(
f.stat().st_size for f in index_dir.iterdir() if f.is_file()
) / (1024 * 1024)
print(f" 📦 {size_mb:.1f} MB")
# Summary and usage info # Summary and usage info
print("\n" + "=" * 50) print("\n" + "=" * 50)
@@ -480,6 +429,67 @@ Examples:
print("\n💡 Create your first index:") print("\n💡 Create your first index:")
print(" leann build my-docs --docs ./documents") print(" leann build my-docs --docs ./documents")
def _discover_indexes_in_project(self, project_path: Path):
    """Discover all indexes in a project directory (both CLI and apps formats).

    Two on-disk layouts are recognised:

    * CLI format: one sub-directory per index under ``.leann/indexes/``. The
      index is reported as complete only when it contains
      ``documents.leann.meta.json``.
    * Apps format: any ``<name>.leann.meta.json`` file elsewhere in the
      project tree.

    Args:
        project_path: Root directory of the project to scan.

    Returns:
        A list of dicts with the keys ``name``, ``type`` (``"cli"`` or
        ``"app"``), ``status`` (status icon), ``size_mb`` (0 when the size is
        unknown) and ``path`` (the index directory for CLI indexes, the meta
        file for app indexes).
    """
    meta_suffix = ".leann.meta.json"
    bytes_per_mb = 1024 * 1024
    indexes = []

    # 1. CLI format: .leann/indexes/index_name/
    cli_indexes_dir = project_path / ".leann" / "indexes"
    # is_dir() rather than exists(): iterdir() would raise if this were a file.
    if cli_indexes_dir.is_dir():
        for index_dir in cli_indexes_dir.iterdir():
            if not index_dir.is_dir():
                continue

            is_complete = (index_dir / f"documents{meta_suffix}").exists()
            size_mb = 0
            if is_complete:
                try:
                    size_mb = (
                        sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file())
                        / bytes_per_mb
                    )
                except OSError:
                    # Size is informational only; an unreadable file must not
                    # hide the index from the listing.
                    size_mb = 0

            indexes.append(
                {
                    "name": index_dir.name,
                    "type": "cli",
                    "status": "✅" if is_complete else "❌",
                    "size_mb": size_mb,
                    "path": index_dir,
                }
            )

    # 2. Apps format: *.leann.meta.json files anywhere in the project
    for meta_file in project_path.rglob(f"*{meta_suffix}"):
        if not meta_file.is_file():
            continue

        # The recursive scan also reaches .leann/indexes/<name>/documents.leann.meta.json.
        # Those belong to CLI indexes already reported above, so skip them
        # instead of listing each one a second time as an app index.
        if cli_indexes_dir in meta_file.parents:
            continue

        # Strip the suffix from the end only (str.replace would also remove
        # an occurrence in the middle of the file name).
        index_name = meta_file.name[: -len(meta_suffix)]

        # Calculate total size of all files that share the index's name prefix
        size_mb = 0
        try:
            for related_file in meta_file.parent.glob(f"{index_name}.leann*"):
                size_mb += related_file.stat().st_size / bytes_per_mb
        except OSError:
            # Keep whatever was summed so far; size is informational only.
            pass

        indexes.append(
            {
                "name": index_name,
                "type": "app",
                # Apps indexes are considered complete if the meta file exists
                "status": "✅",
                "size_mb": size_mb,
                "path": meta_file,
            }
        )

    return indexes
def remove_index(self, index_name: str, force: bool = False): def remove_index(self, index_name: str, force: bool = False):
"""Safely remove an index - always show all matches for transparency""" """Safely remove an index - always show all matches for transparency"""
@@ -637,7 +647,7 @@ Examples:
return False return False
def _delete_index_directory( def _delete_index_directory(
self, index_dir: Path, index_name: str, project_path: Path | None = None self, index_dir: Path, index_name: str, project_path: Optional[Path] = None
): ):
"""Actually delete the index directory""" """Actually delete the index directory"""
try: try:

View File

@@ -2,8 +2,10 @@
import importlib import importlib
import importlib.metadata import importlib.metadata
import json
import logging import logging
from typing import TYPE_CHECKING from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING: if TYPE_CHECKING:
from leann.interface import LeannBackendFactoryInterface from leann.interface import LeannBackendFactoryInterface
@@ -43,3 +45,54 @@ def autodiscover_backends():
# print(f"WARN: Could not import backend module '{backend_module_name}': {e}") # print(f"WARN: Could not import backend module '{backend_module_name}': {e}")
pass pass
# print("INFO: Backend auto-discovery finished.") # print("INFO: Backend auto-discovery finished.")
def register_project_directory(project_dir: Optional[Union[str, Path]] = None):
    """
    Register a project directory in the global LEANN registry.

    This allows `leann list` to discover indexes created by apps or other tools.
    The registry is a JSON list of absolute directory paths stored in
    ``~/.leann/projects.json``. Registration is best-effort: an unreadable,
    malformed or unwritable registry is logged and never raised to the caller.

    Args:
        project_dir: Directory to register. If None, uses current working directory.
    """
    project_dir = Path.cwd() if project_dir is None else Path(project_dir)

    # Only register directories that have some kind of LEANN content:
    # either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format).
    # The recursive scan is skipped when the cheap CLI check already succeeded.
    has_cli_indexes = (project_dir / ".leann" / "indexes").exists()
    if not has_cli_indexes and not any(project_dir.rglob("*.leann.meta.json")):
        # Don't register if there are no LEANN indexes
        return

    global_registry = Path.home() / ".leann" / "projects.json"
    project_str = str(project_dir.resolve())

    # Load existing registry
    projects = []
    if global_registry.exists():
        try:
            with open(global_registry, encoding="utf-8") as f:
                projects = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both invalid JSON and undecodable bytes.
            logger.debug("Could not load existing project registry")
            projects = []
        if not isinstance(projects, list):
            # Valid JSON of the wrong shape (e.g. an object) cannot be appended
            # to; start over rather than crash or corrupt it further.
            logger.debug("Project registry is not a list; starting a new one")
            projects = []

    # Nothing to write if the project is already present
    if project_str in projects:
        return

    projects.append(project_str)

    # Save updated registry
    try:
        global_registry.parent.mkdir(parents=True, exist_ok=True)
        with open(global_registry, "w", encoding="utf-8") as f:
            json.dump(projects, f, indent=2)
        logger.debug(f"Registered project directory: {project_str}")
    except OSError as e:
        logger.warning(f"Could not save project registry: {e}")

View File

@@ -14,8 +14,6 @@ dependencies = [
"numpy>=1.26.0", "numpy>=1.26.0",
"torch", "torch",
"tqdm", "tqdm",
"flask",
"flask_compress",
"datasets>=2.15.0", "datasets>=2.15.0",
"evaluate", "evaluate",
"colorama", "colorama",
@@ -66,9 +64,7 @@ test = [
"pytest>=7.0", "pytest>=7.0",
"pytest-timeout>=2.0", "pytest-timeout>=2.0",
"llama-index-core>=0.12.0", "llama-index-core>=0.12.0",
"llama-index-readers-file>=0.4.0",
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"sentence-transformers>=2.2.0",
] ]
diskann = [ diskann = [
@@ -100,13 +96,8 @@ leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
[tool.ruff] [tool.ruff]
target-version = "py39" target-version = "py39"
line-length = 100 line-length = 100
extend-exclude = [ extend-exclude = ["third_party"]
"third_party",
"*.egg-info",
"__pycache__",
".git",
".venv",
]
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [
@@ -129,21 +120,12 @@ ignore = [
"RUF012", # mutable class attributes should be annotated with typing.ClassVar "RUF012", # mutable class attributes should be annotated with typing.ClassVar
] ]
[tool.ruff.lint.per-file-ignores]
"test/**/*.py" = ["E402"] # module level import not at top of file (common in tests)
"examples/**/*.py" = ["E402"] # module level import not at top of file (common in examples)
[tool.ruff.format] [tool.ruff.format]
quote-style = "double" quote-style = "double"
indent-style = "space" indent-style = "space"
skip-magic-trailing-comma = false skip-magic-trailing-comma = false
line-ending = "auto" line-ending = "auto"
[dependency-groups]
dev = [
"ruff>=0.12.4",
]
[tool.lychee] [tool.lychee]
accept = ["200", "403", "429", "503"] accept = ["200", "403", "429", "503"]
timeout = 20 timeout = 20