diff --git a/README.md b/README.md index 33f18d8..90c0b33 100755 --- a/README.md +++ b/README.md @@ -656,6 +656,19 @@ results = searcher.search( 📖 **[Complete Metadata filtering guide →](docs/metadata_filtering.md)** +### 🔍 Grep Search + +For exact text matching instead of semantic search, use the `use_grep` parameter: + +```python +# Exact text search +results = searcher.search("banana‑crocodile", use_grep=True, top_k=1) +``` + +**Use cases**: Finding specific code patterns, error messages, function names, or exact phrases where semantic similarity isn't needed. + +📖 **[Complete grep search guide →](docs/grep_search.md)** + ## 🏗️ Architecture & How It Works

diff --git a/docs/grep_search.md b/docs/grep_search.md new file mode 100644 index 0000000..02ab6df --- /dev/null +++ b/docs/grep_search.md @@ -0,0 +1,149 @@ +# LEANN Grep Search Usage Guide + +## Overview + +LEANN's grep search functionality provides exact text matching for finding specific code patterns, error messages, function names, or exact phrases in your indexed documents. + +## Basic Usage + +### Simple Grep Search + +```python +from leann.api import LeannSearcher + +searcher = LeannSearcher("your_index_path") + +# Exact text search +results = searcher.search("def authenticate_user", use_grep=True, top_k=5) + +for result in results: + print(f"Score: {result.score}") + print(f"Text: {result.text[:100]}...") + print("-" * 40) +``` + +### Comparison: Semantic vs Grep Search + +```python +# Semantic search - finds conceptually similar content +semantic_results = searcher.search("machine learning algorithms", top_k=3) + +# Grep search - finds exact text matches +grep_results = searcher.search("def train_model", use_grep=True, top_k=3) +``` + +## When to Use Grep Search + +### Use Cases + +- **Code Search**: Finding specific function definitions, class names, or variable references +- **Error Debugging**: Locating exact error messages or stack traces +- **Documentation**: Finding specific API endpoints or exact terminology + +### Examples + +```python +# Find function definitions +functions = searcher.search("def __init__", use_grep=True) + +# Find import statements +imports = searcher.search("from sklearn import", use_grep=True) + +# Find specific error types +errors = searcher.search("FileNotFoundError", use_grep=True) + +# Find TODO comments +todos = searcher.search("TODO:", use_grep=True) + +# Find configuration entries +configs = searcher.search("server_port=", use_grep=True) +``` + +## Technical Details + +### How It Works + +1. **File Location**: Grep search operates on the raw text stored in `.jsonl` files +2. 
**Command Execution**: Uses the system `grep` command with case-insensitive search +3. **Result Processing**: Parses JSON lines and extracts text and metadata +4. **Scoring**: Simple frequency-based scoring from query term occurrences + +### Search Process + +``` +Query: "def train_model" + ↓ +grep -i -n "def train_model" documents.leann.passages.jsonl + ↓ +Parse matching JSON lines + ↓ +Calculate scores based on term frequency + ↓ +Return top_k results +``` + +### Scoring Algorithm + +```python +# Term frequency in document +score = text.lower().count(query.lower()) +``` + +Results are ranked by score (highest first), with higher scores indicating more occurrences of the search term. + +## Error Handling + +### Common Issues + +#### Grep Command Not Found +``` +RuntimeError: grep command not found. Please install grep or use semantic search. +``` + +**Solution**: Install grep on your system: +- **Ubuntu/Debian**: `sudo apt-get install grep` +- **macOS**: grep is pre-installed +- **Windows**: Use WSL or install grep via Git Bash/MSYS2 + +#### No Results Found +```python +# Check if your query exists in the raw data +results = searcher.search("your_query", use_grep=True) +if not results: + print("No exact matches found. Try:") + print("1. Check spelling (matching is case-insensitive)") + print("2. Use partial terms") + print("3. Switch to semantic search") +``` + +## Complete Example + +```python +#!/usr/bin/env python3 +""" +Grep Search Example +Demonstrates grep search for exact text matching. +""" + +from leann.api import LeannSearcher + +def demonstrate_grep_search(): + # Initialize searcher + searcher = LeannSearcher("my_index") + + print("=== Function Search ===") + functions = searcher.search("def __init__", use_grep=True, top_k=5) + for i, result in enumerate(functions, 1): + print(f"{i}. 
Score: {result.score}") + print(f" Preview: {result.text[:60]}...") + print() + + print("=== Error Search ===") + errors = searcher.search("FileNotFoundError", use_grep=True, top_k=3) + for result in errors: + print(f"Content: {result.text.strip()}") + print("-" * 40) + +if __name__ == "__main__": + demonstrate_grep_search() +``` diff --git a/examples/grep_search_example.py b/examples/grep_search_example.py new file mode 100644 index 0000000..71723ab --- /dev/null +++ b/examples/grep_search_example.py @@ -0,0 +1,35 @@ +""" +Grep Search Example + +Shows how to use grep-based text search instead of semantic search. +Useful when you need exact text matches rather than meaning-based results. +""" + +from leann import LeannSearcher + +# Load your index +searcher = LeannSearcher("my-documents.leann") + +# Regular semantic search +print("=== Semantic Search ===") +results = searcher.search("machine learning algorithms", top_k=3) +for result in results: + print(f"Score: {result.score:.3f}") + print(f"Text: {result.text[:80]}...") + print() + +# Grep-based search for exact text matches +print("=== Grep Search ===") +results = searcher.search("def train_model", top_k=3, use_grep=True) +for result in results: + print(f"Score: {result.score}") + print(f"Text: {result.text[:80]}...") + print() + +# Find specific error messages +error_results = searcher.search("FileNotFoundError", use_grep=True) +print(f"Found {len(error_results)} files mentioning FileNotFoundError") + +# Search for function definitions +func_results = searcher.search("class SearchResult", use_grep=True, top_k=5) +print(f"Found {len(func_results)} class definitions") diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 28b49da..653f573 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -6,6 +6,8 @@ with the correct, original embedding logic from the user's reference code. 
import json import logging import pickle +import re +import subprocess import time import warnings from dataclasses import dataclass, field @@ -653,6 +655,7 @@ class LeannSearcher: expected_zmq_port: int = 5557, metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None, batch_size: int = 0, + use_grep: bool = False, **kwargs, ) -> list[SearchResult]: """ @@ -679,6 +682,10 @@ class LeannSearcher: Returns: List of SearchResult objects with text, metadata, and similarity scores """ + # Handle grep search + if use_grep: + return self._grep_search(query, top_k) + logger.info("πŸ” LeannSearcher.search() called:") logger.info(f" Query: '{query}'") logger.info(f" Top_k: {top_k}") @@ -795,9 +802,96 @@ class LeannSearcher: logger.info(f" {GREEN}βœ“ Final enriched results: {len(enriched_results)} passages{RESET}") return enriched_results + def _find_jsonl_file(self) -> Optional[str]: + """Return the path of the raw passages .jsonl next to the index, or None if absent""" + index_path = Path(self.meta_path_str).parent + potential_files = [ + index_path / "documents.leann.passages.jsonl", + index_path.parent / "documents.leann.passages.jsonl", + ] + + for file_path in potential_files: + if file_path.exists(): + return str(file_path) + return None + + def _grep_search(self, query: str, top_k: int = 5) -> list[SearchResult]: + """Case-insensitive literal grep over raw passages, ranked by match count in the text""" + jsonl_file = self._find_jsonl_file() + if not jsonl_file: + raise FileNotFoundError("No .jsonl passages file found for grep search") + + try: + cmd = ["grep", "-i", "-n", "-F", "-e", query, "--", jsonl_file] + result = subprocess.run(cmd, capture_output=True, encoding="utf-8", check=False) + + if result.returncode == 1: + return [] + elif result.returncode != 0: + raise RuntimeError(f"Grep failed: {result.stderr}") + + matches = [] + for line in result.stdout.strip().split("\n"): + if not line: + continue + parts = line.split(":", 1) + if len(parts) != 2: + continue + + try: + data = json.loads(parts[1]) + text = 
data.get("text", "") + score = text.lower().count(query.lower()) + + matches.append( + SearchResult( + id=data.get("id", parts[0]), + text=text, + metadata=data.get("metadata", {}), + score=float(score), + ) + ) + except json.JSONDecodeError: + continue + + matches.sort(key=lambda x: x.score, reverse=True) + return matches[:top_k] + + except FileNotFoundError as exc: + raise RuntimeError( + "grep command not found. Please install grep or use semantic search." + ) from exc + + def _python_regex_search(self, query: str, top_k: int = 5) -> list[SearchResult]: + """Case-insensitive literal search over raw passages in pure Python (no grep needed)""" + jsonl_file = self._find_jsonl_file() + if not jsonl_file: + raise FileNotFoundError("No .jsonl file found") + + pattern = re.compile(re.escape(query), re.IGNORECASE) + matches = [] + + with open(jsonl_file, encoding="utf-8") as f: + for line_num, line in enumerate(f, 1): + if pattern.search(line): + try: + data = json.loads(line.strip()) + matches.append( + SearchResult( + id=data.get("id", str(line_num)), + text=data.get("text", ""), + metadata=data.get("metadata", {}), + score=float(len(pattern.findall(data.get("text", "")))), + ) + ) + except json.JSONDecodeError: + continue + + matches.sort(key=lambda x: x.score, reverse=True) + return matches[:top_k] + def cleanup(self): """Explicitly cleanup embedding server resources. - This method should be called after you're done using the searcher, especially in test environments or batch processing scenarios. 
""" @@ -853,6 +947,7 @@ class LeannChat: expected_zmq_port: int = 5557, metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None, batch_size: int = 0, + use_grep: bool = False, **search_kwargs, ): if llm_kwargs is None: diff --git a/uv.lock b/uv.lock index 430932a..9bb8f17 100644 --- a/uv.lock +++ b/uv.lock @@ -1564,7 +1564,7 @@ name = "importlib-metadata" version = "8.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } wheels = [ @@ -2117,7 +2117,7 @@ wheels = [ [[package]] name = "leann-backend-diskann" -version = "0.3.2" +version = "0.3.3" source = { editable = "packages/leann-backend-diskann" } dependencies = [ { name = "leann-core" }, @@ -2129,14 +2129,14 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.3.2" }, + { name = "leann-core", specifier = "==0.3.3" }, { name = "numpy" }, { name = "protobuf", specifier = ">=3.19.0" }, ] [[package]] name = "leann-backend-hnsw" -version = "0.3.2" +version = "0.3.3" source = { editable = "packages/leann-backend-hnsw" } dependencies = [ { name = "leann-core" }, @@ -2149,7 +2149,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.3.2" }, + { name = "leann-core", specifier = "==0.3.3" }, { name = "msgpack", specifier = ">=1.0.0" }, { name = "numpy" }, { name = "pyzmq", specifier = ">=23.0.0" }, @@ -2157,7 +2157,7 @@ requires-dist = [ [[package]] name = "leann-core" -version = "0.3.2" +version = "0.3.3" source = { editable = "packages/leann-core" } dependencies = [ { name = "accelerate" },