Add grep search functionality - Issue #86 (#87)

* Add grep search functionality to LeannSearcher

- Add use_grep parameter to search method
- Implement grep-based search on .jsonl files
- Add fallback Python regex search
- Support same SearchResult format as semantic search

Addresses issue #86

* fix: resolve linting errors

* docs: add grep search example

* docs: add grep search to README examples

* refactor: remove regex fallback, move grep example to features section

* docs: add grep search to Advanced Features with comprehensive guide
Author: AWS Mcleod
Date: 2025-09-06 08:48:07 +12:00
Committed by: GitHub
Parent: b0b353d279
Commit: 0d9f92ea0f

5 changed files with 299 additions and 7 deletions
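
Based on the parameters this commit adds, a minimal usage sketch of the grep path (the import path, constructor argument, and index location below are assumptions for illustration, not taken from the diff):

from leann import LeannSearcher  # assumed import path

# Hypothetical index location; use_grep=True routes search() to _grep_search()
searcher = LeannSearcher("./indexes/documents.leann")

# Exact, case-insensitive keyword matching over the raw .jsonl passages,
# ranked by how many times the query string occurs in each passage
results = searcher.search("error handling", top_k=5, use_grep=True)
for r in results:
    print(f"{r.score:.0f}  {r.text[:80]}")

searcher.cleanup()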

@@ -6,6 +6,8 @@ with the correct, original embedding logic from the user's reference code.
import json
import logging
import pickle
import re
import subprocess
import time
import warnings
from dataclasses import dataclass, field
@@ -653,6 +655,7 @@ class LeannSearcher:
        expected_zmq_port: int = 5557,
        metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None,
        batch_size: int = 0,
        use_grep: bool = False,
        **kwargs,
    ) -> list[SearchResult]:
        """
@@ -679,6 +682,10 @@
        Returns:
            List of SearchResult objects with text, metadata, and similarity scores
        """
        # Handle grep search
        if use_grep:
            return self._grep_search(query, top_k)

        logger.info("🔍 LeannSearcher.search() called:")
        logger.info(f"  Query: '{query}'")
        logger.info(f"  Top_k: {top_k}")
@@ -795,9 +802,96 @@
        logger.info(f"  {GREEN}✓ Final enriched results: {len(enriched_results)} passages{RESET}")

        return enriched_results

    def _find_jsonl_file(self) -> Optional[str]:
        """Find the .jsonl file containing raw passages for grep search"""
        index_path = Path(self.meta_path_str).parent
        potential_files = [
            index_path / "documents.leann.passages.jsonl",
            index_path.parent / "documents.leann.passages.jsonl",
        ]
        for file_path in potential_files:
            if file_path.exists():
                return str(file_path)
        return None

    def _grep_search(self, query: str, top_k: int = 5) -> list[SearchResult]:
        """Perform grep-based search on raw passages"""
        jsonl_file = self._find_jsonl_file()
        if not jsonl_file:
            raise FileNotFoundError("No .jsonl passages file found for grep search")

        try:
            # Case-insensitive match with line numbers over the raw passage file
            cmd = ["grep", "-i", "-n", query, jsonl_file]
            result = subprocess.run(cmd, capture_output=True, text=True, check=False)

            # grep exits with 1 when nothing matches, >1 on real errors
            if result.returncode == 1:
                return []
            elif result.returncode != 0:
                raise RuntimeError(f"Grep failed: {result.stderr}")

            matches = []
            for line in result.stdout.strip().split("\n"):
                if not line:
                    continue
                # Each output line is "<line number>:<json passage>"
                parts = line.split(":", 1)
                if len(parts) != 2:
                    continue
                try:
                    data = json.loads(parts[1])
                    text = data.get("text", "")
                    # Rank by how often the query occurs in the passage text
                    score = text.lower().count(query.lower())
                    matches.append(
                        SearchResult(
                            id=data.get("id", parts[0]),
                            text=text,
                            metadata=data.get("metadata", {}),
                            score=float(score),
                        )
                    )
                except json.JSONDecodeError:
                    continue

            matches.sort(key=lambda x: x.score, reverse=True)
            return matches[:top_k]

        except FileNotFoundError:
            raise RuntimeError(
                "grep command not found. Please install grep or use semantic search."
            )

    def _python_regex_search(self, query: str, top_k: int = 5) -> list[SearchResult]:
        """Fallback regex search"""
        jsonl_file = self._find_jsonl_file()
        if not jsonl_file:
            raise FileNotFoundError("No .jsonl file found")

        pattern = re.compile(re.escape(query), re.IGNORECASE)
        matches = []

        with open(jsonl_file, encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                if pattern.search(line):
                    try:
                        data = json.loads(line.strip())
                        matches.append(
                            SearchResult(
                                id=data.get("id", str(line_num)),
                                text=data.get("text", ""),
                                metadata=data.get("metadata", {}),
                                score=float(len(pattern.findall(data.get("text", "")))),
                            )
                        )
                    except json.JSONDecodeError:
                        continue

        matches.sort(key=lambda x: x.score, reverse=True)
        return matches[:top_k]

    def cleanup(self):
        """Explicitly cleanup embedding server resources.

        This method should be called after you're done using the searcher,
        especially in test environments or batch processing scenarios.
        """
@@ -853,6 +947,7 @@ class LeannChat:
        expected_zmq_port: int = 5557,
        metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None,
        batch_size: int = 0,
        use_grep: bool = False,
        **search_kwargs,
    ):
        if llm_kwargs is None: