fix: resolve all ruff linting errors and add lint CI check

- Fix ambiguous fullwidth characters (commas, parentheses) in strings and comments
- Replace Chinese comments with English equivalents
- Fix unused imports with proper noqa annotations for intentional imports
- Fix bare except clauses with specific exception types
- Fix redefined variables and undefined names
- Add ruff noqa annotations for generated protobuf files
- Add lint and format check to GitHub Actions CI pipeline
2025-07-26 22:35:12 -07:00
parent 8537a6b17e
commit b3e9ee96fa
53 changed files with 5655 additions and 5220 deletions
--- a/examples/run_evaluation.py
+++ b/examples/run_evaluation.py
@@ -5,24 +5,21 @@ It correctly compares results by fetching the text content for both the new sear
 results and the golden standard results, making the comparison robust to ID changes.
 """

-import json
 import argparse
+import json
+import sys
 import time
 from pathlib import Path
-import sys
-import numpy as np
-from typing import List

-from leann.api import LeannSearcher, LeannBuilder
+import numpy as np
+from leann.api import LeannBuilder, LeannSearcher


 def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
    """Checks if the data directory exists, and if not, downloads it from HF Hub."""
    if not data_root.exists():
        print(f"Data directory '{data_root}' not found.")
-        print(
-            "Downloading evaluation data from Hugging Face Hub... (this may take a moment)"
-        )
+        print("Downloading evaluation data from Hugging Face Hub... (this may take a moment)")
        try:
            from huggingface_hub import snapshot_download

@@ -63,7 +60,7 @@ def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
            sys.exit(1)


-def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):
+def download_embeddings_if_needed(data_root: Path, dataset_type: str | None = None):
    """Download embeddings files specifically."""
    embeddings_dir = data_root / "embeddings"

@@ -101,7 +98,7 @@ def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):


 # --- Helper Function to get Golden Passages ---
-def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
+def get_golden_texts(searcher: LeannSearcher, golden_ids: list[int]) -> set:
    """
    Retrieves the text for golden passage IDs directly from the LeannSearcher's
    passage manager.
@@ -113,24 +110,20 @@ def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
            passage_data = searcher.passage_manager.get_passage(str(gid))
            golden_texts.add(passage_data["text"])
        except KeyError:
-            print(
-                f"Warning: Golden passage ID '{gid}' not found in the index's passage data."
-            )
+            print(f"Warning: Golden passage ID '{gid}' not found in the index's passage data.")
    return golden_texts


-def load_queries(file_path: Path) -> List[str]:
+def load_queries(file_path: Path) -> list[str]:
    queries = []
-    with open(file_path, "r", encoding="utf-8") as f:
+    with open(file_path, encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            queries.append(data["query"])
    return queries


-def build_index_from_embeddings(
-    embeddings_file: str, output_path: str, backend: str = "hnsw"
-):
+def build_index_from_embeddings(embeddings_file: str, output_path: str, backend: str = "hnsw"):
    """
    Build a LEANN index from pre-computed embeddings.

@@ -173,9 +166,7 @@ def build_index_from_embeddings(


 def main():
-    parser = argparse.ArgumentParser(
-        description="Run recall evaluation on a LEANN index."
-    )
+    parser = argparse.ArgumentParser(description="Run recall evaluation on a LEANN index.")
    parser.add_argument(
        "index_path",
        type=str,
@@ -202,9 +193,7 @@ def main():
    parser.add_argument(
        "--num-queries", type=int, default=10, help="Number of queries to evaluate."
    )
-    parser.add_argument(
-        "--top-k", type=int, default=3, help="The 'k' value for recall@k."
-    )
+    parser.add_argument("--top-k", type=int, default=3, help="The 'k' value for recall@k.")
    parser.add_argument(
        "--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW."
    )
@@ -219,9 +208,7 @@ def main():
    # Download data based on mode
    if args.mode == "build":
        # For building mode, we need embeddings
-        download_data_if_needed(
-            data_root, download_embeddings=False
-        )  # Basic data first
+        download_data_if_needed(data_root, download_embeddings=False)  # Basic data first

        # Auto-detect dataset type and download embeddings
        if args.embeddings_file:
@@ -262,9 +249,7 @@ def main():
        print(f"Index built successfully: {built_index_path}")

        # Ask if user wants to run evaluation
-        eval_response = (
-            input("Run evaluation on the built index? (y/n): ").strip().lower()
-        )
+        eval_response = input("Run evaluation on the built index? (y/n): ").strip().lower()
        if eval_response != "y":
            print("Index building complete. Exiting.")
            return
@@ -293,12 +278,8 @@ def main():
                        break

            if not args.index_path:
-                print(
-                    "No indices found. The data download should have included pre-built indices."
-                )
-                print(
-                    "Please check the data/indices/ directory or provide --index-path manually."
-                )
+                print("No indices found. The data download should have included pre-built indices.")
+                print("Please check the data/indices/ directory or provide --index-path manually.")
                sys.exit(1)

    # Detect dataset type from index path to select the correct ground truth
@@ -310,14 +291,10 @@ def main():
    else:
        # Fallback: try to infer from the index directory name
        dataset_type = Path(args.index_path).name
-        print(
-            f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'."
-        )
+        print(f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'.")

    queries_file = data_root / "queries" / "nq_open.jsonl"
-    golden_results_file = (
-        data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
-    )
+    golden_results_file = data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"

    print(f"INFO: Detected dataset type: {dataset_type}")
    print(f"INFO: Using queries file: {queries_file}")
@@ -327,7 +304,7 @@ def main():
        searcher = LeannSearcher(args.index_path)
        queries = load_queries(queries_file)

-        with open(golden_results_file, "r") as f:
+        with open(golden_results_file) as f:
            golden_results_data = json.load(f)

        num_eval_queries = min(args.num_queries, len(queries))
@@ -339,9 +316,7 @@ def main():

        for i in range(num_eval_queries):
            start_time = time.time()
-            new_results = searcher.search(
-                queries[i], top_k=args.top_k, ef=args.ef_search
-            )
+            new_results = searcher.search(queries[i], top_k=args.top_k, ef=args.ef_search)
            search_times.append(time.time() - start_time)

            # Correct Recall Calculation: Based on TEXT content