fix ruff errors and formatting

This commit is contained in:
yichuan520030910320
2025-07-27 02:22:54 -07:00
parent 383c6d8d7e
commit af1790395a
35 changed files with 166 additions and 107 deletions

View File

@@ -1 +0,0 @@

View File

@@ -16,4 +16,4 @@ wheel.packages = ["leann_backend_diskann"]
editable.mode = "redirect"
cmake.build-type = "Release"
build.verbose = true
build.tool-args = ["-j8"]
build.tool-args = ["-j8"]

View File

@@ -2,12 +2,12 @@ syntax = "proto3";
package protoembedding;
message NodeEmbeddingRequest {
repeated uint32 node_ids = 1;
message NodeEmbeddingRequest {
repeated uint32 node_ids = 1;
}
message NodeEmbeddingResponse {
bytes embeddings_data = 1; // All embedded binary data
repeated int32 dimensions = 2; // Shape [batch_size, embedding_dim]
repeated uint32 missing_ids = 3; // Missing node ids
}
}

View File

@@ -52,4 +52,4 @@ set(FAISS_BUILD_AVX512 OFF CACHE BOOL "" FORCE)
# IMPORTANT: Disable building AVX versions to speed up compilation
set(FAISS_BUILD_AVX_VERSIONS OFF CACHE BOOL "" FORCE)
add_subdirectory(third_party/faiss)
add_subdirectory(third_party/faiss)

View File

@@ -72,7 +72,11 @@ def read_vector_raw(f, element_fmt_char):
def read_numpy_vector(f, np_dtype, struct_fmt_char):
"""Reads a vector into a NumPy array."""
count = -1 # Initialize count for robust error handling
print(f" Reading vector (dtype={np_dtype}, fmt='{struct_fmt_char}')... ", end="", flush=True)
print(
f" Reading vector (dtype={np_dtype}, fmt='{struct_fmt_char}')... ",
end="",
flush=True,
)
try:
count, data_bytes = read_vector_raw(f, struct_fmt_char)
print(f"Count={count}, Bytes={len(data_bytes)}")
@@ -647,7 +651,10 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
print(f"Error: Input file not found: {input_filename}", file=sys.stderr)
return False
except MemoryError as e:
print(f"\nFatal MemoryError during conversion: {e}. Insufficient RAM.", file=sys.stderr)
print(
f"\nFatal MemoryError during conversion: {e}. Insufficient RAM.",
file=sys.stderr,
)
# Clean up potentially partially written output file?
try:
os.remove(output_filename)

View File

@@ -9,7 +9,7 @@ name = "leann-backend-hnsw"
version = "0.1.14"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
"leann-core==0.1.14",
"leann-core==0.1.14",
"numpy",
"pyzmq>=23.0.0",
"msgpack>=1.0.0",
@@ -24,4 +24,4 @@ build.tool-args = ["-j8"]
# CMake definitions to optimize compilation
[tool.scikit-build.cmake.define]
CMAKE_BUILD_PARALLEL_LEVEL = "8"
CMAKE_BUILD_PARALLEL_LEVEL = "8"

View File

@@ -46,4 +46,4 @@ colab = [
leann = "leann.cli:main"
[tool.setuptools.packages.find]
where = ["src"]
where = ["src"]

View File

@@ -245,7 +245,11 @@ def search_hf_models_fuzzy(query: str, limit: int = 10) -> list[str]:
# HF Hub's search is already fuzzy! It handles typos and partial matches
models = list_models(
search=query, filter="text-generation", sort="downloads", direction=-1, limit=limit
search=query,
filter="text-generation",
sort="downloads",
direction=-1,
limit=limit,
)
model_names = [model.id if hasattr(model, "id") else str(model) for model in models]
@@ -582,7 +586,11 @@ class HFChat(LLMInterface):
# Tokenize input
inputs = self.tokenizer(
formatted_prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048
formatted_prompt,
return_tensors="pt",
padding=True,
truncation=True,
max_length=2048,
)
# Move inputs to device

View File

@@ -37,4 +37,4 @@ For full documentation, visit [https://leann.readthedocs.io](https://leann.readt
## License
MIT License
MIT License

View File

@@ -39,4 +39,4 @@ diskann = [
Homepage = "https://github.com/yourusername/leann"
Documentation = "https://leann.readthedocs.io"
Repository = "https://github.com/yourusername/leann"
Issues = "https://github.com/yourusername/leann/issues"
Issues = "https://github.com/yourusername/leann/issues"

View File

@@ -1,6 +1,6 @@
import json
import sqlite3
import xml.etree.ElementTree as ET
import xml.etree.ElementTree as ElementTree
from pathlib import Path
from typing import Annotated
@@ -26,7 +26,7 @@ def get_safe_path(s: str) -> str:
def process_history(history: str):
if history.startswith("<?xml") or history.startswith("<msg>"):
try:
root = ET.fromstring(history)
root = ElementTree.fromstring(history)
title = root.find(".//title").text if root.find(".//title") is not None else None
quoted = (
root.find(".//refermsg/content").text
@@ -52,7 +52,8 @@ def get_message(history: dict | str):
def export_chathistory(user_id: str):
res = requests.get(
"http://localhost:48065/wechat/chatlog", params={"userId": user_id, "count": 100000}
"http://localhost:48065/wechat/chatlog",
params={"userId": user_id, "count": 100000},
).json()
for i in range(len(res["chatLogs"])):
res["chatLogs"][i]["content"] = process_history(res["chatLogs"][i]["content"])
@@ -116,7 +117,8 @@ def export_sqlite(
all_users = requests.get("http://localhost:48065/wechat/allcontacts").json()
for user in tqdm(all_users):
cursor.execute(
"INSERT OR IGNORE INTO users (id, name) VALUES (?, ?)", (user["arg"], user["title"])
"INSERT OR IGNORE INTO users (id, name) VALUES (?, ?)",
(user["arg"], user["title"]),
)
usr_chatlog = export_chathistory(user["arg"])
for msg in usr_chatlog: