From e728449b8fbcc8cd8be4164165a3efeb0942ef99 Mon Sep 17 00:00:00 2001
From: yichuan520030910320 <yichuan_wang@berkeley.edu>
Date: Sat, 19 Jul 2025 19:54:02 -0700
Subject: [PATCH] change chinese

---
 .../leann_backend_diskann/embedding_server.py | 125 +++++++++---------
 research/utils/demo_reader.cpp                |  50 +++----
 2 files changed, 89 insertions(+), 86 deletions(-)

diff --git a/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py b/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py
index 1096ae5..089ec1f 100644
--- a/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py
+++ b/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py
@@ -175,13 +175,13 @@ def create_embedding_server_thread(
     enable_warmup: bool = False,
 ):
     """
-    在当前线程中创建并运行 embedding server
-    这个函数设计为在单独的线程中调用
+    Create and run embedding server in the current thread
+    This function is designed to be called in a separate thread
     """
     logger.info(f"Initializing embedding server thread on port {zmq_port}")
     
     try:
-        # 检查端口是否已被占用
+        # Check if port is already occupied
         import socket
         def check_port(port):
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -212,11 +212,11 @@ def create_embedding_server_thread(
             cuda_available = False
             mps_available = False
         elif embedding_mode == "sentence-transformers":
-            # 初始化模型
+            # Initialize model
             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
             import torch
 
-            # 选择设备
+            # Select device
             mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
             cuda_available = torch.cuda.is_available()
             
@@ -230,11 +230,11 @@ def create_embedding_server_thread(
                 device = torch.device("cpu")
                 logger.info("Using CPU device")
             
-            # 加载模型
+            # Load model
             logger.info(f"Loading model {model_name}")
             model = AutoModel.from_pretrained(model_name).to(device).eval()
 
-            # 优化模型
+            # Optimize model
             if cuda_available or mps_available:
                 try:
                     model = model.half()
@@ -324,7 +324,7 @@ def create_embedding_server_thread(
                 print(f"Error during Protobuf ZMQ warmup: {e}")
 
         class DeviceTimer:
-            """设备计时器"""
+            """Device timer"""
             def __init__(self, name="", device=device):
                 self.name = name
                 self.device = device
@@ -369,60 +369,63 @@ def create_embedding_server_thread(
                     return self.end_time - self.start_time
 
             def print_elapsed(self):
-                print(f"Time taken for {self.name}: {self.elapsed_time():.6f} seconds")
+                elapsed = self.elapsed_time()
+                print(f"[{self.name}] Elapsed time: {elapsed:.3f}s")
 
         def process_batch_pytorch(texts_batch, ids_batch, missing_ids):
-            """处理文本批次"""
-            batch_size = len(texts_batch)
-            logger.info(f"Processing batch of size {batch_size}")
+            """Process text batch"""
+            if not texts_batch:
+                return np.array([])
 
-            tokenize_timer = DeviceTimer("tokenization (batch)", device)
-            to_device_timer = DeviceTimer("transfer to device (batch)", device)
-            embed_timer = DeviceTimer("embedding (batch)", device)
-            pool_timer = DeviceTimer("mean pooling (batch)", device)
+            # Filter out empty texts and their corresponding IDs
+            valid_texts = []
+            valid_ids = []
+            for i, text in enumerate(texts_batch):
+                if text.strip():  # Only include non-empty texts
+                    valid_texts.append(text)
+                    valid_ids.append(ids_batch[i])
 
-            with tokenize_timer.timing():
-                encoded_batch = tokenizer.batch_encode_plus(
-                    texts_batch,
-                    padding="max_length",
+            if not valid_texts:
+                print("WARNING: No valid texts in batch")
+                return np.array([])
+
+            # Tokenize
+            token_timer = DeviceTimer("tokenization")
+            with token_timer.timing():
+                inputs = tokenizer(
+                    valid_texts,
+                    padding=True,
                     truncation=True,
-                    max_length=256,
-                    return_tensors="pt",
-                    return_token_type_ids=False,
-                )
-            tokenize_timer.print_elapsed()
+                    max_length=512,
+                    return_tensors="pt"
+                ).to(device)
 
-            seq_length = encoded_batch["input_ids"].size(1)
-            print(f"Batch size: {batch_size}, Sequence length: {seq_length}")
-
-            with to_device_timer.timing():
-                enc = {k: v.to(device) for k, v in encoded_batch.items()}
-            to_device_timer.print_elapsed()
-
-            with torch.no_grad():
-                with embed_timer.timing():
-                    out = model(enc["input_ids"], enc["attention_mask"])
-                embed_timer.print_elapsed()
-
-                with pool_timer.timing():
-                    hidden_states = out.last_hidden_state if hasattr(out, "last_hidden_state") else out
-                    mask_expanded = enc["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
+            # Compute embeddings
+            embed_timer = DeviceTimer("embedding computation")
+            with embed_timer.timing():
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    hidden_states = outputs.last_hidden_state
+                    
+                    # Mean pooling
+                    attention_mask = inputs['attention_mask']
+                    mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
                     sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
                     sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
                     batch_embeddings = sum_embeddings / sum_mask
-                pool_timer.print_elapsed()
+                embed_timer.print_elapsed()
 
             return batch_embeddings.cpu().numpy()
 
-        # ZMQ server 主循环 - 修改为REP套接字
+        # ZMQ server main loop - modified to use REP socket
         context = zmq.Context()
-        socket = context.socket(zmq.ROUTER)  # 改为REP套接字
+        socket = context.socket(zmq.ROUTER)  # Changed to REP socket
         socket.bind(f"tcp://127.0.0.1:{zmq_port}")
         print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}")
 
-        # 设置超时
-        socket.setsockopt(zmq.RCVTIMEO, 5000)  # 5秒接收超时
-        socket.setsockopt(zmq.SNDTIMEO, 300000)  # 300秒发送超时
+        # Set timeouts
+        socket.setsockopt(zmq.RCVTIMEO, 5000)  # 5 second receive timeout
+        socket.setsockopt(zmq.SNDTIMEO, 300000)  # 300 second send timeout
 
         from . import embedding_pb2
 
@@ -442,18 +445,18 @@ def create_embedding_server_thread(
             try:
                 parts = socket.recv_multipart()
 
-                # --- 恢复稳健的消息格式判断 ---
-                # 必须检查 parts 的长度，避免 IndexError
+                # --- Restore robust message format detection ---
+                # Must check parts length to avoid IndexError
                 if len(parts) >= 3:
                     identity = parts[0]
-                    # empty = parts[1]  # 中间的空帧我们通常不关心
+                    # empty = parts[1]  # We usually don't care about the middle empty frame
                     message = parts[2]
                 elif len(parts) == 2:
-                    # 也能处理没有空帧的情况
+                    # Can also handle cases without empty frame
                     identity = parts[0]
                     message = parts[1]
                 else:
-                    # 如果收到格式错误的消息，打印警告并忽略它，而不是崩溃
+                    # If received message format is wrong, print warning and ignore it instead of crashing
                     print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.")
                     continue
                 print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes")
@@ -555,17 +558,17 @@ def create_embedding_server_thread(
                 e2e_start = time.time()
                 lookup_timer = DeviceTimer("text lookup")
 
-                # 解析请求
+                # Parse request
                 req_proto = embedding_pb2.NodeEmbeddingRequest()
                 req_proto.ParseFromString(message)
                 node_ids = req_proto.node_ids
                 print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}")
 
-                # 添加调试信息
+                # Add debug information
                 if len(node_ids) > 0:
                     print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}")
                 
-                # 查找文本
+                # Look up texts
                 texts = []
                 missing_ids = []
                 with lookup_timer.timing():
@@ -575,8 +578,8 @@ def create_embedding_server_thread(
                         if txt:
                             texts.append(txt)
                         else:
-                            # 如果文本为空，我们仍然需要一个占位符来进行批处理，
-                            # 但将其ID记录为缺失
+                            # If text is empty, we still need a placeholder for batch processing,
+                            # but record its ID as missing
                             texts.append("") 
                             missing_ids.append(nid)
                 lookup_timer.print_elapsed()
@@ -584,7 +587,7 @@ def create_embedding_server_thread(
                 if missing_ids:
                     print(f"WARNING: Missing passages for IDs: {missing_ids}")
 
-                # 处理批次
+                # Process batch
                 total_size = len(texts)
                 print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}")
                 
@@ -623,7 +626,7 @@ def create_embedding_server_thread(
                     else:  # sentence-transformers
                         hidden = process_batch_pytorch(texts, node_ids, missing_ids)
 
-                # 序列化响应
+                # Serialize response
                 ser_start = time.time()
 
                 resp_proto = embedding_pb2.NodeEmbeddingResponse()
@@ -635,7 +638,7 @@ def create_embedding_server_thread(
 
                 response_data = resp_proto.SerializeToString()
                 
-                # REP 套接字发送单个响应
+                # REP socket sends a single response
                 socket.send_multipart([identity, b'', response_data])
 
                 ser_end = time.time()
@@ -656,11 +659,11 @@ def create_embedding_server_thread(
             except Exception as e:
                 print(f"ERROR: Error in ZMQ server: {e}")
                 try:
-                    # 发送空响应以维持REQ-REP状态
+                    # Send empty response to maintain REQ-REP state
                     empty_resp = embedding_pb2.NodeEmbeddingResponse()
                     socket.send(empty_resp.SerializeToString())
                 except:
-                    # 如果发送失败，重新创建socket
+                    # If sending fails, recreate socket
                     socket.close()
                     socket = context.socket(zmq.REP)
                     socket.bind(f"tcp://127.0.0.1:{zmq_port}")
diff --git a/research/utils/demo_reader.cpp b/research/utils/demo_reader.cpp
index 4d7af1c..e149d64 100644
--- a/research/utils/demo_reader.cpp
+++ b/research/utils/demo_reader.cpp
@@ -23,7 +23,7 @@ g++ ./demo_reader.cpp -o ./demo_reader && ./demo_reader --stats \
   f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
 #define SECTOR_SIZE 4096
 
-// 辅助：获取文件大小
+// Helper: Get file size
 static size_t get_file_size(const std::string &fname) {
   std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
   if (ifs.fail() || !ifs.is_open()) {
@@ -32,7 +32,7 @@ static size_t get_file_size(const std::string &fname) {
   return static_cast<size_t>(ifs.tellg());
 }
 
-// 打印 sector 的前若干 hex，用于debug
+// Print first few hex of sector for debug
 static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
   size_t show_len = (len < max_len) ? len : max_len;
   for (size_t i = 0; i < show_len; i++) {
@@ -46,19 +46,19 @@ static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
 }
 
 /*
-  修正后的 demo_reader:
-  1) 从 partition.bin 读:
+  Corrected demo_reader:
+  1) Read from partition.bin:
       - C, partition_nums, nd
-      - graph_partitions[i]: 分区 i 的所有 nodeID
+      - graph_partitions[i]: all nodeIDs in partition i
       - id2partition[nodeID]: nodeID => partition i
-  2) 从 _disk_graph.index 读:
-      a) sector0 里先有 2个 int: meta_n, meta_dim
-      b) 再有 meta_n个 uint64_t
-         例如: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
-  [8]=file_size... 具体位置要结合 relayout 的写法 c) graph_node_len =
-  max_node_len - dim_in_meta*sizeof(float) 3) 用户给定 target_node_id =>
+  2) Read from _disk_graph.index:
+      a) sector0 first has 2 ints: meta_n, meta_dim
+      b) then meta_n uint64_t
+         e.g.: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
+  [8]=file_size... specific positions need to be combined with relayout writing c) graph_node_len =
+  max_node_len - dim_in_meta*sizeof(float) 3) User given target_node_id =>
       partition_id= id2partition[node_id]
-      在 graph_partitions[partition_id] 里找 node 的下标 j
+      find node index j in graph_partitions[partition_id]
       offset = (partition_id+1)*4096 => sector
       adjacency_offset= j*graph_node_len => neighbor_count => neighbors
 */
@@ -105,7 +105,7 @@ int main(int argc, char **argv) {
               << "\n";
   }
 
-  // 1) 读取 partition.bin
+  // 1) Read partition.bin
   std::ifstream pf(partition_bin, std::ios::binary);
   if (!pf.is_open()) {
     std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
@@ -119,8 +119,8 @@ int main(int argc, char **argv) {
             << ", partition_nums=" << partition_nums << ", nd=" << nd
             << std::endl;
 
-  // 读取分区节点列表
-  std::vector<std::vector<uint32_t>> graph_partitions(partition_nums);
+  // Read partition node lists
+  std::vector<std::vector<uint32_t> > graph_partitions(partition_nums);
   for (uint64_t i = 0; i < partition_nums; i++) {
     uint32_t psize;
     READ_U32(pf, psize);
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
     pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
             psize * sizeof(uint32_t));
   }
-  // 读取 _id2partition[node], 大小= nd
+  // Read _id2partition[node], size= nd
   std::vector<uint32_t> id2partition(nd);
   pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
   pf.close();
@@ -140,23 +140,23 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  // 2) 解析 _disk_graph.index
+  // 2) Parse _disk_graph.index
   std::ifstream gf(graph_index, std::ios::binary);
   if (!gf.is_open()) {
     std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
     return 1;
   }
-  // (a) sector0 => 先读 2个 int
+  // (a) sector0 => first read 2 ints
   int meta_n, meta_dim;
   gf.read((char *)&meta_n, sizeof(int));
   gf.read((char *)&meta_dim, sizeof(int));
   std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
 
-  // (b) 读 meta_n个 uint64_t
+  // (b) Read meta_n uint64_t
   std::vector<uint64_t> meta_info(meta_n);
   gf.read(reinterpret_cast<char *>(meta_info.data()),
           meta_n * sizeof(uint64_t));
-  // 打印
+  // Print
   for (int i = 0; i < meta_n; i++) {
     std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
   }
@@ -164,11 +164,11 @@ int main(int argc, char **argv) {
   size_t file_size = get_file_size(graph_index);
   std::cout << "[disk_graph.index size] " << file_size << " bytes\n";
 
-  // **根据 relayout log** 你说: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
+  // **According to relayout log** you said: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
   //    meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
   //    meta_info[4]=16 (C),
-  //    meta_info[8]= 15475261440(文件大小)
-  // 我们这里先手动解析:
+  //    meta_info[8]= 15475261440(file size)
+  // We manually parse here first:
   uint64_t nd_in_meta = meta_info[0];
   uint64_t dim_in_meta = meta_info[1];
   uint64_t max_node_len = meta_info[3];
@@ -182,7 +182,7 @@ int main(int argc, char **argv) {
             << ", c_in_meta= " << c_in_meta
             << ", entire_file_size= " << entire_file_sz << "\n";
 
-  // 计算 graph_node_len
+  // Calculate graph_node_len
   uint64_t dim_size = dim_in_meta * sizeof(float);
   uint64_t graph_node_len = max_node_len - dim_size;
   std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
       // Error check pf_again if needed
     }
 
-    // 3) 找 target_node_id => partition_id => subIndex
+    // 3) Find target_node_id => partition_id => subIndex
     uint32_t partition_id = id2partition[target_node_id];
     if (partition_id >= partition_nums) {
       std::cerr << "Partition ID out-of-range for target node.\n";