Translate Chinese comments and docstrings to English

This commit is contained in:
yichuan520030910320
2025-07-19 19:54:02 -07:00
parent d0c20b14d5
commit e728449b8f
2 changed files with 89 additions and 86 deletions

View File

@@ -175,13 +175,13 @@ def create_embedding_server_thread(
enable_warmup: bool = False, enable_warmup: bool = False,
): ):
""" """
在当前线程中创建并运行 embedding server Create and run embedding server in the current thread
这个函数设计为在单独的线程中调用 This function is designed to be called in a separate thread
""" """
logger.info(f"Initializing embedding server thread on port {zmq_port}") logger.info(f"Initializing embedding server thread on port {zmq_port}")
try: try:
# 检查端口是否已被占用 # Check if port is already occupied
import socket import socket
def check_port(port): def check_port(port):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -212,11 +212,11 @@ def create_embedding_server_thread(
cuda_available = False cuda_available = False
mps_available = False mps_available = False
elif embedding_mode == "sentence-transformers": elif embedding_mode == "sentence-transformers":
# 初始化模型 # Initialize model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
import torch import torch
# 选择设备 # Select device
mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
cuda_available = torch.cuda.is_available() cuda_available = torch.cuda.is_available()
@@ -230,11 +230,11 @@ def create_embedding_server_thread(
device = torch.device("cpu") device = torch.device("cpu")
logger.info("Using CPU device") logger.info("Using CPU device")
# 加载模型 # Load model
logger.info(f"Loading model {model_name}") logger.info(f"Loading model {model_name}")
model = AutoModel.from_pretrained(model_name).to(device).eval() model = AutoModel.from_pretrained(model_name).to(device).eval()
# 优化模型 # Optimize model
if cuda_available or mps_available: if cuda_available or mps_available:
try: try:
model = model.half() model = model.half()
@@ -324,7 +324,7 @@ def create_embedding_server_thread(
print(f"Error during Protobuf ZMQ warmup: {e}") print(f"Error during Protobuf ZMQ warmup: {e}")
class DeviceTimer: class DeviceTimer:
"""设备计时器""" """Device timer"""
def __init__(self, name="", device=device): def __init__(self, name="", device=device):
self.name = name self.name = name
self.device = device self.device = device
@@ -369,60 +369,63 @@ def create_embedding_server_thread(
return self.end_time - self.start_time return self.end_time - self.start_time
def print_elapsed(self): def print_elapsed(self):
print(f"Time taken for {self.name}: {self.elapsed_time():.6f} seconds") elapsed = self.elapsed_time()
print(f"[{self.name}] Elapsed time: {elapsed:.3f}s")
def process_batch_pytorch(texts_batch, ids_batch, missing_ids): def process_batch_pytorch(texts_batch, ids_batch, missing_ids):
"""处理文本批次""" """Process text batch"""
batch_size = len(texts_batch) if not texts_batch:
logger.info(f"Processing batch of size {batch_size}") return np.array([])
tokenize_timer = DeviceTimer("tokenization (batch)", device) # Filter out empty texts and their corresponding IDs
to_device_timer = DeviceTimer("transfer to device (batch)", device) valid_texts = []
embed_timer = DeviceTimer("embedding (batch)", device) valid_ids = []
pool_timer = DeviceTimer("mean pooling (batch)", device) for i, text in enumerate(texts_batch):
if text.strip(): # Only include non-empty texts
valid_texts.append(text)
valid_ids.append(ids_batch[i])
with tokenize_timer.timing(): if not valid_texts:
encoded_batch = tokenizer.batch_encode_plus( print("WARNING: No valid texts in batch")
texts_batch, return np.array([])
padding="max_length",
# Tokenize
token_timer = DeviceTimer("tokenization")
with token_timer.timing():
inputs = tokenizer(
valid_texts,
padding=True,
truncation=True, truncation=True,
max_length=256, max_length=512,
return_tensors="pt", return_tensors="pt"
return_token_type_ids=False, ).to(device)
)
tokenize_timer.print_elapsed()
seq_length = encoded_batch["input_ids"].size(1) # Compute embeddings
print(f"Batch size: {batch_size}, Sequence length: {seq_length}") embed_timer = DeviceTimer("embedding computation")
with embed_timer.timing():
with to_device_timer.timing(): with torch.no_grad():
enc = {k: v.to(device) for k, v in encoded_batch.items()} outputs = model(**inputs)
to_device_timer.print_elapsed() hidden_states = outputs.last_hidden_state
with torch.no_grad(): # Mean pooling
with embed_timer.timing(): attention_mask = inputs['attention_mask']
out = model(enc["input_ids"], enc["attention_mask"]) mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
embed_timer.print_elapsed()
with pool_timer.timing():
hidden_states = out.last_hidden_state if hasattr(out, "last_hidden_state") else out
mask_expanded = enc["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
sum_embeddings = torch.sum(hidden_states * mask_expanded, 1) sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
batch_embeddings = sum_embeddings / sum_mask batch_embeddings = sum_embeddings / sum_mask
pool_timer.print_elapsed() embed_timer.print_elapsed()
return batch_embeddings.cpu().numpy() return batch_embeddings.cpu().numpy()
# ZMQ server 主循环 - 修改为REP套接字 # ZMQ server main loop - uses a ROUTER socket
context = zmq.Context() context = zmq.Context()
socket = context.socket(zmq.ROUTER) # 改为REP套接字 socket = context.socket(zmq.ROUTER) # Changed to REP socket
socket.bind(f"tcp://127.0.0.1:{zmq_port}") socket.bind(f"tcp://127.0.0.1:{zmq_port}")
print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}") print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}")
# 设置超时 # Set timeouts
socket.setsockopt(zmq.RCVTIMEO, 5000) # 5秒接收超时 socket.setsockopt(zmq.RCVTIMEO, 5000) # 5 second receive timeout
socket.setsockopt(zmq.SNDTIMEO, 300000) # 300秒发送超时 socket.setsockopt(zmq.SNDTIMEO, 300000) # 300 second send timeout
from . import embedding_pb2 from . import embedding_pb2
@@ -442,18 +445,18 @@ def create_embedding_server_thread(
try: try:
parts = socket.recv_multipart() parts = socket.recv_multipart()
# --- 恢复稳健的消息格式判断 --- # --- Restore robust message format detection ---
# 必须检查 parts 的长度,避免 IndexError # Must check parts length to avoid IndexError
if len(parts) >= 3: if len(parts) >= 3:
identity = parts[0] identity = parts[0]
# empty = parts[1] # 中间的空帧我们通常不关心 # empty = parts[1] # We usually don't care about the middle empty frame
message = parts[2] message = parts[2]
elif len(parts) == 2: elif len(parts) == 2:
# 也能处理没有空帧的情况 # Can also handle cases without empty frame
identity = parts[0] identity = parts[0]
message = parts[1] message = parts[1]
else: else:
# 如果收到格式错误的消息,打印警告并忽略它,而不是崩溃 # If the received message is malformed, print a warning and ignore it instead of crashing
print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.") print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.")
continue continue
print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes") print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes")
@@ -555,17 +558,17 @@ def create_embedding_server_thread(
e2e_start = time.time() e2e_start = time.time()
lookup_timer = DeviceTimer("text lookup") lookup_timer = DeviceTimer("text lookup")
# 解析请求 # Parse request
req_proto = embedding_pb2.NodeEmbeddingRequest() req_proto = embedding_pb2.NodeEmbeddingRequest()
req_proto.ParseFromString(message) req_proto.ParseFromString(message)
node_ids = req_proto.node_ids node_ids = req_proto.node_ids
print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}") print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}")
# 添加调试信息 # Add debug information
if len(node_ids) > 0: if len(node_ids) > 0:
print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}") print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}")
# 查找文本 # Look up texts
texts = [] texts = []
missing_ids = [] missing_ids = []
with lookup_timer.timing(): with lookup_timer.timing():
@@ -575,8 +578,8 @@ def create_embedding_server_thread(
if txt: if txt:
texts.append(txt) texts.append(txt)
else: else:
# 如果文本为空,我们仍然需要一个占位符来进行批处理, # If text is empty, we still need a placeholder for batch processing,
# 但将其ID记录为缺失 # but record its ID as missing
texts.append("") texts.append("")
missing_ids.append(nid) missing_ids.append(nid)
lookup_timer.print_elapsed() lookup_timer.print_elapsed()
@@ -584,7 +587,7 @@ def create_embedding_server_thread(
if missing_ids: if missing_ids:
print(f"WARNING: Missing passages for IDs: {missing_ids}") print(f"WARNING: Missing passages for IDs: {missing_ids}")
# 处理批次 # Process batch
total_size = len(texts) total_size = len(texts)
print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}") print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}")
@@ -623,7 +626,7 @@ def create_embedding_server_thread(
else: # sentence-transformers else: # sentence-transformers
hidden = process_batch_pytorch(texts, node_ids, missing_ids) hidden = process_batch_pytorch(texts, node_ids, missing_ids)
# 序列化响应 # Serialize response
ser_start = time.time() ser_start = time.time()
resp_proto = embedding_pb2.NodeEmbeddingResponse() resp_proto = embedding_pb2.NodeEmbeddingResponse()
@@ -635,7 +638,7 @@ def create_embedding_server_thread(
response_data = resp_proto.SerializeToString() response_data = resp_proto.SerializeToString()
# REP 套接字发送单个响应 # ROUTER socket: send one reply framed as [identity, empty delimiter, payload]
socket.send_multipart([identity, b'', response_data]) socket.send_multipart([identity, b'', response_data])
ser_end = time.time() ser_end = time.time()
@@ -656,11 +659,11 @@ def create_embedding_server_thread(
except Exception as e: except Exception as e:
print(f"ERROR: Error in ZMQ server: {e}") print(f"ERROR: Error in ZMQ server: {e}")
try: try:
# 发送空响应以维持REQ-REP状态 # Send empty response to maintain REQ-REP state
empty_resp = embedding_pb2.NodeEmbeddingResponse() empty_resp = embedding_pb2.NodeEmbeddingResponse()
socket.send(empty_resp.SerializeToString()) socket.send(empty_resp.SerializeToString())
except: except:
# 如果发送失败,重新创建socket # If sending fails, recreate socket
socket.close() socket.close()
socket = context.socket(zmq.REP) socket = context.socket(zmq.REP)
socket.bind(f"tcp://127.0.0.1:{zmq_port}") socket.bind(f"tcp://127.0.0.1:{zmq_port}")

View File

@@ -23,7 +23,7 @@ g++ ./demo_reader.cpp -o ./demo_reader && ./demo_reader --stats \
f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t)) f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
#define SECTOR_SIZE 4096 #define SECTOR_SIZE 4096
// 辅助:获取文件大小 // Helper: Get file size
static size_t get_file_size(const std::string &fname) { static size_t get_file_size(const std::string &fname) {
std::ifstream ifs(fname, std::ios::binary | std::ios::ate); std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
if (ifs.fail() || !ifs.is_open()) { if (ifs.fail() || !ifs.is_open()) {
@@ -32,7 +32,7 @@ static size_t get_file_size(const std::string &fname) {
return static_cast<size_t>(ifs.tellg()); return static_cast<size_t>(ifs.tellg());
} }
// 打印 sector 的前若干 hex用于debug // Print the first few bytes of a sector as hex, for debugging
static void print_hex(const char *buf, size_t len, size_t max_len = 64) { static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
size_t show_len = (len < max_len) ? len : max_len; size_t show_len = (len < max_len) ? len : max_len;
for (size_t i = 0; i < show_len; i++) { for (size_t i = 0; i < show_len; i++) {
@@ -46,19 +46,19 @@ static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
} }
/* /*
修正后的 demo_reader: Corrected demo_reader:
1) partition.bin: 1) Read from partition.bin:
- C, partition_nums, nd - C, partition_nums, nd
- graph_partitions[i]: 分区 i 的所有 nodeID - graph_partitions[i]: all nodeIDs in partition i
- id2partition[nodeID]: nodeID => partition i - id2partition[nodeID]: nodeID => partition i
2) _disk_graph.index: 2) Read from _disk_graph.index:
a) sector0 里先有 2 int: meta_n, meta_dim a) sector0 first has 2 ints: meta_n, meta_dim
b) 再有 meta_n uint64_t b) then meta_n uint64_t
例如: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??, e.g.: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
[8]=file_size... 具体位置要结合 relayout 的写法 c) graph_node_len = [8]=file_size... the exact positions depend on how relayout writes the file c) graph_node_len =
max_node_len - dim_in_meta*sizeof(float) 3) 用户给定 target_node_id => max_node_len - dim_in_meta*sizeof(float) 3) Given a user-supplied target_node_id =>
partition_id= id2partition[node_id] partition_id= id2partition[node_id]
graph_partitions[partition_id] 里找 node 的下标 j find node index j in graph_partitions[partition_id]
offset = (partition_id+1)*4096 => sector offset = (partition_id+1)*4096 => sector
adjacency_offset= j*graph_node_len => neighbor_count => neighbors adjacency_offset= j*graph_node_len => neighbor_count => neighbors
*/ */
@@ -105,7 +105,7 @@ int main(int argc, char **argv) {
<< "\n"; << "\n";
} }
// 1) 读取 partition.bin // 1) Read partition.bin
std::ifstream pf(partition_bin, std::ios::binary); std::ifstream pf(partition_bin, std::ios::binary);
if (!pf.is_open()) { if (!pf.is_open()) {
std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl; std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
@@ -119,8 +119,8 @@ int main(int argc, char **argv) {
<< ", partition_nums=" << partition_nums << ", nd=" << nd << ", partition_nums=" << partition_nums << ", nd=" << nd
<< std::endl; << std::endl;
// 读取分区节点列表 // Read partition node lists
std::vector<std::vector<uint32_t>> graph_partitions(partition_nums); std::vector<std::vector<uint32_t> > graph_partitions(partition_nums);
for (uint64_t i = 0; i < partition_nums; i++) { for (uint64_t i = 0; i < partition_nums; i++) {
uint32_t psize; uint32_t psize;
READ_U32(pf, psize); READ_U32(pf, psize);
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
pf.read(reinterpret_cast<char *>(graph_partitions[i].data()), pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
psize * sizeof(uint32_t)); psize * sizeof(uint32_t));
} }
// 读取 _id2partition[node], 大小= nd // Read _id2partition[node], size= nd
std::vector<uint32_t> id2partition(nd); std::vector<uint32_t> id2partition(nd);
pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t)); pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
pf.close(); pf.close();
@@ -140,23 +140,23 @@ int main(int argc, char **argv) {
return 1; return 1;
} }
// 2) 解析 _disk_graph.index // 2) Parse _disk_graph.index
std::ifstream gf(graph_index, std::ios::binary); std::ifstream gf(graph_index, std::ios::binary);
if (!gf.is_open()) { if (!gf.is_open()) {
std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl; std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
return 1; return 1;
} }
// (a) sector0 => 先读 2 int // (a) sector0 => first read 2 ints
int meta_n, meta_dim; int meta_n, meta_dim;
gf.read((char *)&meta_n, sizeof(int)); gf.read((char *)&meta_n, sizeof(int));
gf.read((char *)&meta_dim, sizeof(int)); gf.read((char *)&meta_dim, sizeof(int));
std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n"; std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
// (b) meta_n uint64_t // (b) Read meta_n uint64_t
std::vector<uint64_t> meta_info(meta_n); std::vector<uint64_t> meta_info(meta_n);
gf.read(reinterpret_cast<char *>(meta_info.data()), gf.read(reinterpret_cast<char *>(meta_info.data()),
meta_n * sizeof(uint64_t)); meta_n * sizeof(uint64_t));
// 打印 // Print
for (int i = 0; i < meta_n; i++) { for (int i = 0; i < meta_n; i++) {
std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n"; std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
} }
@@ -164,11 +164,11 @@ int main(int argc, char **argv) {
size_t file_size = get_file_size(graph_index); size_t file_size = get_file_size(graph_index);
std::cout << "[disk_graph.index size] " << file_size << " bytes\n"; std::cout << "[disk_graph.index size] " << file_size << " bytes\n";
// **根据 relayout log** 你说: meta_info[0]=nd=60450220, meta_info[1]=dim=769, // **According to relayout log** you said: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
// meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320, // meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
// meta_info[4]=16 (C), // meta_info[4]=16 (C),
// meta_info[8]= 15475261440(文件大小) // meta_info[8]= 15475261440(file size)
// 我们这里先手动解析: // We manually parse here first:
uint64_t nd_in_meta = meta_info[0]; uint64_t nd_in_meta = meta_info[0];
uint64_t dim_in_meta = meta_info[1]; uint64_t dim_in_meta = meta_info[1];
uint64_t max_node_len = meta_info[3]; uint64_t max_node_len = meta_info[3];
@@ -182,7 +182,7 @@ int main(int argc, char **argv) {
<< ", c_in_meta= " << c_in_meta << ", c_in_meta= " << c_in_meta
<< ", entire_file_size= " << entire_file_sz << "\n"; << ", entire_file_size= " << entire_file_sz << "\n";
// 计算 graph_node_len // Calculate graph_node_len
uint64_t dim_size = dim_in_meta * sizeof(float); uint64_t dim_size = dim_in_meta * sizeof(float);
uint64_t graph_node_len = max_node_len - dim_size; uint64_t graph_node_len = max_node_len - dim_size;
std::cout << " => graph_node_len= " << graph_node_len << "\n\n"; std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
// Error check pf_again if needed // Error check pf_again if needed
} }
// 3) target_node_id => partition_id => subIndex // 3) Find target_node_id => partition_id => subIndex
uint32_t partition_id = id2partition[target_node_id]; uint32_t partition_id = id2partition[target_node_id];
if (partition_id >= partition_nums) { if (partition_id >= partition_nums) {
std::cerr << "Partition ID out-of-range for target node.\n"; std::cerr << "Partition ID out-of-range for target node.\n";