Merge branch 'main' into readme-polish
This commit is contained in:
22
README.md
22
README.md
@@ -363,6 +363,28 @@ If you find Leann useful, please cite:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## ✨ Features
|
||||||
|
|
||||||
|
### 🔥 Core Features
|
||||||
|
|
||||||
|
- **🔄 Real-time Embeddings** - Eliminate heavy embedding storage by computing embeddings on demand, using optimized ZMQ servers, a highly optimized search paradigm (overlapping and batching), and an optimized embedding engine
|
||||||
|
- **📈 Scalable Architecture** - Handles millions of documents on consumer hardware; the larger your dataset, the more LEANN can save
|
||||||
|
- **🎯 Graph Pruning** - Advanced techniques that shrink the storage overhead of vector search to a small footprint
|
||||||
|
- **🏗️ Pluggable Backends** - DiskANN, HNSW/FAISS with unified API
|
||||||
|
|
||||||
|
### 🛠️ Technical Highlights
|
||||||
|
- **🔄 Recompute Mode** - Supports the highest-accuracy scenarios while eliminating vector storage overhead
|
||||||
|
- **⚡ Zero-copy Operations** - Minimize IPC overhead by transferring distances instead of embeddings
|
||||||
|
- **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency
|
||||||
|
- **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional)
|
||||||
|
- **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead
|
||||||
|
- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](test/build_mlx_index.py))
|
||||||
|
|
||||||
|
### 🎨 Developer Experience
|
||||||
|
|
||||||
|
- **Simple Python API** - Get started in minutes
|
||||||
|
- **Extensible backend system** - Easy to add new algorithms
|
||||||
|
- **Comprehensive examples** - From basic usage to production deployment
|
||||||
|
|
||||||
## 🤝 Contributing
|
## 🤝 Contributing
|
||||||
|
|
||||||
|
|||||||
9525
demo.ipynb
9525
demo.ipynb
File diff suppressed because it is too large
Load Diff
@@ -190,16 +190,16 @@ class WeChatHistoryReader(BaseReader):
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _concatenate_messages(self, messages: List[Dict], min_length: int = 128, max_length: int = 1000,
|
def _concatenate_messages(self, messages: List[Dict], max_length: int = 128,
|
||||||
time_window_minutes: int = 30) -> List[Dict]:
|
time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
Concatenate messages based on length and time rules.
|
Concatenate messages based on length and time rules.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
messages: List of message dictionaries
|
messages: List of message dictionaries
|
||||||
min_length: Minimum length for concatenated message groups
|
|
||||||
max_length: Maximum length for concatenated message groups
|
max_length: Maximum length for concatenated message groups
|
||||||
time_window_minutes: Time window in minutes to group messages together
|
time_window_minutes: Time window in minutes to group messages together
|
||||||
|
overlap_messages: Number of messages to overlap between consecutive groups
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of concatenated message groups
|
List of concatenated message groups
|
||||||
@@ -235,37 +235,46 @@ class WeChatHistoryReader(BaseReader):
|
|||||||
time_diff_minutes = (create_time - last_timestamp) / 60
|
time_diff_minutes = (create_time - last_timestamp) / 60
|
||||||
if time_diff_minutes > time_window_minutes:
|
if time_diff_minutes > time_window_minutes:
|
||||||
# Time gap too large, start new group
|
# Time gap too large, start new group
|
||||||
if current_group and current_length >= min_length:
|
if current_group:
|
||||||
concatenated_groups.append({
|
concatenated_groups.append({
|
||||||
'messages': current_group,
|
'messages': current_group,
|
||||||
'total_length': current_length,
|
'total_length': current_length,
|
||||||
'start_time': current_group[0].get('createTime', 0),
|
'start_time': current_group[0].get('createTime', 0),
|
||||||
'end_time': current_group[-1].get('createTime', 0)
|
'end_time': current_group[-1].get('createTime', 0)
|
||||||
})
|
})
|
||||||
current_group = []
|
# Keep last few messages for overlap
|
||||||
current_length = 0
|
if overlap_messages > 0 and len(current_group) > overlap_messages:
|
||||||
|
current_group = current_group[-overlap_messages:]
|
||||||
|
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
|
||||||
|
else:
|
||||||
|
current_group = []
|
||||||
|
current_length = 0
|
||||||
|
|
||||||
# Check length constraint
|
# Check length constraint
|
||||||
message_length = len(readable_text)
|
message_length = len(readable_text)
|
||||||
if current_length + message_length > max_length and current_group:
|
if current_length + message_length > max_length and current_group:
|
||||||
# Current group would exceed max length, save it and start new
|
# Current group would exceed max length, save it and start new
|
||||||
if current_length >= min_length:
|
concatenated_groups.append({
|
||||||
concatenated_groups.append({
|
'messages': current_group,
|
||||||
'messages': current_group,
|
'total_length': current_length,
|
||||||
'total_length': current_length,
|
'start_time': current_group[0].get('createTime', 0),
|
||||||
'start_time': current_group[0].get('createTime', 0),
|
'end_time': current_group[-1].get('createTime', 0)
|
||||||
'end_time': current_group[-1].get('createTime', 0)
|
})
|
||||||
})
|
# Keep last few messages for overlap
|
||||||
current_group = []
|
if overlap_messages > 0 and len(current_group) > overlap_messages:
|
||||||
current_length = 0
|
current_group = current_group[-overlap_messages:]
|
||||||
|
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
|
||||||
|
else:
|
||||||
|
current_group = []
|
||||||
|
current_length = 0
|
||||||
|
|
||||||
# Add message to current group
|
# Add message to current group
|
||||||
current_group.append(message)
|
current_group.append(message)
|
||||||
current_length += message_length
|
current_length += message_length
|
||||||
last_timestamp = create_time
|
last_timestamp = create_time
|
||||||
|
|
||||||
# Add the last group if it meets minimum length
|
# Add the last group if it exists
|
||||||
if current_group and current_length >= min_length:
|
if current_group:
|
||||||
concatenated_groups.append({
|
concatenated_groups.append({
|
||||||
'messages': current_group,
|
'messages': current_group,
|
||||||
'total_length': current_length,
|
'total_length': current_length,
|
||||||
@@ -343,6 +352,12 @@ Contact: {contact_name}
|
|||||||
Time Range: {start_time_str} - {end_time_str}
|
Time Range: {start_time_str} - {end_time_str}
|
||||||
Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
||||||
|
|
||||||
|
{concatenated_text}
|
||||||
|
"""
|
||||||
|
|
||||||
|
doc_content = f"""
|
||||||
|
Contact: {contact_name}
|
||||||
|
|
||||||
{concatenated_text}
|
{concatenated_text}
|
||||||
"""
|
"""
|
||||||
return doc_content
|
return doc_content
|
||||||
@@ -358,16 +373,15 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
|||||||
wechat_export_dir (str): Custom path to WeChat export directory.
|
wechat_export_dir (str): Custom path to WeChat export directory.
|
||||||
include_non_text (bool): Whether to include non-text messages (images, emojis, etc.)
|
include_non_text (bool): Whether to include non-text messages (images, emojis, etc.)
|
||||||
concatenate_messages (bool): Whether to concatenate messages based on length rules.
|
concatenate_messages (bool): Whether to concatenate messages based on length rules.
|
||||||
min_length (int): Minimum length for concatenated message groups (default: 128).
|
|
||||||
max_length (int): Maximum length for concatenated message groups (default: 1000).
|
max_length (int): Maximum length for concatenated message groups (default: 1000).
|
||||||
time_window_minutes (int): Time window in minutes to group messages together (default: 30).
|
time_window_minutes (int): Time window in minutes to group messages together (default: 30).
|
||||||
|
overlap_messages (int): Number of messages to overlap between consecutive groups (currently fixed at 2 in the _concatenate_messages call; not read from load_kwargs).
|
||||||
"""
|
"""
|
||||||
docs: List[Document] = []
|
docs: List[Document] = []
|
||||||
max_count = load_kwargs.get('max_count', 1000)
|
max_count = load_kwargs.get('max_count', 1000)
|
||||||
wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
|
wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
|
||||||
include_non_text = load_kwargs.get('include_non_text', False)
|
include_non_text = load_kwargs.get('include_non_text', False)
|
||||||
concatenate_messages = load_kwargs.get('concatenate_messages', False)
|
concatenate_messages = load_kwargs.get('concatenate_messages', False)
|
||||||
min_length = load_kwargs.get('min_length', 128)
|
|
||||||
max_length = load_kwargs.get('max_length', 1000)
|
max_length = load_kwargs.get('max_length', 1000)
|
||||||
time_window_minutes = load_kwargs.get('time_window_minutes', 30)
|
time_window_minutes = load_kwargs.get('time_window_minutes', 30)
|
||||||
|
|
||||||
@@ -417,9 +431,9 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
|||||||
# Concatenate messages based on rules
|
# Concatenate messages based on rules
|
||||||
message_groups = self._concatenate_messages(
|
message_groups = self._concatenate_messages(
|
||||||
readable_messages,
|
readable_messages,
|
||||||
min_length=min_length,
|
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
time_window_minutes=time_window_minutes
|
time_window_minutes=time_window_minutes,
|
||||||
|
overlap_messages=2 # Keep 2 messages overlap between groups
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create documents from concatenated groups
|
# Create documents from concatenated groups
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
|
|||||||
documents = reader.load_data(
|
documents = reader.load_data(
|
||||||
wechat_export_dir=str(export_dir),
|
wechat_export_dir=str(export_dir),
|
||||||
max_count=max_count,
|
max_count=max_count,
|
||||||
concatenate_messages=False, # Disable concatenation - one message per document
|
concatenate_messages=True, # Enable concatenation - group messages into combined documents
|
||||||
)
|
)
|
||||||
if documents:
|
if documents:
|
||||||
print(f"Loaded {len(documents)} chat documents from {export_dir}")
|
print(f"Loaded {len(documents)} chat documents from {export_dir}")
|
||||||
@@ -222,9 +222,9 @@ async def query_leann_index(index_path: str, query: str):
|
|||||||
print(f"You: {query}")
|
print(f"You: {query}")
|
||||||
chat_response = chat.ask(
|
chat_response = chat.ask(
|
||||||
query,
|
query,
|
||||||
top_k=5,
|
top_k=20,
|
||||||
recompute_beighbor_embeddings=True,
|
recompute_beighbor_embeddings=True,
|
||||||
complexity=32,
|
complexity=64,
|
||||||
beam_width=1,
|
beam_width=1,
|
||||||
llm_config={
|
llm_config={
|
||||||
"type": "openai",
|
"type": "openai",
|
||||||
@@ -252,7 +252,7 @@ async def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--index-dir",
|
"--index-dir",
|
||||||
type=str,
|
type=str,
|
||||||
default="./wechat_history_index_leann_test",
|
default="./wechat_history_june19_test",
|
||||||
help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
|
help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
@@ -175,13 +175,13 @@ def create_embedding_server_thread(
|
|||||||
enable_warmup: bool = False,
|
enable_warmup: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
在当前线程中创建并运行 embedding server
|
Create and run embedding server in the current thread
|
||||||
这个函数设计为在单独的线程中调用
|
This function is designed to be called in a separate thread
|
||||||
"""
|
"""
|
||||||
logger.info(f"Initializing embedding server thread on port {zmq_port}")
|
logger.info(f"Initializing embedding server thread on port {zmq_port}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 检查端口是否已被占用
|
# Check if port is already occupied
|
||||||
import socket
|
import socket
|
||||||
def check_port(port):
|
def check_port(port):
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||||
@@ -212,11 +212,11 @@ def create_embedding_server_thread(
|
|||||||
cuda_available = False
|
cuda_available = False
|
||||||
mps_available = False
|
mps_available = False
|
||||||
elif embedding_mode == "sentence-transformers":
|
elif embedding_mode == "sentence-transformers":
|
||||||
# 初始化模型
|
# Initialize model
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
|
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
# 选择设备
|
# Select device
|
||||||
mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
|
mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
|
||||||
cuda_available = torch.cuda.is_available()
|
cuda_available = torch.cuda.is_available()
|
||||||
|
|
||||||
@@ -230,11 +230,11 @@ def create_embedding_server_thread(
|
|||||||
device = torch.device("cpu")
|
device = torch.device("cpu")
|
||||||
logger.info("Using CPU device")
|
logger.info("Using CPU device")
|
||||||
|
|
||||||
# 加载模型
|
# Load model
|
||||||
logger.info(f"Loading model {model_name}")
|
logger.info(f"Loading model {model_name}")
|
||||||
model = AutoModel.from_pretrained(model_name).to(device).eval()
|
model = AutoModel.from_pretrained(model_name).to(device).eval()
|
||||||
|
|
||||||
# 优化模型
|
# Optimize model
|
||||||
if cuda_available or mps_available:
|
if cuda_available or mps_available:
|
||||||
try:
|
try:
|
||||||
model = model.half()
|
model = model.half()
|
||||||
@@ -324,7 +324,7 @@ def create_embedding_server_thread(
|
|||||||
print(f"Error during Protobuf ZMQ warmup: {e}")
|
print(f"Error during Protobuf ZMQ warmup: {e}")
|
||||||
|
|
||||||
class DeviceTimer:
|
class DeviceTimer:
|
||||||
"""设备计时器"""
|
"""Device timer"""
|
||||||
def __init__(self, name="", device=device):
|
def __init__(self, name="", device=device):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.device = device
|
self.device = device
|
||||||
@@ -369,60 +369,63 @@ def create_embedding_server_thread(
|
|||||||
return self.end_time - self.start_time
|
return self.end_time - self.start_time
|
||||||
|
|
||||||
def print_elapsed(self):
|
def print_elapsed(self):
|
||||||
print(f"Time taken for {self.name}: {self.elapsed_time():.6f} seconds")
|
elapsed = self.elapsed_time()
|
||||||
|
print(f"[{self.name}] Elapsed time: {elapsed:.3f}s")
|
||||||
|
|
||||||
def process_batch_pytorch(texts_batch, ids_batch, missing_ids):
|
def process_batch_pytorch(texts_batch, ids_batch, missing_ids):
|
||||||
"""处理文本批次"""
|
"""Process text batch"""
|
||||||
batch_size = len(texts_batch)
|
if not texts_batch:
|
||||||
logger.info(f"Processing batch of size {batch_size}")
|
return np.array([])
|
||||||
|
|
||||||
tokenize_timer = DeviceTimer("tokenization (batch)", device)
|
# Filter out empty texts and their corresponding IDs
|
||||||
to_device_timer = DeviceTimer("transfer to device (batch)", device)
|
valid_texts = []
|
||||||
embed_timer = DeviceTimer("embedding (batch)", device)
|
valid_ids = []
|
||||||
pool_timer = DeviceTimer("mean pooling (batch)", device)
|
for i, text in enumerate(texts_batch):
|
||||||
|
if text.strip(): # Only include non-empty texts
|
||||||
|
valid_texts.append(text)
|
||||||
|
valid_ids.append(ids_batch[i])
|
||||||
|
|
||||||
with tokenize_timer.timing():
|
if not valid_texts:
|
||||||
encoded_batch = tokenizer.batch_encode_plus(
|
print("WARNING: No valid texts in batch")
|
||||||
texts_batch,
|
return np.array([])
|
||||||
padding="max_length",
|
|
||||||
|
# Tokenize
|
||||||
|
token_timer = DeviceTimer("tokenization")
|
||||||
|
with token_timer.timing():
|
||||||
|
inputs = tokenizer(
|
||||||
|
valid_texts,
|
||||||
|
padding=True,
|
||||||
truncation=True,
|
truncation=True,
|
||||||
max_length=256,
|
max_length=512,
|
||||||
return_tensors="pt",
|
return_tensors="pt"
|
||||||
return_token_type_ids=False,
|
).to(device)
|
||||||
)
|
|
||||||
tokenize_timer.print_elapsed()
|
|
||||||
|
|
||||||
seq_length = encoded_batch["input_ids"].size(1)
|
# Compute embeddings
|
||||||
print(f"Batch size: {batch_size}, Sequence length: {seq_length}")
|
embed_timer = DeviceTimer("embedding computation")
|
||||||
|
with embed_timer.timing():
|
||||||
with to_device_timer.timing():
|
with torch.no_grad():
|
||||||
enc = {k: v.to(device) for k, v in encoded_batch.items()}
|
outputs = model(**inputs)
|
||||||
to_device_timer.print_elapsed()
|
hidden_states = outputs.last_hidden_state
|
||||||
|
|
||||||
with torch.no_grad():
|
# Mean pooling
|
||||||
with embed_timer.timing():
|
attention_mask = inputs['attention_mask']
|
||||||
out = model(enc["input_ids"], enc["attention_mask"])
|
mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
|
||||||
embed_timer.print_elapsed()
|
|
||||||
|
|
||||||
with pool_timer.timing():
|
|
||||||
hidden_states = out.last_hidden_state if hasattr(out, "last_hidden_state") else out
|
|
||||||
mask_expanded = enc["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
|
|
||||||
sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
|
sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
|
||||||
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
|
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
|
||||||
batch_embeddings = sum_embeddings / sum_mask
|
batch_embeddings = sum_embeddings / sum_mask
|
||||||
pool_timer.print_elapsed()
|
embed_timer.print_elapsed()
|
||||||
|
|
||||||
return batch_embeddings.cpu().numpy()
|
return batch_embeddings.cpu().numpy()
|
||||||
|
|
||||||
# ZMQ server 主循环 - 修改为REP套接字
|
# ZMQ server main loop - uses a ROUTER socket
|
||||||
context = zmq.Context()
|
context = zmq.Context()
|
||||||
socket = context.socket(zmq.ROUTER) # 改为REP套接字
|
socket = context.socket(zmq.ROUTER) # ROUTER socket
|
||||||
socket.bind(f"tcp://127.0.0.1:{zmq_port}")
|
socket.bind(f"tcp://127.0.0.1:{zmq_port}")
|
||||||
print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}")
|
print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}")
|
||||||
|
|
||||||
# 设置超时
|
# Set timeouts
|
||||||
socket.setsockopt(zmq.RCVTIMEO, 5000) # 5秒接收超时
|
socket.setsockopt(zmq.RCVTIMEO, 5000) # 5 second receive timeout
|
||||||
socket.setsockopt(zmq.SNDTIMEO, 300000) # 300秒发送超时
|
socket.setsockopt(zmq.SNDTIMEO, 300000) # 300 second send timeout
|
||||||
|
|
||||||
from . import embedding_pb2
|
from . import embedding_pb2
|
||||||
|
|
||||||
@@ -442,18 +445,18 @@ def create_embedding_server_thread(
|
|||||||
try:
|
try:
|
||||||
parts = socket.recv_multipart()
|
parts = socket.recv_multipart()
|
||||||
|
|
||||||
# --- 恢复稳健的消息格式判断 ---
|
# --- Restore robust message format detection ---
|
||||||
# 必须检查 parts 的长度,避免 IndexError
|
# Must check parts length to avoid IndexError
|
||||||
if len(parts) >= 3:
|
if len(parts) >= 3:
|
||||||
identity = parts[0]
|
identity = parts[0]
|
||||||
# empty = parts[1] # 中间的空帧我们通常不关心
|
# empty = parts[1] # We usually don't care about the middle empty frame
|
||||||
message = parts[2]
|
message = parts[2]
|
||||||
elif len(parts) == 2:
|
elif len(parts) == 2:
|
||||||
# 也能处理没有空帧的情况
|
# Can also handle cases without empty frame
|
||||||
identity = parts[0]
|
identity = parts[0]
|
||||||
message = parts[1]
|
message = parts[1]
|
||||||
else:
|
else:
|
||||||
# 如果收到格式错误的消息,打印警告并忽略它,而不是崩溃
|
# If received message format is wrong, print warning and ignore it instead of crashing
|
||||||
print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.")
|
print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.")
|
||||||
continue
|
continue
|
||||||
print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes")
|
print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes")
|
||||||
@@ -555,17 +558,17 @@ def create_embedding_server_thread(
|
|||||||
e2e_start = time.time()
|
e2e_start = time.time()
|
||||||
lookup_timer = DeviceTimer("text lookup")
|
lookup_timer = DeviceTimer("text lookup")
|
||||||
|
|
||||||
# 解析请求
|
# Parse request
|
||||||
req_proto = embedding_pb2.NodeEmbeddingRequest()
|
req_proto = embedding_pb2.NodeEmbeddingRequest()
|
||||||
req_proto.ParseFromString(message)
|
req_proto.ParseFromString(message)
|
||||||
node_ids = req_proto.node_ids
|
node_ids = req_proto.node_ids
|
||||||
print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}")
|
print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}")
|
||||||
|
|
||||||
# 添加调试信息
|
# Add debug information
|
||||||
if len(node_ids) > 0:
|
if len(node_ids) > 0:
|
||||||
print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}")
|
print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}")
|
||||||
|
|
||||||
# 查找文本
|
# Look up texts
|
||||||
texts = []
|
texts = []
|
||||||
missing_ids = []
|
missing_ids = []
|
||||||
with lookup_timer.timing():
|
with lookup_timer.timing():
|
||||||
@@ -575,8 +578,8 @@ def create_embedding_server_thread(
|
|||||||
if txt:
|
if txt:
|
||||||
texts.append(txt)
|
texts.append(txt)
|
||||||
else:
|
else:
|
||||||
# 如果文本为空,我们仍然需要一个占位符来进行批处理,
|
# If text is empty, we still need a placeholder for batch processing,
|
||||||
# 但将其ID记录为缺失
|
# but record its ID as missing
|
||||||
texts.append("")
|
texts.append("")
|
||||||
missing_ids.append(nid)
|
missing_ids.append(nid)
|
||||||
lookup_timer.print_elapsed()
|
lookup_timer.print_elapsed()
|
||||||
@@ -584,7 +587,7 @@ def create_embedding_server_thread(
|
|||||||
if missing_ids:
|
if missing_ids:
|
||||||
print(f"WARNING: Missing passages for IDs: {missing_ids}")
|
print(f"WARNING: Missing passages for IDs: {missing_ids}")
|
||||||
|
|
||||||
# 处理批次
|
# Process batch
|
||||||
total_size = len(texts)
|
total_size = len(texts)
|
||||||
print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}")
|
print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}")
|
||||||
|
|
||||||
@@ -600,7 +603,7 @@ def create_embedding_server_thread(
|
|||||||
chunk_ids = node_ids[i:end_idx]
|
chunk_ids = node_ids[i:end_idx]
|
||||||
|
|
||||||
if embedding_mode == "mlx":
|
if embedding_mode == "mlx":
|
||||||
embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name)
|
embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name, batch_size=16)
|
||||||
elif embedding_mode == "openai":
|
elif embedding_mode == "openai":
|
||||||
embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name)
|
embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name)
|
||||||
else: # sentence-transformers
|
else: # sentence-transformers
|
||||||
@@ -617,13 +620,13 @@ def create_embedding_server_thread(
|
|||||||
print(f"INFO: Combined embeddings shape: {hidden.shape}")
|
print(f"INFO: Combined embeddings shape: {hidden.shape}")
|
||||||
else:
|
else:
|
||||||
if embedding_mode == "mlx":
|
if embedding_mode == "mlx":
|
||||||
hidden = compute_embeddings_mlx(texts, model_name)
|
hidden = compute_embeddings_mlx(texts, model_name, batch_size=16)
|
||||||
elif embedding_mode == "openai":
|
elif embedding_mode == "openai":
|
||||||
hidden = compute_embeddings_openai(texts, model_name)
|
hidden = compute_embeddings_openai(texts, model_name)
|
||||||
else: # sentence-transformers
|
else: # sentence-transformers
|
||||||
hidden = process_batch_pytorch(texts, node_ids, missing_ids)
|
hidden = process_batch_pytorch(texts, node_ids, missing_ids)
|
||||||
|
|
||||||
# 序列化响应
|
# Serialize response
|
||||||
ser_start = time.time()
|
ser_start = time.time()
|
||||||
|
|
||||||
resp_proto = embedding_pb2.NodeEmbeddingResponse()
|
resp_proto = embedding_pb2.NodeEmbeddingResponse()
|
||||||
@@ -635,7 +638,7 @@ def create_embedding_server_thread(
|
|||||||
|
|
||||||
response_data = resp_proto.SerializeToString()
|
response_data = resp_proto.SerializeToString()
|
||||||
|
|
||||||
# REP 套接字发送单个响应
|
# Send a single response as [identity, empty frame, payload]
|
||||||
socket.send_multipart([identity, b'', response_data])
|
socket.send_multipart([identity, b'', response_data])
|
||||||
|
|
||||||
ser_end = time.time()
|
ser_end = time.time()
|
||||||
@@ -656,11 +659,11 @@ def create_embedding_server_thread(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"ERROR: Error in ZMQ server: {e}")
|
print(f"ERROR: Error in ZMQ server: {e}")
|
||||||
try:
|
try:
|
||||||
# 发送空响应以维持REQ-REP状态
|
# Send empty response to maintain REQ-REP state
|
||||||
empty_resp = embedding_pb2.NodeEmbeddingResponse()
|
empty_resp = embedding_pb2.NodeEmbeddingResponse()
|
||||||
socket.send(empty_resp.SerializeToString())
|
socket.send(empty_resp.SerializeToString())
|
||||||
except:
|
except:
|
||||||
# 如果发送失败,重新创建socket
|
# If sending fails, recreate socket
|
||||||
socket.close()
|
socket.close()
|
||||||
socket = context.socket(zmq.REP)
|
socket = context.socket(zmq.REP)
|
||||||
socket.bind(f"tcp://127.0.0.1:{zmq_port}")
|
socket.bind(f"tcp://127.0.0.1:{zmq_port}")
|
||||||
|
|||||||
@@ -423,7 +423,7 @@ def create_hnsw_embedding_server(
|
|||||||
from leann.api import compute_embeddings
|
from leann.api import compute_embeddings
|
||||||
|
|
||||||
# Compute embeddings using MLX
|
# Compute embeddings using MLX
|
||||||
embeddings = compute_embeddings(texts_batch, model_name, use_mlx=True)
|
embeddings = compute_embeddings(texts_batch, model_name, mode="mlx", use_server=False)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"[leann_backend_hnsw.hnsw_embedding_server LOG]: MLX embeddings computed for {len(texts_batch)} texts"
|
f"[leann_backend_hnsw.hnsw_embedding_server LOG]: MLX embeddings computed for {len(texts_batch)} texts"
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ requires-python = ">=3.9"
|
|||||||
license = { text = "MIT" }
|
license = { text = "MIT" }
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"numpy>=1.20.0"
|
"numpy>=1.20.0",
|
||||||
|
"tqdm>=4.60.0"
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ def compute_embeddings(
|
|||||||
model_name: str,
|
model_name: str,
|
||||||
mode: str = "sentence-transformers",
|
mode: str = "sentence-transformers",
|
||||||
use_server: bool = True,
|
use_server: bool = True,
|
||||||
|
use_mlx: bool = False # Backward compatibility: if True, override mode to 'mlx',
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Computes embeddings using different backends.
|
Computes embeddings using different backends.
|
||||||
@@ -38,12 +39,16 @@ def compute_embeddings(
|
|||||||
Returns:
|
Returns:
|
||||||
numpy array of embeddings
|
numpy array of embeddings
|
||||||
"""
|
"""
|
||||||
|
# Override mode for backward compatibility
|
||||||
|
if use_mlx:
|
||||||
|
mode = "mlx"
|
||||||
|
|
||||||
# Auto-detect mode based on model name if not explicitly set
|
# Auto-detect mode based on model name if not explicitly set
|
||||||
if mode == "sentence-transformers" and model_name.startswith("text-embedding-"):
|
if mode == "sentence-transformers" and model_name.startswith("text-embedding-"):
|
||||||
mode = "openai"
|
mode = "openai"
|
||||||
|
|
||||||
if mode == "mlx":
|
if mode == "mlx":
|
||||||
return compute_embeddings_mlx(chunks, model_name)
|
return compute_embeddings_mlx(chunks, model_name, batch_size=16)
|
||||||
elif mode == "openai":
|
elif mode == "openai":
|
||||||
return compute_embeddings_openai(chunks, model_name)
|
return compute_embeddings_openai(chunks, model_name)
|
||||||
elif mode == "sentence-transformers":
|
elif mode == "sentence-transformers":
|
||||||
@@ -158,7 +163,7 @@ def _compute_embeddings_sentence_transformers_direct(
|
|||||||
# Generate embeddings
|
# Generate embeddings
|
||||||
# give use an warning if OOM here means we need to turn down the batch size
|
# give use an warning if OOM here means we need to turn down the batch size
|
||||||
embeddings = model.encode(
|
embeddings = model.encode(
|
||||||
chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=8
|
chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=16
|
||||||
)
|
)
|
||||||
|
|
||||||
return embeddings
|
return embeddings
|
||||||
@@ -188,13 +193,19 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
|
|||||||
# OpenAI has a limit on batch size and input length
|
# OpenAI has a limit on batch size and input length
|
||||||
max_batch_size = 100 # Conservative batch size
|
max_batch_size = 100 # Conservative batch size
|
||||||
all_embeddings = []
|
all_embeddings = []
|
||||||
|
|
||||||
for i in range(0, len(chunks), max_batch_size):
|
try:
|
||||||
batch_chunks = chunks[i : i + max_batch_size]
|
from tqdm import tqdm
|
||||||
print(
|
total_batches = (len(chunks) + max_batch_size - 1) // max_batch_size
|
||||||
f"INFO: Processing batch {i // max_batch_size + 1}/{(len(chunks) + max_batch_size - 1) // max_batch_size}"
|
batch_range = range(0, len(chunks), max_batch_size)
|
||||||
)
|
batch_iterator = tqdm(batch_range, desc="Computing embeddings", unit="batch", total=total_batches)
|
||||||
|
except ImportError:
|
||||||
|
# Fallback without progress bar
|
||||||
|
batch_iterator = range(0, len(chunks), max_batch_size)
|
||||||
|
|
||||||
|
for i in batch_iterator:
|
||||||
|
batch_chunks = chunks[i:i + max_batch_size]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = client.embeddings.create(model=model_name, input=batch_chunks)
|
response = client.embeddings.create(model=model_name, input=batch_chunks)
|
||||||
batch_embeddings = [embedding.embedding for embedding in response.data]
|
batch_embeddings = [embedding.embedding for embedding in response.data]
|
||||||
@@ -210,42 +221,64 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
|
|||||||
return embeddings
|
return embeddings
|
||||||
|
|
||||||
|
|
||||||
def compute_embeddings_mlx(chunks: List[str], model_name: str) -> np.ndarray:
|
def compute_embeddings_mlx(chunks: List[str], model_name: str, batch_size: int = 16) -> np.ndarray:
|
||||||
"""Computes embeddings using an MLX model."""
|
"""Computes embeddings using an MLX model."""
|
||||||
try:
|
try:
|
||||||
import mlx.core as mx
|
import mlx.core as mx
|
||||||
from mlx_lm.utils import load
|
from mlx_lm.utils import load
|
||||||
|
from tqdm import tqdm
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"MLX or related libraries not available. Install with: uv pip install mlx mlx-lm"
|
"MLX or related libraries not available. Install with: uv pip install mlx mlx-lm"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}'..."
|
f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}' with batch_size={batch_size}..."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Load model and tokenizer
|
# Load model and tokenizer
|
||||||
model, tokenizer = load(model_name)
|
model, tokenizer = load(model_name)
|
||||||
|
|
||||||
# Process each chunk
|
# Process chunks in batches with progress bar
|
||||||
all_embeddings = []
|
all_embeddings = []
|
||||||
for chunk in chunks:
|
|
||||||
# Tokenize
|
try:
|
||||||
token_ids = tokenizer.encode(chunk) # type: ignore
|
from tqdm import tqdm
|
||||||
|
batch_iterator = tqdm(range(0, len(chunks), batch_size), desc="Computing embeddings", unit="batch")
|
||||||
|
except ImportError:
|
||||||
|
batch_iterator = range(0, len(chunks), batch_size)
|
||||||
|
|
||||||
|
for i in batch_iterator:
|
||||||
|
batch_chunks = chunks[i:i + batch_size]
|
||||||
|
|
||||||
|
# Tokenize all chunks in the batch
|
||||||
|
batch_token_ids = []
|
||||||
|
for chunk in batch_chunks:
|
||||||
|
token_ids = tokenizer.encode(chunk) # type: ignore
|
||||||
|
batch_token_ids.append(token_ids)
|
||||||
|
|
||||||
|
# Pad sequences to the same length for batch processing
|
||||||
|
max_length = max(len(ids) for ids in batch_token_ids)
|
||||||
|
padded_token_ids = []
|
||||||
|
for token_ids in batch_token_ids:
|
||||||
|
# Pad with 0 (tokenizer.pad_token_id is not used here)
|
||||||
|
padded = token_ids + [0] * (max_length - len(token_ids))
|
||||||
|
padded_token_ids.append(padded)
|
||||||
|
|
||||||
|
# Convert to MLX array with batch dimension
|
||||||
|
input_ids = mx.array(padded_token_ids)
|
||||||
|
|
||||||
# Convert to MLX array and add batch dimension
|
# Get embeddings for the batch
|
||||||
input_ids = mx.array([token_ids])
|
|
||||||
|
|
||||||
# Get embeddings
|
|
||||||
embeddings = model(input_ids)
|
embeddings = model(input_ids)
|
||||||
|
|
||||||
# Mean pooling (since we only have one sequence, just take the mean)
|
# Mean pooling for each sequence in the batch
|
||||||
pooled = embeddings.mean(axis=1) # Shape: (1, hidden_size)
|
pooled = embeddings.mean(axis=1) # Shape: (batch_size, hidden_size)
|
||||||
|
|
||||||
# Convert individual embedding to numpy via list (to handle bfloat16)
|
# Convert batch embeddings to numpy
|
||||||
pooled_list = pooled[0].tolist() # Remove batch dimension and convert to list
|
for j in range(len(batch_chunks)):
|
||||||
pooled_numpy = np.array(pooled_list, dtype=np.float32)
|
pooled_list = pooled[j].tolist() # Convert to list
|
||||||
all_embeddings.append(pooled_numpy)
|
pooled_numpy = np.array(pooled_list, dtype=np.float32)
|
||||||
|
all_embeddings.append(pooled_numpy)
|
||||||
|
|
||||||
# Stack numpy arrays
|
# Stack numpy arrays
|
||||||
return np.stack(all_embeddings)
|
return np.stack(all_embeddings)
|
||||||
@@ -311,6 +344,8 @@ class LeannBuilder:
|
|||||||
self.dimensions = dimensions
|
self.dimensions = dimensions
|
||||||
self.embedding_mode = embedding_mode
|
self.embedding_mode = embedding_mode
|
||||||
self.backend_kwargs = backend_kwargs
|
self.backend_kwargs = backend_kwargs
|
||||||
|
if 'mlx' in self.embedding_model:
|
||||||
|
self.embedding_mode = "mlx"
|
||||||
self.chunks: List[Dict[str, Any]] = []
|
self.chunks: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
|
def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
|
||||||
@@ -340,7 +375,13 @@ class LeannBuilder:
|
|||||||
offset_file = index_dir / f"{index_name}.passages.idx"
|
offset_file = index_dir / f"{index_name}.passages.idx"
|
||||||
offset_map = {}
|
offset_map = {}
|
||||||
with open(passages_file, "w", encoding="utf-8") as f:
|
with open(passages_file, "w", encoding="utf-8") as f:
|
||||||
for chunk in self.chunks:
|
try:
|
||||||
|
from tqdm import tqdm
|
||||||
|
chunk_iterator = tqdm(self.chunks, desc="Writing passages", unit="chunk")
|
||||||
|
except ImportError:
|
||||||
|
chunk_iterator = self.chunks
|
||||||
|
|
||||||
|
for chunk in chunk_iterator:
|
||||||
offset = f.tell()
|
offset = f.tell()
|
||||||
json.dump(
|
json.dump(
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -175,7 +175,7 @@ class EmbeddingServerManager:
|
|||||||
self.backend_module_name = backend_module_name
|
self.backend_module_name = backend_module_name
|
||||||
self.server_process: Optional[subprocess.Popen] = None
|
self.server_process: Optional[subprocess.Popen] = None
|
||||||
self.server_port: Optional[int] = None
|
self.server_port: Optional[int] = None
|
||||||
# atexit.register(self.stop_server)
|
atexit.register(self.stop_server)
|
||||||
|
|
||||||
def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool:
|
def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ g++ ./demo_reader.cpp -o ./demo_reader && ./demo_reader --stats \
|
|||||||
f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
|
f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
|
||||||
#define SECTOR_SIZE 4096
|
#define SECTOR_SIZE 4096
|
||||||
|
|
||||||
// 辅助:获取文件大小
|
// Helper: Get file size
|
||||||
static size_t get_file_size(const std::string &fname) {
|
static size_t get_file_size(const std::string &fname) {
|
||||||
std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
|
std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
|
||||||
if (ifs.fail() || !ifs.is_open()) {
|
if (ifs.fail() || !ifs.is_open()) {
|
||||||
@@ -32,7 +32,7 @@ static size_t get_file_size(const std::string &fname) {
|
|||||||
return static_cast<size_t>(ifs.tellg());
|
return static_cast<size_t>(ifs.tellg());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 打印 sector 的前若干 hex,用于debug
|
// Print the first few bytes of a sector as hex, for debugging
|
||||||
static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
|
static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
|
||||||
size_t show_len = (len < max_len) ? len : max_len;
|
size_t show_len = (len < max_len) ? len : max_len;
|
||||||
for (size_t i = 0; i < show_len; i++) {
|
for (size_t i = 0; i < show_len; i++) {
|
||||||
@@ -46,19 +46,19 @@ static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
修正后的 demo_reader:
|
Corrected demo_reader:
|
||||||
1) 从 partition.bin 读:
|
1) Read from partition.bin:
|
||||||
- C, partition_nums, nd
|
- C, partition_nums, nd
|
||||||
- graph_partitions[i]: 分区 i 的所有 nodeID
|
- graph_partitions[i]: all nodeIDs in partition i
|
||||||
- id2partition[nodeID]: nodeID => partition i
|
- id2partition[nodeID]: nodeID => partition i
|
||||||
2) 从 _disk_graph.index 读:
|
2) Read from _disk_graph.index:
|
||||||
a) sector0 里先有 2个 int: meta_n, meta_dim
|
a) sector 0 starts with 2 ints: meta_n, meta_dim
|
||||||
b) 再有 meta_n个 uint64_t
|
b) followed by meta_n uint64_t values
|
||||||
例如: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
|
e.g.: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
|
||||||
[8]=file_size... 具体位置要结合 relayout 的写法 c) graph_node_len =
|
[8]=file_size... the exact positions depend on how relayout writes them c) graph_node_len =
|
||||||
max_node_len - dim_in_meta*sizeof(float) 3) 用户给定 target_node_id =>
|
max_node_len - dim_in_meta*sizeof(float) 3) For a user-supplied target_node_id =>
|
||||||
partition_id= id2partition[node_id]
|
partition_id= id2partition[node_id]
|
||||||
在 graph_partitions[partition_id] 里找 node 的下标 j
|
find the node's index j in graph_partitions[partition_id]
|
||||||
offset = (partition_id+1)*4096 => sector
|
offset = (partition_id+1)*4096 => sector
|
||||||
adjacency_offset= j*graph_node_len => neighbor_count => neighbors
|
adjacency_offset= j*graph_node_len => neighbor_count => neighbors
|
||||||
*/
|
*/
|
||||||
@@ -105,7 +105,7 @@ int main(int argc, char **argv) {
|
|||||||
<< "\n";
|
<< "\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1) 读取 partition.bin
|
// 1) Read partition.bin
|
||||||
std::ifstream pf(partition_bin, std::ios::binary);
|
std::ifstream pf(partition_bin, std::ios::binary);
|
||||||
if (!pf.is_open()) {
|
if (!pf.is_open()) {
|
||||||
std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
|
std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
|
||||||
@@ -119,8 +119,8 @@ int main(int argc, char **argv) {
|
|||||||
<< ", partition_nums=" << partition_nums << ", nd=" << nd
|
<< ", partition_nums=" << partition_nums << ", nd=" << nd
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
|
||||||
// 读取分区节点列表
|
// Read partition node lists
|
||||||
std::vector<std::vector<uint32_t>> graph_partitions(partition_nums);
|
std::vector<std::vector<uint32_t> > graph_partitions(partition_nums);
|
||||||
for (uint64_t i = 0; i < partition_nums; i++) {
|
for (uint64_t i = 0; i < partition_nums; i++) {
|
||||||
uint32_t psize;
|
uint32_t psize;
|
||||||
READ_U32(pf, psize);
|
READ_U32(pf, psize);
|
||||||
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
|
|||||||
pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
|
pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
|
||||||
psize * sizeof(uint32_t));
|
psize * sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
// 读取 _id2partition[node], 大小= nd
|
// Read _id2partition[node], size= nd
|
||||||
std::vector<uint32_t> id2partition(nd);
|
std::vector<uint32_t> id2partition(nd);
|
||||||
pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
|
pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
|
||||||
pf.close();
|
pf.close();
|
||||||
@@ -140,23 +140,23 @@ int main(int argc, char **argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) 解析 _disk_graph.index
|
// 2) Parse _disk_graph.index
|
||||||
std::ifstream gf(graph_index, std::ios::binary);
|
std::ifstream gf(graph_index, std::ios::binary);
|
||||||
if (!gf.is_open()) {
|
if (!gf.is_open()) {
|
||||||
std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
|
std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
// (a) sector0 => 先读 2个 int
|
// (a) sector0 => first read 2 ints
|
||||||
int meta_n, meta_dim;
|
int meta_n, meta_dim;
|
||||||
gf.read((char *)&meta_n, sizeof(int));
|
gf.read((char *)&meta_n, sizeof(int));
|
||||||
gf.read((char *)&meta_dim, sizeof(int));
|
gf.read((char *)&meta_dim, sizeof(int));
|
||||||
std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
|
std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
|
||||||
|
|
||||||
// (b) 读 meta_n个 uint64_t
|
// (b) Read meta_n uint64_t
|
||||||
std::vector<uint64_t> meta_info(meta_n);
|
std::vector<uint64_t> meta_info(meta_n);
|
||||||
gf.read(reinterpret_cast<char *>(meta_info.data()),
|
gf.read(reinterpret_cast<char *>(meta_info.data()),
|
||||||
meta_n * sizeof(uint64_t));
|
meta_n * sizeof(uint64_t));
|
||||||
// 打印
|
// Print
|
||||||
for (int i = 0; i < meta_n; i++) {
|
for (int i = 0; i < meta_n; i++) {
|
||||||
std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
|
std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
|
||||||
}
|
}
|
||||||
@@ -164,11 +164,11 @@ int main(int argc, char **argv) {
|
|||||||
size_t file_size = get_file_size(graph_index);
|
size_t file_size = get_file_size(graph_index);
|
||||||
std::cout << "[disk_graph.index size] " << file_size << " bytes\n";
|
std::cout << "[disk_graph.index size] " << file_size << " bytes\n";
|
||||||
|
|
||||||
// **根据 relayout log** 你说: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
|
// **According to relayout log** you said: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
|
||||||
// meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
|
// meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
|
||||||
// meta_info[4]=16 (C),
|
// meta_info[4]=16 (C),
|
||||||
// meta_info[8]= 15475261440(文件大小)
|
// meta_info[8]= 15475261440(file size)
|
||||||
// 我们这里先手动解析:
|
// For now, parse these fields manually:
|
||||||
uint64_t nd_in_meta = meta_info[0];
|
uint64_t nd_in_meta = meta_info[0];
|
||||||
uint64_t dim_in_meta = meta_info[1];
|
uint64_t dim_in_meta = meta_info[1];
|
||||||
uint64_t max_node_len = meta_info[3];
|
uint64_t max_node_len = meta_info[3];
|
||||||
@@ -182,7 +182,7 @@ int main(int argc, char **argv) {
|
|||||||
<< ", c_in_meta= " << c_in_meta
|
<< ", c_in_meta= " << c_in_meta
|
||||||
<< ", entire_file_size= " << entire_file_sz << "\n";
|
<< ", entire_file_size= " << entire_file_sz << "\n";
|
||||||
|
|
||||||
// 计算 graph_node_len
|
// Calculate graph_node_len
|
||||||
uint64_t dim_size = dim_in_meta * sizeof(float);
|
uint64_t dim_size = dim_in_meta * sizeof(float);
|
||||||
uint64_t graph_node_len = max_node_len - dim_size;
|
uint64_t graph_node_len = max_node_len - dim_size;
|
||||||
std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
|
std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
|
||||||
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
|
|||||||
// Error check pf_again if needed
|
// Error check pf_again if needed
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3) 找 target_node_id => partition_id => subIndex
|
// 3) Find target_node_id => partition_id => subIndex
|
||||||
uint32_t partition_id = id2partition[target_node_id];
|
uint32_t partition_id = id2partition[target_node_id];
|
||||||
if (partition_id >= partition_nums) {
|
if (partition_id >= partition_nums) {
|
||||||
std::cerr << "Partition ID out-of-range for target node.\n";
|
std::cerr << "Partition ID out-of-range for target node.\n";
|
||||||
|
|||||||
@@ -264,7 +264,7 @@ def run_mlx_benchmark():
|
|||||||
}
|
}
|
||||||
|
|
||||||
config = BenchmarkConfig(
|
config = BenchmarkConfig(
|
||||||
model_path="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ",
|
model_path="mlx-community/all-MiniLM-L6-v2-4bit",
|
||||||
use_mlx=True
|
use_mlx=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user