387 lines
14 KiB
C++
387 lines
14 KiB
C++
/*
|
||
Run with
|
||
g++ ./demo_reader.cpp -o ./demo_reader && ./demo_reader --stats \
|
||
/powerrag/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/diskann/_partition.bin
|
||
\
|
||
/powerrag/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/diskann/_disk_graph.index
|
||
*/
|
||
|
||
#include <algorithm>
|
||
#include <cstdint>
|
||
#include <cstdlib>
|
||
#include <cstring>
|
||
#include <fstream>
|
||
#include <iomanip>
|
||
#include <iostream>
|
||
#include <limits> // Include for std::numeric_limits
|
||
#include <string> // Include for std::string comparison
|
||
#include <vector>
|
||
|
||
#define READ_U64(f, val) \
|
||
f.read(reinterpret_cast<char *>(&val), sizeof(uint64_t))
|
||
#define READ_U32(f, val) \
|
||
f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
|
||
#define SECTOR_SIZE 4096
|
||
|
||
// 辅助:获取文件大小
|
||
static size_t get_file_size(const std::string &fname) {
|
||
std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
|
||
if (ifs.fail() || !ifs.is_open()) {
|
||
return 0;
|
||
}
|
||
return static_cast<size_t>(ifs.tellg());
|
||
}
|
||
|
||
// 打印 sector 的前若干 hex,用于debug
|
||
static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
|
||
size_t show_len = (len < max_len) ? len : max_len;
|
||
for (size_t i = 0; i < show_len; i++) {
|
||
unsigned char c = (unsigned char)buf[i];
|
||
std::cout << std::hex << std::setw(2) << std::setfill('0') << (unsigned)c
|
||
<< " ";
|
||
if ((i + 1) % 16 == 0)
|
||
std::cout << "\n ";
|
||
}
|
||
std::cout << std::dec << "\n";
|
||
}
|
||
|
||
/*
|
||
修正后的 demo_reader:
|
||
1) 从 partition.bin 读:
|
||
- C, partition_nums, nd
|
||
- graph_partitions[i]: 分区 i 的所有 nodeID
|
||
- id2partition[nodeID]: nodeID => partition i
|
||
2) 从 _disk_graph.index 读:
|
||
a) sector0 里先有 2个 int: meta_n, meta_dim
|
||
b) 再有 meta_n个 uint64_t
|
||
例如: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
|
||
[8]=file_size... 具体位置要结合 relayout 的写法 c) graph_node_len =
|
||
max_node_len - dim_in_meta*sizeof(float) 3) 用户给定 target_node_id =>
|
||
partition_id= id2partition[node_id]
|
||
在 graph_partitions[partition_id] 里找 node 的下标 j
|
||
offset = (partition_id+1)*4096 => sector
|
||
adjacency_offset= j*graph_node_len => neighbor_count => neighbors
|
||
*/
|
||
int main(int argc, char **argv) {
|
||
bool calculate_stats = false;
|
||
// int arg_offset = 0; // Offset for positional arguments
|
||
std::string partition_bin;
|
||
std::string graph_index;
|
||
uint64_t target_node_id = 0; // Initialize
|
||
|
||
if (argc != 4) {
|
||
std::cerr << "Usage:\n"
|
||
<< " " << argv[0]
|
||
<< " <partition.bin> <disk_graph.index> <target_node_id> (Reads "
|
||
"adjacency for a node)\n"
|
||
<< " " << argv[0]
|
||
<< " --stats <partition.bin> <disk_graph.index> "
|
||
"(Calculates degree statistics)\n";
|
||
return 1;
|
||
}
|
||
|
||
// Check if the first argument is the stats flag
|
||
if (std::string(argv[1]) == "--stats") {
|
||
calculate_stats = true;
|
||
partition_bin = argv[2];
|
||
graph_index = argv[3];
|
||
std::cout << "Mode: Calculating Degree Statistics\n";
|
||
} else {
|
||
// Assume default mode (single node lookup)
|
||
calculate_stats = false;
|
||
partition_bin = argv[1];
|
||
graph_index = argv[2];
|
||
try { // Add error handling for stoull
|
||
target_node_id = std::stoull(argv[3]);
|
||
} catch (const std::invalid_argument &ia) {
|
||
std::cerr << "Error: Invalid target_node_id: " << argv[3] << std::endl;
|
||
return 1;
|
||
} catch (const std::out_of_range &oor) {
|
||
std::cerr << "Error: target_node_id out of range: " << argv[3]
|
||
<< std::endl;
|
||
return 1;
|
||
}
|
||
std::cout << "Mode: Single Node Lookup for Node ID " << target_node_id
|
||
<< "\n";
|
||
}
|
||
|
||
// 1) 读取 partition.bin
|
||
std::ifstream pf(partition_bin, std::ios::binary);
|
||
if (!pf.is_open()) {
|
||
std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
|
||
return 1;
|
||
}
|
||
uint64_t C, partition_nums, nd;
|
||
READ_U64(pf, C);
|
||
READ_U64(pf, partition_nums);
|
||
READ_U64(pf, nd);
|
||
std::cout << "[partition.bin header] C=" << C
|
||
<< ", partition_nums=" << partition_nums << ", nd=" << nd
|
||
<< std::endl;
|
||
|
||
// 读取分区节点列表
|
||
std::vector<std::vector<uint32_t>> graph_partitions(partition_nums);
|
||
for (uint64_t i = 0; i < partition_nums; i++) {
|
||
uint32_t psize;
|
||
READ_U32(pf, psize);
|
||
graph_partitions[i].resize(psize);
|
||
pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
|
||
psize * sizeof(uint32_t));
|
||
}
|
||
// 读取 _id2partition[node], 大小= nd
|
||
std::vector<uint32_t> id2partition(nd);
|
||
pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
|
||
pf.close();
|
||
std::cout << "Done loading partition info.\n";
|
||
|
||
if (target_node_id >= nd) {
|
||
std::cerr << "target_node_id=" << target_node_id
|
||
<< " out of range nd=" << nd << std::endl;
|
||
return 1;
|
||
}
|
||
|
||
// 2) 解析 _disk_graph.index
|
||
std::ifstream gf(graph_index, std::ios::binary);
|
||
if (!gf.is_open()) {
|
||
std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
|
||
return 1;
|
||
}
|
||
// (a) sector0 => 先读 2个 int
|
||
int meta_n, meta_dim;
|
||
gf.read((char *)&meta_n, sizeof(int));
|
||
gf.read((char *)&meta_dim, sizeof(int));
|
||
std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
|
||
|
||
// (b) 读 meta_n个 uint64_t
|
||
std::vector<uint64_t> meta_info(meta_n);
|
||
gf.read(reinterpret_cast<char *>(meta_info.data()),
|
||
meta_n * sizeof(uint64_t));
|
||
// 打印
|
||
for (int i = 0; i < meta_n; i++) {
|
||
std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
|
||
}
|
||
|
||
size_t file_size = get_file_size(graph_index);
|
||
std::cout << "[disk_graph.index size] " << file_size << " bytes\n";
|
||
|
||
// **根据 relayout log** 你说: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
|
||
// meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
|
||
// meta_info[4]=16 (C),
|
||
// meta_info[8]= 15475261440(文件大小)
|
||
// 我们这里先手动解析:
|
||
uint64_t nd_in_meta = meta_info[0];
|
||
uint64_t dim_in_meta = meta_info[1];
|
||
uint64_t max_node_len = meta_info[3];
|
||
uint64_t c_in_meta = meta_info[4];
|
||
uint64_t entire_file_sz = meta_info[8];
|
||
|
||
std::cout << "Based on meta_info:\n"
|
||
<< " nd_in_meta= " << nd_in_meta
|
||
<< ", dim_in_meta= " << dim_in_meta
|
||
<< ", max_node_len= " << max_node_len
|
||
<< ", c_in_meta= " << c_in_meta
|
||
<< ", entire_file_size= " << entire_file_sz << "\n";
|
||
|
||
// 计算 graph_node_len
|
||
uint64_t dim_size = dim_in_meta * sizeof(float);
|
||
uint64_t graph_node_len = max_node_len - dim_size;
|
||
std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
|
||
|
||
if (calculate_stats) {
|
||
// --- Degree Statistics Calculation Mode ---
|
||
std::cout << " Calculated graph_node_len = " << graph_node_len << "\n\n";
|
||
|
||
if (nd == 0) {
|
||
std::cerr << "Graph has 0 nodes (nd=0). Cannot calculate stats."
|
||
<< std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
|
||
uint32_t min_degree = std::numeric_limits<uint32_t>::max();
|
||
uint32_t max_degree = 0;
|
||
uint64_t total_degree = 0;
|
||
uint64_t nodes_processed = 0;
|
||
std::vector<char> sectorBuf(SECTOR_SIZE);
|
||
|
||
std::cout << "Calculating degrees for " << nd << " nodes across "
|
||
<< partition_nums << " partitions..." << std::endl;
|
||
|
||
for (uint32_t p = 0; p < partition_nums; ++p) {
|
||
uint64_t sector_offset = uint64_t(p + 1) * SECTOR_SIZE;
|
||
gf.seekg(sector_offset, std::ios::beg);
|
||
if (gf.fail()) {
|
||
std::cerr << "Error seeking to sector offset for partition " << p
|
||
<< std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
gf.read(sectorBuf.data(), SECTOR_SIZE);
|
||
if (gf.fail() && !gf.eof()) {
|
||
std::cerr << "Error reading sector data for partition " << p
|
||
<< std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
gf.clear(); // Reset fail bits
|
||
|
||
const auto &part_list = graph_partitions[p];
|
||
for (size_t j = 0; j < part_list.size(); ++j) {
|
||
uint64_t node_offset = j * graph_node_len;
|
||
if (node_offset + sizeof(uint32_t) > SECTOR_SIZE) {
|
||
std::cerr << "Error: Node offset out of sector bounds.\n"
|
||
<< " Partition=" << p << ", node_subIndex=" << j
|
||
<< ", node_offset=" << node_offset
|
||
<< ", graph_node_len=" << graph_node_len << std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
char *adjacency_ptr = sectorBuf.data() + node_offset;
|
||
uint32_t neighbor_count = *reinterpret_cast<uint32_t *>(adjacency_ptr);
|
||
min_degree = std::min(min_degree, neighbor_count);
|
||
max_degree = std::max(max_degree, neighbor_count);
|
||
total_degree += neighbor_count;
|
||
nodes_processed++;
|
||
}
|
||
if (p % 10 == 0 || p == partition_nums - 1) {
|
||
std::cout << " Processed partition " << p + 1 << " / "
|
||
<< partition_nums << "...\r" << std::flush;
|
||
}
|
||
}
|
||
std::cout << "\nFinished processing partitions." << std::endl;
|
||
|
||
if (nodes_processed != nd) {
|
||
std::cerr << "Warning: Processed " << nodes_processed
|
||
<< " nodes, but expected " << nd << std::endl;
|
||
}
|
||
|
||
double avg_degree = (nd > 0) ? static_cast<double>(total_degree) / nd : 0.0;
|
||
std::cout << "\n--- Degree Statistics ---\n";
|
||
std::cout << "Min Degree: "
|
||
<< (min_degree == std::numeric_limits<uint32_t>::max()
|
||
? 0
|
||
: min_degree)
|
||
<< std::endl; // Handle case of 0 nodes
|
||
std::cout << "Max Degree: " << max_degree << std::endl;
|
||
std::cout << "Avg Degree: " << std::fixed << std::setprecision(2)
|
||
<< avg_degree << std::endl;
|
||
std::cout << "Total Degree (Sum): " << total_degree << std::endl;
|
||
std::cout << "Nodes Processed: " << nodes_processed << std::endl;
|
||
|
||
} else {
|
||
uint64_t nd_in_meta = meta_info[0];
|
||
uint64_t c_in_meta = meta_info[4];
|
||
uint64_t entire_file_sz = meta_info[8];
|
||
std::cout << "Based on meta_info:\n"
|
||
<< " nd_in_meta= " << nd_in_meta
|
||
<< ", dim_in_meta= " << dim_in_meta
|
||
<< ", max_node_len= " << max_node_len
|
||
<< ", c_in_meta= " << c_in_meta
|
||
<< ", entire_file_size= " << entire_file_sz << "\n";
|
||
std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
|
||
|
||
if (target_node_id >= nd) {
|
||
std::cerr << "target_node_id=" << target_node_id
|
||
<< " out of range nd=" << nd << std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
|
||
// We need id2partition only for single-node lookup
|
||
std::vector<uint32_t> id2partition(nd);
|
||
{ // Read id2partition again as it was skipped before
|
||
std::ifstream pf_again(partition_bin, std::ios::binary);
|
||
uint64_t header_offset =
|
||
3 * sizeof(uint64_t); // Skip C, partition_nums, nd
|
||
uint64_t partition_list_offset = 0;
|
||
for (uint64_t i = 0; i < partition_nums; i++) {
|
||
partition_list_offset += sizeof(uint32_t); // Size field
|
||
partition_list_offset +=
|
||
graph_partitions[i].size() * sizeof(uint32_t); // Data
|
||
}
|
||
pf_again.seekg(header_offset + partition_list_offset, std::ios::beg);
|
||
pf_again.read(reinterpret_cast<char *>(id2partition.data()),
|
||
nd * sizeof(uint32_t));
|
||
// Error check pf_again if needed
|
||
}
|
||
|
||
// 3) 找 target_node_id => partition_id => subIndex
|
||
uint32_t partition_id = id2partition[target_node_id];
|
||
if (partition_id >= partition_nums) {
|
||
std::cerr << "Partition ID out-of-range for target node.\n";
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
const auto &part_list = graph_partitions[partition_id]; // Use const ref
|
||
auto it =
|
||
std::find(part_list.begin(), part_list.end(), (uint32_t)target_node_id);
|
||
if (it == part_list.end()) {
|
||
std::cerr << "Cannot find node " << target_node_id << " in partition "
|
||
<< partition_id << std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
size_t j = std::distance(part_list.begin(), it);
|
||
|
||
// 4) sector => (partition_id+1)* 4096
|
||
uint64_t sector_offset = uint64_t(partition_id + 1) * SECTOR_SIZE;
|
||
gf.seekg(sector_offset, std::ios::beg);
|
||
std::vector<char> sectorBuf(SECTOR_SIZE);
|
||
gf.read(sectorBuf.data(), SECTOR_SIZE);
|
||
if (gf.fail() && !gf.eof()) {
|
||
std::cerr << "Error reading sector data for partition " << partition_id
|
||
<< std::endl;
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
gf.clear(); // Reset fail bits
|
||
|
||
std::cout << "Partition #" << partition_id
|
||
<< ", nodeCount= " << part_list.size()
|
||
<< ", offset= " << sector_offset << "\n"
|
||
<< " first64 hex:\n ";
|
||
print_hex(sectorBuf.data(), SECTOR_SIZE, 64);
|
||
|
||
// adjacency_offset= j* graph_node_len
|
||
uint64_t node_offset = j * graph_node_len;
|
||
if (node_offset + sizeof(uint32_t) >
|
||
SECTOR_SIZE) { // Check only for neighbor_count read first
|
||
std::cerr << "Out-of-range. j=" << j << ", node_offset=" << node_offset
|
||
<< ", node_offset+4=" << (node_offset + sizeof(uint32_t))
|
||
<< "> 4096\n";
|
||
gf.close();
|
||
return 1;
|
||
}
|
||
|
||
char *adjacency_ptr = sectorBuf.data() + node_offset;
|
||
uint32_t neighbor_count = *reinterpret_cast<uint32_t *>(adjacency_ptr);
|
||
std::cout << "[Node " << target_node_id << "] partition=" << partition_id
|
||
<< ", subIndex=" << j << ", adjacency_offset=" << node_offset
|
||
<< ", neighbor_count=" << neighbor_count << "\n";
|
||
|
||
size_t needed = neighbor_count * sizeof(uint32_t);
|
||
if (node_offset + sizeof(uint32_t) + needed > SECTOR_SIZE) {
|
||
std::cerr << "Neighbors partly out-of-range => neighbor_count="
|
||
<< neighbor_count << "\n";
|
||
// Option: Can still print partial list if needed, but indicating it's
|
||
// truncated
|
||
gf.close();
|
||
return 1; // Or handle differently
|
||
}
|
||
std::vector<uint32_t> neighbors(neighbor_count);
|
||
memcpy(neighbors.data(), adjacency_ptr + sizeof(uint32_t), needed);
|
||
|
||
std::cout << " neighbors=[";
|
||
for (size_t kk = 0; kk < std::min<size_t>(10, neighbor_count); kk++) {
|
||
std::cout << neighbors[kk];
|
||
if (kk + 1 < std::min<size_t>(10, neighbor_count))
|
||
std::cout << ", ";
|
||
}
|
||
if (neighbor_count > 10)
|
||
std::cout << " ... (total " << neighbor_count << ")";
|
||
std::cout << "]\n";
|
||
} // End of else (single node lookup mode)
|
||
|
||
gf.close();
|
||
return 0;
|
||
} |