Files
LEANN/research/utils/demo_reader.cpp
yichuan520030910320 e728449b8f change chinese
2025-07-19 19:54:02 -07:00

387 lines
14 KiB
C++

/*
Run with
g++ ./demo_reader.cpp -o ./demo_reader && ./demo_reader --stats \
/powerrag/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/diskann/_partition.bin
\
/powerrag/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/diskann/_disk_graph.index
*/
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits> // Include for std::numeric_limits
#include <string> // Include for std::string comparison
#include <vector>
#define READ_U64(f, val) \
f.read(reinterpret_cast<char *>(&val), sizeof(uint64_t))
#define READ_U32(f, val) \
f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
#define SECTOR_SIZE 4096
// Helper: Get file size
static size_t get_file_size(const std::string &fname) {
std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
if (ifs.fail() || !ifs.is_open()) {
return 0;
}
return static_cast<size_t>(ifs.tellg());
}
// Print first few hex of sector for debug
static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
size_t show_len = (len < max_len) ? len : max_len;
for (size_t i = 0; i < show_len; i++) {
unsigned char c = (unsigned char)buf[i];
std::cout << std::hex << std::setw(2) << std::setfill('0') << (unsigned)c
<< " ";
if ((i + 1) % 16 == 0)
std::cout << "\n ";
}
std::cout << std::dec << "\n";
}
/*
Corrected demo_reader:
1) Read from partition.bin:
- C, partition_nums, nd
- graph_partitions[i]: all nodeIDs in partition i
- id2partition[nodeID]: nodeID => partition i
2) Read from _disk_graph.index:
a) sector0 first has 2 ints: meta_n, meta_dim
b) then meta_n uint64_t
e.g.: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
[8]=file_size... specific positions need to be combined with relayout writing c) graph_node_len =
max_node_len - dim_in_meta*sizeof(float) 3) User given target_node_id =>
partition_id= id2partition[node_id]
find node index j in graph_partitions[partition_id]
offset = (partition_id+1)*4096 => sector
adjacency_offset= j*graph_node_len => neighbor_count => neighbors
*/
int main(int argc, char **argv) {
bool calculate_stats = false;
// int arg_offset = 0; // Offset for positional arguments
std::string partition_bin;
std::string graph_index;
uint64_t target_node_id = 0; // Initialize
if (argc != 4) {
std::cerr << "Usage:\n"
<< " " << argv[0]
<< " <partition.bin> <disk_graph.index> <target_node_id> (Reads "
"adjacency for a node)\n"
<< " " << argv[0]
<< " --stats <partition.bin> <disk_graph.index> "
"(Calculates degree statistics)\n";
return 1;
}
// Check if the first argument is the stats flag
if (std::string(argv[1]) == "--stats") {
calculate_stats = true;
partition_bin = argv[2];
graph_index = argv[3];
std::cout << "Mode: Calculating Degree Statistics\n";
} else {
// Assume default mode (single node lookup)
calculate_stats = false;
partition_bin = argv[1];
graph_index = argv[2];
try { // Add error handling for stoull
target_node_id = std::stoull(argv[3]);
} catch (const std::invalid_argument &ia) {
std::cerr << "Error: Invalid target_node_id: " << argv[3] << std::endl;
return 1;
} catch (const std::out_of_range &oor) {
std::cerr << "Error: target_node_id out of range: " << argv[3]
<< std::endl;
return 1;
}
std::cout << "Mode: Single Node Lookup for Node ID " << target_node_id
<< "\n";
}
// 1) Read partition.bin
std::ifstream pf(partition_bin, std::ios::binary);
if (!pf.is_open()) {
std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
return 1;
}
uint64_t C, partition_nums, nd;
READ_U64(pf, C);
READ_U64(pf, partition_nums);
READ_U64(pf, nd);
std::cout << "[partition.bin header] C=" << C
<< ", partition_nums=" << partition_nums << ", nd=" << nd
<< std::endl;
// Read partition node lists
std::vector<std::vector<uint32_t> > graph_partitions(partition_nums);
for (uint64_t i = 0; i < partition_nums; i++) {
uint32_t psize;
READ_U32(pf, psize);
graph_partitions[i].resize(psize);
pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
psize * sizeof(uint32_t));
}
// Read _id2partition[node], size= nd
std::vector<uint32_t> id2partition(nd);
pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
pf.close();
std::cout << "Done loading partition info.\n";
if (target_node_id >= nd) {
std::cerr << "target_node_id=" << target_node_id
<< " out of range nd=" << nd << std::endl;
return 1;
}
// 2) Parse _disk_graph.index
std::ifstream gf(graph_index, std::ios::binary);
if (!gf.is_open()) {
std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
return 1;
}
// (a) sector0 => first read 2 ints
int meta_n, meta_dim;
gf.read((char *)&meta_n, sizeof(int));
gf.read((char *)&meta_dim, sizeof(int));
std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";
// (b) Read meta_n uint64_t
std::vector<uint64_t> meta_info(meta_n);
gf.read(reinterpret_cast<char *>(meta_info.data()),
meta_n * sizeof(uint64_t));
// Print
for (int i = 0; i < meta_n; i++) {
std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
}
size_t file_size = get_file_size(graph_index);
std::cout << "[disk_graph.index size] " << file_size << " bytes\n";
// **According to relayout log** you said: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
// meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
// meta_info[4]=16 (C),
// meta_info[8]= 15475261440(file size)
// We manually parse here first:
uint64_t nd_in_meta = meta_info[0];
uint64_t dim_in_meta = meta_info[1];
uint64_t max_node_len = meta_info[3];
uint64_t c_in_meta = meta_info[4];
uint64_t entire_file_sz = meta_info[8];
std::cout << "Based on meta_info:\n"
<< " nd_in_meta= " << nd_in_meta
<< ", dim_in_meta= " << dim_in_meta
<< ", max_node_len= " << max_node_len
<< ", c_in_meta= " << c_in_meta
<< ", entire_file_size= " << entire_file_sz << "\n";
// Calculate graph_node_len
uint64_t dim_size = dim_in_meta * sizeof(float);
uint64_t graph_node_len = max_node_len - dim_size;
std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
if (calculate_stats) {
// --- Degree Statistics Calculation Mode ---
std::cout << " Calculated graph_node_len = " << graph_node_len << "\n\n";
if (nd == 0) {
std::cerr << "Graph has 0 nodes (nd=0). Cannot calculate stats."
<< std::endl;
gf.close();
return 1;
}
uint32_t min_degree = std::numeric_limits<uint32_t>::max();
uint32_t max_degree = 0;
uint64_t total_degree = 0;
uint64_t nodes_processed = 0;
std::vector<char> sectorBuf(SECTOR_SIZE);
std::cout << "Calculating degrees for " << nd << " nodes across "
<< partition_nums << " partitions..." << std::endl;
for (uint32_t p = 0; p < partition_nums; ++p) {
uint64_t sector_offset = uint64_t(p + 1) * SECTOR_SIZE;
gf.seekg(sector_offset, std::ios::beg);
if (gf.fail()) {
std::cerr << "Error seeking to sector offset for partition " << p
<< std::endl;
gf.close();
return 1;
}
gf.read(sectorBuf.data(), SECTOR_SIZE);
if (gf.fail() && !gf.eof()) {
std::cerr << "Error reading sector data for partition " << p
<< std::endl;
gf.close();
return 1;
}
gf.clear(); // Reset fail bits
const auto &part_list = graph_partitions[p];
for (size_t j = 0; j < part_list.size(); ++j) {
uint64_t node_offset = j * graph_node_len;
if (node_offset + sizeof(uint32_t) > SECTOR_SIZE) {
std::cerr << "Error: Node offset out of sector bounds.\n"
<< " Partition=" << p << ", node_subIndex=" << j
<< ", node_offset=" << node_offset
<< ", graph_node_len=" << graph_node_len << std::endl;
gf.close();
return 1;
}
char *adjacency_ptr = sectorBuf.data() + node_offset;
uint32_t neighbor_count = *reinterpret_cast<uint32_t *>(adjacency_ptr);
min_degree = std::min(min_degree, neighbor_count);
max_degree = std::max(max_degree, neighbor_count);
total_degree += neighbor_count;
nodes_processed++;
}
if (p % 10 == 0 || p == partition_nums - 1) {
std::cout << " Processed partition " << p + 1 << " / "
<< partition_nums << "...\r" << std::flush;
}
}
std::cout << "\nFinished processing partitions." << std::endl;
if (nodes_processed != nd) {
std::cerr << "Warning: Processed " << nodes_processed
<< " nodes, but expected " << nd << std::endl;
}
double avg_degree = (nd > 0) ? static_cast<double>(total_degree) / nd : 0.0;
std::cout << "\n--- Degree Statistics ---\n";
std::cout << "Min Degree: "
<< (min_degree == std::numeric_limits<uint32_t>::max()
? 0
: min_degree)
<< std::endl; // Handle case of 0 nodes
std::cout << "Max Degree: " << max_degree << std::endl;
std::cout << "Avg Degree: " << std::fixed << std::setprecision(2)
<< avg_degree << std::endl;
std::cout << "Total Degree (Sum): " << total_degree << std::endl;
std::cout << "Nodes Processed: " << nodes_processed << std::endl;
} else {
uint64_t nd_in_meta = meta_info[0];
uint64_t c_in_meta = meta_info[4];
uint64_t entire_file_sz = meta_info[8];
std::cout << "Based on meta_info:\n"
<< " nd_in_meta= " << nd_in_meta
<< ", dim_in_meta= " << dim_in_meta
<< ", max_node_len= " << max_node_len
<< ", c_in_meta= " << c_in_meta
<< ", entire_file_size= " << entire_file_sz << "\n";
std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
if (target_node_id >= nd) {
std::cerr << "target_node_id=" << target_node_id
<< " out of range nd=" << nd << std::endl;
gf.close();
return 1;
}
// We need id2partition only for single-node lookup
std::vector<uint32_t> id2partition(nd);
{ // Read id2partition again as it was skipped before
std::ifstream pf_again(partition_bin, std::ios::binary);
uint64_t header_offset =
3 * sizeof(uint64_t); // Skip C, partition_nums, nd
uint64_t partition_list_offset = 0;
for (uint64_t i = 0; i < partition_nums; i++) {
partition_list_offset += sizeof(uint32_t); // Size field
partition_list_offset +=
graph_partitions[i].size() * sizeof(uint32_t); // Data
}
pf_again.seekg(header_offset + partition_list_offset, std::ios::beg);
pf_again.read(reinterpret_cast<char *>(id2partition.data()),
nd * sizeof(uint32_t));
// Error check pf_again if needed
}
// 3) Find target_node_id => partition_id => subIndex
uint32_t partition_id = id2partition[target_node_id];
if (partition_id >= partition_nums) {
std::cerr << "Partition ID out-of-range for target node.\n";
gf.close();
return 1;
}
const auto &part_list = graph_partitions[partition_id]; // Use const ref
auto it =
std::find(part_list.begin(), part_list.end(), (uint32_t)target_node_id);
if (it == part_list.end()) {
std::cerr << "Cannot find node " << target_node_id << " in partition "
<< partition_id << std::endl;
gf.close();
return 1;
}
size_t j = std::distance(part_list.begin(), it);
// 4) sector => (partition_id+1)* 4096
uint64_t sector_offset = uint64_t(partition_id + 1) * SECTOR_SIZE;
gf.seekg(sector_offset, std::ios::beg);
std::vector<char> sectorBuf(SECTOR_SIZE);
gf.read(sectorBuf.data(), SECTOR_SIZE);
if (gf.fail() && !gf.eof()) {
std::cerr << "Error reading sector data for partition " << partition_id
<< std::endl;
gf.close();
return 1;
}
gf.clear(); // Reset fail bits
std::cout << "Partition #" << partition_id
<< ", nodeCount= " << part_list.size()
<< ", offset= " << sector_offset << "\n"
<< " first64 hex:\n ";
print_hex(sectorBuf.data(), SECTOR_SIZE, 64);
// adjacency_offset= j* graph_node_len
uint64_t node_offset = j * graph_node_len;
if (node_offset + sizeof(uint32_t) >
SECTOR_SIZE) { // Check only for neighbor_count read first
std::cerr << "Out-of-range. j=" << j << ", node_offset=" << node_offset
<< ", node_offset+4=" << (node_offset + sizeof(uint32_t))
<< "> 4096\n";
gf.close();
return 1;
}
char *adjacency_ptr = sectorBuf.data() + node_offset;
uint32_t neighbor_count = *reinterpret_cast<uint32_t *>(adjacency_ptr);
std::cout << "[Node " << target_node_id << "] partition=" << partition_id
<< ", subIndex=" << j << ", adjacency_offset=" << node_offset
<< ", neighbor_count=" << neighbor_count << "\n";
size_t needed = neighbor_count * sizeof(uint32_t);
if (node_offset + sizeof(uint32_t) + needed > SECTOR_SIZE) {
std::cerr << "Neighbors partly out-of-range => neighbor_count="
<< neighbor_count << "\n";
// Option: Can still print partial list if needed, but indicating it's
// truncated
gf.close();
return 1; // Or handle differently
}
std::vector<uint32_t> neighbors(neighbor_count);
memcpy(neighbors.data(), adjacency_ptr + sizeof(uint32_t), needed);
std::cout << " neighbors=[";
for (size_t kk = 0; kk < std::min<size_t>(10, neighbor_count); kk++) {
std::cout << neighbors[kk];
if (kk + 1 < std::min<size_t>(10, neighbor_count))
std::cout << ", ";
}
if (neighbor_count > 10)
std::cout << " ... (total " << neighbor_count << ")";
std::cout << "]\n";
} // End of else (single node lookup mode)
gf.close();
return 0;
}