Files
LEANN/research/utils/subsample_data_new.py
yichuan520030910320 46f6cc100b Initial commit
2025-06-30 09:05:05 +00:00

60 lines
2.1 KiB
Python
Executable File

import os
import time
import json
import numpy as np
import multiprocessing
def subsample_jsonl_random(input_file_path, output_file_path, ratio=0.1, seed=42):
    """
    Randomly subsample a fraction of lines from a JSONL file in two passes.

    The first pass counts the lines; the second writes only the randomly
    selected ones, so memory use is O(sample_size) regardless of file size.

    Args:
        input_file_path (str): Path to the input JSONL file.
        output_file_path (str): Path to the output JSONL file where the
            subsample will be saved.
        ratio (float): Fraction of lines to keep (default 0.1, i.e. 10%).
        seed (int): Seed for the random number generator to ensure
            reproducibility.
    """
    start_time = time.time()

    # First pass: count the number of lines in the file.
    with open(input_file_path, "r", encoding="utf-8") as file:
        line_count = sum(1 for _ in file)
    print(f"Total lines: {line_count}")

    # Choose which line indices to keep. The legacy np.random.seed/choice API
    # is kept deliberately so the same seed reproduces previous samples.
    np.random.seed(seed)
    sample_size = int(line_count * ratio)
    selected_indices = set(np.random.choice(line_count, sample_size, replace=False))

    # Second pass: copy the selected lines, stopping early once every
    # selected line has been written (avoids scanning the tail of huge files).
    print(f"Subsampling {sample_size} lines")
    written = 0
    with (
        open(input_file_path, "r", encoding="utf-8") as input_file,
        open(output_file_path, "w", encoding="utf-8") as output_file,
    ):
        for current_index, line in enumerate(input_file):
            if current_index in selected_indices:
                output_file.write(line)
                written += 1
                if written == sample_size:
                    break

    end_time = time.time()
    print(
        f"Time: {(end_time - start_time) / 60:.2f} minutes\tRaw Size: {line_count}\t Sampled Size: {sample_size}"
    )
if __name__ == "__main__":
    # Alternative dataset paths (kept for reference):
    # input_dir = '/mnt/md-256k/redpajama_v1/common_crawl_2023_06'
    # output_dir = '/mnt/md-256k/massive_ds_data/subsampled_0.1/rpj_common_crawl_2023_06'
    input_dir = "/mnt/md-256k/massive_ds_data/full/dpr_wiki"
    output_dir = "/mnt/md-256k/massive_ds_data/subsampled_0.1/dpr_wiki"

    os.makedirs(output_dir, exist_ok=True)

    # Subsample every file in the input directory, mirroring each filename
    # into the output directory.
    for entry in os.listdir(input_dir):
        subsample_jsonl_random(
            os.path.join(input_dir, entry),
            os.path.join(output_dir, entry),
        )