import os import time import json import numpy as np import multiprocessing def subsample_jsonl_random(input_file_path, output_file_path, ratio=0.1, seed=42): """ Subsamples 10% of the data from a JSONL file efficiently. Args: input_file_path (str): Path to the input JSONL file. output_file_path (str): Path to the output JSONL file where the subsample will be saved. seed (int): Seed for the random number generator to ensure reproducibility. """ start_time = time.time() # First pass: count the number of lines in the file line_count = 0 with open(input_file_path, "r", encoding="utf-8") as file: for _ in file: line_count += 1 print(f"Total lines: {line_count}") # Calculate indices for 10% sample np.random.seed(seed) sample_size = int(line_count * ratio) selected_indices = set(np.random.choice(line_count, sample_size, replace=False)) # Second pass: write the selected lines to the output file print(f"Subsampling {sample_size} lines") current_index = 0 with ( open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, "w", encoding="utf-8") as output_file, ): for line in input_file: if current_index in selected_indices: output_file.write(line) current_index += 1 end_time = time.time() print( f"Time: {(end_time - start_time) / 60:.2f} minutes\tRaw Size: {line_count}\t Sampled Size: {sample_size}" ) if __name__ == "__main__": # input_dir = '/mnt/md-256k/redpajama_v1/common_crawl_2023_06' # output_dir = '/mnt/md-256k/massive_ds_data/subsampled_0.1/rpj_common_crawl_2023_06' input_dir = "/mnt/md-256k/massive_ds_data/full/dpr_wiki" output_dir = "/mnt/md-256k/massive_ds_data/subsampled_0.1/dpr_wiki" os.makedirs(output_dir, exist_ok=True) for filename in os.listdir(input_dir): input_path = os.path.join(input_dir, filename) output_path = os.path.join(output_dir, filename) subsample_jsonl_random(input_path, output_path)