Files
yichuan520030910320 46f6cc100b Initial commit
2025-06-30 09:05:05 +00:00

34 lines
831 B
Python
Executable File

import json
import os
# Source JSONL file (one JSON object per line) and the destination path used
# later in this script.
input_file = "/gscratch/zlab/rulins/data/lm-eval-data/raw_mmlu.jsonl"
output_file = "/gscratch/zlab/rulins/data/lm-eval-data/mmlu.jsonl"

# Parse every record up front. Whitespace-only lines (for example a trailing
# blank line at the end of the file) are skipped because json.loads() raises
# on empty input. The encoding is pinned to UTF-8 so parsing does not depend
# on the locale of the machine running the script.
with open(input_file, "r", encoding="utf-8") as fin:
    raw_data = [json.loads(line) for line in fin if line.strip()]
def _freeze(value):
    """Return a hashable stand-in for *value*, recursing into containers.

    Dicts, lists, tuples and sets are rebuilt as tuples/frozensets tagged with
    their container kind, so that e.g. ``[1, 2]`` and ``(1, 2)`` do not
    collapse onto the same key. Any other value is returned unchanged and is
    expected to be hashable already.
    """
    if isinstance(value, dict):
        # Order by key only, so insertion order never affects the result and
        # the (possibly incomparable) values are never compared.
        pairs = sorted(
            ((key, _freeze(val)) for key, val in value.items()),
            key=lambda pair: pair[0],
        )
        return (dict, tuple(pairs))
    if isinstance(value, list):
        return (list, tuple(_freeze(element) for element in value))
    if isinstance(value, tuple):
        return (tuple, tuple(_freeze(element) for element in value))
    if isinstance(value, (set, frozenset)):
        return (frozenset, frozenset(_freeze(element) for element in value))
    return value


def deduplicate_dicts(dict_list):
    """Return the items of *dict_list* with duplicates removed.

    Two items are duplicates when they hold equal key/value pairs, regardless
    of key order. The first occurrence of each distinct item is kept and the
    relative order of the survivors is preserved. Values may be nested lists,
    tuples, sets or dicts; they are compared structurally.

    Args:
        dict_list: Iterable of dictionaries to deduplicate. It is not
            modified.

    Returns:
        A new list containing the same dictionary objects, minus repeats.

    Raises:
        TypeError: If the keys of a dictionary cannot be ordered against each
            other, or a non-container value is unhashable.
    """
    seen = set()
    unique_items = []
    for item in dict_list:
        # A plain dict is unhashable, so membership is tracked through a
        # canonical, order-independent key instead.
        fingerprint = _freeze(item)
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_items.append(item)
    return unique_items
# Collapse repeated records, report how many distinct ones remain, then write
# the survivors out as one JSON object per line.
unique_data = deduplicate_dicts(raw_data)
print(len(unique_data))
with open(output_file, "w") as fout:
    fout.writelines(json.dumps(record) + "\n" for record in unique_data)