34 lines
831 B
Python
Executable File
34 lines
831 B
Python
Executable File
import json
|
|
import os
|
|
|
|
|
|
# Source JSONL that is read line by line further down; each line is parsed
# as one JSON record.
input_file = "/gscratch/zlab/rulins/data/lm-eval-data/raw_mmlu.jsonl"
# Destination JSONL that receives the records after duplicates are removed.
output_file = "/gscratch/zlab/rulins/data/lm-eval-data/mmlu.jsonl"
|
|
|
|
|
|
# Load every record from the input JSONL into memory.
# - The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
#   decoding does not depend on the locale of the machine running the script.
# - Blank / whitespace-only lines (e.g. a trailing newline at end of file)
#   are skipped instead of crashing json.loads with a JSONDecodeError.
with open(input_file, "r", encoding="utf-8") as fin:
    raw_data = [json.loads(line) for line in fin if line.strip()]
|
|
|
|
|
|
def _freeze(value):
    """Return a hashable stand-in for *value*, recursing into containers.

    Dicts and lists are tagged with their type so that, for example, a list
    and a tuple holding the same elements still produce different keys
    (mirroring Python's own ``[1] != (1,)``).
    """
    if isinstance(value, dict):
        # Sort on the key alone so that values never need to be orderable.
        frozen_items = sorted(
            ((k, _freeze(v)) for k, v in value.items()),
            key=lambda kv: kv[0],
        )
        return (dict, tuple(frozen_items))
    if isinstance(value, list):
        return (list, tuple(_freeze(v) for v in value))
    if isinstance(value, tuple):
        # A tuple is only hashable if its elements are, so freeze them too.
        return tuple(_freeze(v) for v in value)
    if isinstance(value, set):
        return frozenset(value)
    return value


def deduplicate_dicts(dict_list):
    """Remove duplicate dictionaries while preserving first-seen order.

    Two dictionaries are duplicates when they hold the same key/value pairs,
    regardless of key insertion order. Values may be nested lists or dicts
    (as produced by ``json.loads``); the previous implementation raised
    ``TypeError: unhashable type`` for such records.

    Args:
        dict_list: Iterable of dictionaries. Keys within one dictionary must
            be mutually orderable (always true for JSON objects, whose keys
            are strings).

    Returns:
        A new list containing the first occurrence of each distinct
        dictionary, in input order. The dictionaries themselves are not
        copied.
    """
    seen = set()
    unique_items = []
    for item in dict_list:
        # Build an order-insensitive, hashable fingerprint of the record.
        fingerprint = _freeze(item)
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_items.append(item)
    return unique_items
|
|
|
|
|
|
# Collapse repeated records, keeping the first occurrence of each.
unique_data = deduplicate_dicts(raw_data)
# Report how many distinct records remain.
print(len(unique_data))

# Emit the surviving records as JSONL: one JSON document per line.
with open(output_file, "w") as fout:
    fout.writelines(json.dumps(ex) + "\n" for ex in unique_data)
|