LEANN/research/utils/generate_dataset_cache.fish

#!/usr/bin/env fish
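# Generates retrieval caches for a set of QA datasets by running demo/main.py
# with a flat index for each task.
#
# Example invocation (illustrative; every flag is optional and falls back to
# the defaults set below):
#   ./generate_dataset_cache.fish --domain rpj_wiki --k 5 --tasks nq trivia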
# Set default parameters
set domain "rpj_wiki"
set embedder "facebook/contriever-msmarco"
set k 5
set tasks "nq" "trivia" "hotpot" "gpqa"
# Parse command line arguments
for i in (seq 1 (count $argv))
    switch $argv[$i]
        case "--domain"
            set domain $argv[(math $i + 1)]
        case "--embedder"
            set embedder $argv[(math $i + 1)]
        case "--k"
            set k $argv[(math $i + 1)]
        case "--tasks"
            # Collect every value after --tasks until the next --flag (or the end
            # of the arguments), e.g. "--tasks nq trivia" sets tasks to (nq trivia)
            set j (math $i + 1)
            set tasks
            while test $j -le (count $argv) && not string match -q -- "--*" $argv[$j]
                set -a tasks $argv[$j]
                set j (math $j + 1)
            end
    end
end
echo "Running with the following parameters:"
echo "Domain: $domain"
echo "Embedder: $embedder"
echo "k: $k"
echo "Datasets: $tasks"
# Create directory for results
set results_dir "retrieval_results"
mkdir -p $results_dir
# Process each dataset using retrieval_demo directly
for task in $tasks
    echo ""
    echo "===== Processing dataset: $task ====="

    # Step 1: Run retrieval_demo with flat index to generate cache and get results
    echo "Running retrieval for $task..."
    echo "python demo/main.py --domain $domain --task $task --search --load flat --lazy"
    python demo/main.py --domain $domain --task $task --search --load flat --lazy

    # Check if successful
    if test $status -ne 0
        echo "Retrieval for $task failed"
        continue
    end

    echo "Completed processing for $task"
    echo "--------------------------------"
end
echo "All operations completed successfully!"
echo "The cache files have been created at the locations specified by get_flat_cache_path() in config.py"
echo "You can now use test_all_datasets.py to view the results"