-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
88 lines (65 loc) · 3.55 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import argparse
import tqdm
import numpy as np
import os
import datasets
import pandas as pd
from evaluate.solution_judge import SolutionJudge
# ---- Command-line configuration -------------------------------------------
parser = argparse.ArgumentParser(description='Use the judge to get scores.')
parser.add_argument('--pred_file', type=str,
                    help='Input path for the .csv files with model-provided solutions.')
parser.add_argument('--judge_model', type=str, default='Qwen/Qwen2-1.5B-Instruct',
                    help='Hugging Face model name for the judge.')
parser.add_argument('--prompt_lang', type=str, default='ro',
                    help='Judge system prompt language.')
parser.add_argument('--output', type=str, default='results/',
                    help='Output folder path for the .csv files with scores.')
parser.add_argument('--translated', action='store_true',
                    help='Is it translated?')
args = parser.parse_args()

# Optional Hugging Face auth token from the environment.
# NOTE(review): read but never referenced below — presumably `datasets` / the
# judge pick it up from the environment directly; confirm before removing.
HF_TOKEN = os.environ.get('HF_TOKEN', None)

print("Running evaluation for", vars(args))
# ---- Load predictions and derive the output location ----------------------
# Expected CSV columns: solution, response, model, dataset, shots, temperature,
# idx, and optionally answer (falls back to solution when absent).
prediction_df = pd.read_csv(args.pred_file)
prediction_df['solution'] = prediction_df['solution'].astype(str)
prediction_df['response'] = prediction_df['response'].astype(str)
if 'answer' in prediction_df.columns:
    prediction_df['answer'] = prediction_df['answer'].astype(str)
else:
    # No extracted final answer available: judge against the full solution.
    prediction_df['answer'] = prediction_df['solution']

# Run metadata is assumed constant within one prediction file; take the first
# unique value of each column.
model_name = prediction_df['model'].unique()[0]
dataset = prediction_df['dataset'].unique()[0]
shots = prediction_df['shots'].unique()[0]
temperature = prediction_df['temperature'].unique()[0]

output_filename = f'{args.output}/{model_name.replace("/", "-")}_{dataset}_{shots}_{temperature}.csv'
if os.path.exists(output_filename):
    # Idempotent runs: never re-score an already-evaluated configuration.
    print(f'!!Output file "{output_filename}" already exists, skipping.')
    # `exit()` is injected by the `site` module and is not guaranteed to exist
    # (e.g. under `python -S`); raising SystemExit is the reliable equivalent
    # and produces the same exit status (0).
    raise SystemExit
# Build the judge once; it is reused for every row scored below.
judge = SolutionJudge(args.judge_model, prompt = args.prompt_lang)
dataset_name = dataset
# TODO this is a hack, remove it later!
if args.translated:
    # Replace problem/solution text with NLLB-translated versions from local
    # CSV files. NOTE(review): paths are hard-coded and any dataset other than
    # 'bac' falls through to the 'comps' file — confirm that is intended.
    path = './translated/'
    if dataset_name == 'bac':
        path += 'romath-bac-test_model_nllb-200-3.3B.csv'
    else:
        path += 'romath-comps-test_model_nllb-200-3.3B.csv'
    translated_df = pd.read_csv(path)
    translated_df = translated_df[['idx', 'translated_problem_unchanged_math', 'translated_solution_unchanged_math']]
    # Inner join on idx: prediction rows without a translation are silently dropped.
    prediction_df = pd.merge(prediction_df, translated_df, on = 'idx')
    prediction_df['problem'] = prediction_df['translated_problem_unchanged_math']
    prediction_df['solution'] = prediction_df['translated_solution_unchanged_math']
else:
    # Re-source the canonical problem/solution text from the original dataset,
    # overriding whatever the prediction CSV carried.
    test_dataset = datasets.load_dataset('cosmadrian/romath', dataset_name, split = 'test')
    # Dict keyed on idx: duplicate idx rows (if any) keep only the last occurrence.
    problem_dict = {row['idx']: row['problem'] for row in test_dataset}
    solution_dict = {row['idx']: row['solution'] for row in test_dataset}
    good_problems = pd.DataFrame(problem_dict.items(), columns = ['idx', 'problem'])
    good_solutions = pd.DataFrame(solution_dict.items(), columns = ['idx', 'solution'])
    prediction_df = prediction_df.drop(columns = ['problem', 'solution'])
    # Inner joins on idx: predictions missing from the test split are silently dropped.
    prediction_df = pd.merge(prediction_df, good_problems, on = 'idx')
    prediction_df = pd.merge(prediction_df, good_solutions, on = 'idx')
# ---- Score every row with the judge ---------------------------------------
# Collect verdicts in a plain list and assign the column once, instead of
# pre-creating a None column and writing each cell with `.loc` inside the
# loop (one vectorized assignment; same final values).
judge_preds = []
for _, row in tqdm.tqdm(prediction_df.iterrows(), total = len(prediction_df)):
    # 'Proof' marks open-ended proof problems: judge against the full
    # reference solution; otherwise judge against the extracted final answer.
    judge_preds.append(judge.evaluate(
        question = row['problem'],
        true = row['solution'] if row['answer'] == 'Proof' else row['answer'],
        prediction = row['response'],
        has_single_answer = row['answer'] != 'Proof'
    ))
prediction_df['judge_pred'] = judge_preds
# Keep only the identifying metadata plus the judge verdict, then persist.
result_columns = ['idx', 'dataset', 'domain', 'model', 'shots', 'temperature', 'judge_pred']
results = prediction_df[result_columns]
os.makedirs(args.output, exist_ok=True)
results.to_csv(output_filename, index=False)