calc_h2h_pairs.py
import argparse
import json
import os

from tabulate import tabulate

from utils import extract_choice
###### const #########
model_pairs = [
    ("M22", "M23"), ("M23", "M17"), ("M17", "M12"), ("M12", "M13"),
    ("M13", "M15"), ("M15", "M14"), ("M14", "M8"), ("M8", "M9"),
    ("M9", "M10"), ("M10", "M20"), ("M20", "M11"),
]
source_data_dir = "comp_data"
res_root_dir = "comp_res"
id2dim = {
    0: "relevance",
    1: "consistency",
    2: "fluency",
    3: "coherence",
}
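# Result files are expected under comp_res/<eval_model>/, one JSON-lines file
# per model pair and dimension, named "<M1>-<M2>_<dim>.result" plus a
# "<M1>-<M2>_<dim>_toggle.result" variant with the summary order swapped
# (see main() below). source_data_dir and id2dim are not used in this script.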
########### helper functions ############
def add_score(ans, score1, score2):
    """Update a head-to-head tally: 0 = tie (half a point each),
    1 = the first summary wins, 2 = the second summary wins."""
    if ans == 0:
        score1 += 0.5
        score2 += 0.5
    elif ans == 1:
        score1 += 1
    elif ans == 2:
        score2 += 1
    return score1, score2
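# Illustrative calls: a tie splits the point, a win adds a full point.
#   add_score(0, 3, 4)  # -> (3.5, 4.5)
#   add_score(1, 3, 4)  # -> (4, 4)
#   add_score(2, 3, 4)  # -> (3, 5)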
def prep_result(file_path, toggle=False):
    """Read one .result file (JSON lines) and return two dicts keyed by id:
    the LLM's choice and the human verdict, both encoded as
    0 = tie, 1 = first summary better, 2 = second summary better."""
    res_dict = {}
    human_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]
    for data in dataset:
        score1 = data["human_score_1"]
        score2 = data["human_score_2"]
        choice = extract_choice(data["resp"], toggle)
        sample_id = data["id"]
        res_dict[sample_id] = choice
        # map the pair of human scores to a head-to-head verdict
        if score1 == score2:
            human_res = 0  # tie
        elif score1 > score2:
            human_res = 1  # first summary better
        else:
            human_res = 2  # second summary better
        human_dict[sample_id] = human_res
    assert set(res_dict.keys()) == set(human_dict.keys())
    return res_dict, human_dict
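# Each line of a .result file is expected to be a JSON object with the fields
# read above; a roughly illustrative line (values made up):
#   {"id": 7, "resp": "Summary 1 is better because ...",
#    "human_score_1": 4.0, "human_score_2": 3.5}
# extract_choice (imported from utils, not shown here) is assumed to parse
# "resp" into the same 0/1/2 encoding, flipping 1 and 2 when toggle=True,
# i.e. when the two summaries were shown to the model in swapped order.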
def parse_arguments(parser):
    ### Eval hyperparameters
    # NOTE: "gpt-3.5-turbo-0301" may be deprecated; switch to the latest API model.
    # Options: gpt-4, gpt-4-0314, gpt-4-32k, gpt-4-32k-0314, gpt-3.5-turbo, gpt-3.5-turbo-0301
    parser.add_argument('--eval_model', type=str, default="GPT-3.5-Turbo-0301",
                        help="the ChatGPT model whose comparison results to score; "
                             "must match a directory name under comp_res")
    args = parser.parse_args()
    for k, v in vars(args).items():
        print(f"{k}: {v}")
    return args
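# Example invocation (assumes comp_res/GPT-3.5-Turbo-0301/ already holds the
# .result files produced by an earlier comparison step):
#   python calc_h2h_pairs.py --eval_model GPT-3.5-Turbo-0301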
########################
def main():
    parser = argparse.ArgumentParser()
    config = parse_arguments(parser)
    eval_model = config.eval_model
    res_dir = os.path.join(res_root_dir, eval_model)
    # one row per model pair, plus a final "total" row of per-dimension accuracies
    all_stats = [["model pairs", 'coherence', 'consistency', 'fluency', 'relevance']]
    for model_1, model_2 in model_pairs:
        all_stats.append([model_1 + "-" + model_2])
    all_stats.append(["total"])
    for dim in ['coherence', 'consistency', 'fluency', 'relevance']:
        correct_pairs = 0
        total_pairs = 0
        for idx, (model_1, model_2) in enumerate(model_pairs):
            row_idx = idx + 1  # skip the header row
            # print(f"eval {model_1}-{model_2}, {dim}")
            file_path = os.path.join(res_dir, model_1 + "-" + model_2 + "_" + dim + ".result")
            res_dict, true_dict = prep_result(file_path, toggle=False)
            toggle_file_path = os.path.join(res_dir, model_1 + "-" + model_2 + "_" + dim + "_toggle.result")
            toggle_res_dict, _ = prep_result(toggle_file_path, toggle=True)
            assert set(res_dict.keys()) == set(toggle_res_dict.keys())
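            # Each pair of summaries is judged twice: once in the original
            # order and once with the two summaries swapped (the *_toggle
            # file). Averaging the two tallies below is presumably meant to
            # cancel the judge's position bias; extract_choice(..., toggle=True)
            # is assumed to map the flipped verdict back to the original order.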
            # convert the per-item verdicts to head-to-head tallies
            human_score_1 = 0
            human_score_2 = 0
            llm_score_1 = 0
            llm_score_2 = 0
            for key in true_dict.keys():
                truth = true_dict[key]
                human_score_1, human_score_2 = add_score(truth, human_score_1, human_score_2)
                res = res_dict[key]
                llm_score_1, llm_score_2 = add_score(res, llm_score_1, llm_score_2)
                toggle_res = toggle_res_dict[key]
                llm_score_1, llm_score_2 = add_score(toggle_res, llm_score_1, llm_score_2)
            # average the original-order and flipped-order LLM tallies
            llm_score_1 /= 2
            llm_score_2 /= 2
            total_pairs += 1
            # the pair counts as correct when the LLM and the humans agree on
            # the winner (or on a tie); 50 is the tie point, which assumes
            # 100 comparison items per model pair
            if llm_score_1 > 50 and human_score_1 > 50:
                correct_pairs += 1
            elif llm_score_1 == 50 and human_score_1 == 50:
                correct_pairs += 1
            elif llm_score_1 < 50 and human_score_1 < 50:
                correct_pairs += 1
            all_stats[row_idx].append(f"{llm_score_1}/{human_score_1}")
        all_stats[-1].append(f"{correct_pairs}/{total_pairs}")
    print(tabulate(all_stats))
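# Each cell of the printed table reads "llm_score/human_score" for that model
# pair and dimension; the "total" row reads "correct_pairs/total_pairs".
# Illustrative shape of the output (values made up):
#   model pairs   coherence   consistency   fluency   relevance
#   M22-M23       61.5/58.0   ...
#   ...
#   total         9/11        ...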
if __name__ == "__main__":
    main()