per_model_corr.py
import argparse
import json
import os
import sys

from scipy.stats import kendalltau, pearsonr, spearmanr
from tabulate import tabulate

###### const #########
# SummEval system (summarizer) IDs scored by the script.
M_ID_LIST = ["M8", "M9", "M10", "M11", "M12", "M13", "M14", "M15", "M17", "M20", "M22", "M23"]
# Score-key postfix for each evaluation method.
postfix_dict = {
    0: "_rts",
    1: "_mcq",
    2: "_stareval",
}
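# Expected structure of summeval_<eval_model>.json (an assumption inferred
# from the field accesses in main() below; values shown are illustrative):
# {
#   "<article_id>": {
#     "sys_summs": {
#       "M8": {
#         "scores": {
#           "coherence": 4.0,        # human rating for this dimension
#           "coherence_rts": 3.5,    # model rating for eval_type 0 ("_rts")
#           ...
#         }
#       },
#       ...
#     }
#   },
#   ...
# }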
########### helper functions ############
def parse_arguments(parser):
    ### Eval hyperparameters
    # NOTE: "gpt-3.5-turbo-0301" may be deprecated; change to the latest API model.
    # Options: gpt-4, gpt-4-0314, gpt-4-32k, gpt-4-32k-0314, gpt-3.5-turbo, gpt-3.5-turbo-0301
    parser.add_argument('--eval_model', type=str, default="GPT-3.5-Turbo-0301", help="the ChatGPT model used for evaluation")
    parser.add_argument('--eval_type', type=int, default=0, choices=[0, 1, 2], help="evaluation method: 0 for rts, 1 for mcq, 2 for stareval")
    args = parser.parse_args()
    for k in args.__dict__:
        print(k + ": " + str(args.__dict__[k]))
    return args
def main():
    parser = argparse.ArgumentParser()
    config = parse_arguments(parser)
    eval_model = config.eval_model
    eval_type_id = config.eval_type

    data_path = "summeval_" + eval_model + ".json"
    if not os.path.exists(data_path):
        print(f"please prepare file {data_path} first.")
        sys.exit(1)
    postfix = postfix_dict[eval_type_id]

    # One row per system; the first row holds the column headers.
    corr_val_table = [["Candidate"]]
    corr_sig_table = [["Candidate"]]
    for M_ID in M_ID_LIST:
        corr_val_table.append([M_ID])
        corr_sig_table.append([M_ID])

    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for dim in ['coherence', 'consistency', 'fluency', 'relevance']:
        # Column suffixes: S = Spearman, P = Pearson, K = Kendall.
        corr_val_table[0] += [dim + "_S", dim + "_P", dim + "_K"]
        corr_sig_table[0] += [dim + "_S", dim + "_P", dim + "_K"]
        for M_ID in M_ID_LIST:
            row_idx = M_ID_LIST.index(M_ID) + 1
            human_score = []
            pred_score = []
            for id in data.keys():  # iterating over the same keys keeps per-article scores paired
                human_score.append(data[id]['sys_summs'][M_ID]['scores'][dim])
                pred_score.append(data[id]['sys_summs'][M_ID]['scores'][dim + postfix])
            s_corr = spearmanr(human_score, pred_score)
            p_corr = pearsonr(human_score, pred_score)
            k_corr = kendalltau(human_score, pred_score)
            # Index 0 is the correlation coefficient, index 1 the p-value.
            corr_val_table[row_idx] += [round(s_corr[0], 3), round(p_corr[0], 3), round(k_corr[0], 3)]
            corr_sig_table[row_idx].append("*" if s_corr[1] < 0.05 else "")
            corr_sig_table[row_idx].append("*" if p_corr[1] < 0.05 else "")
            corr_sig_table[row_idx].append("*" if k_corr[1] < 0.05 else "")

    print("per model corr values")
    print(tabulate(corr_val_table))
    print("per model corr significance: * for p < 0.05")
    print(tabulate(corr_sig_table))


if __name__ == "__main__":
    main()
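
# Example invocation (a sketch; assumes summeval_GPT-3.5-Turbo-0301.json has
# already been produced by the evaluation step):
#   python per_model_corr.py --eval_model GPT-3.5-Turbo-0301 --eval_type 0
# This prints two tables: per-system Spearman/Pearson/Kendall correlations
# between human and model scores for each dimension, and a matching table
# marking correlations with p < 0.05 with "*".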