"""
Evaluation script to accompany the paper
Halterman, Keith, Sarwar, and O'Connor. "Corpus-Level Evaluation for Event QA:
The IndiaPoliceEvents Corpus Covering the 2002 Gujarat Violence." Findings of ACL, 2021.
If you use this script, please cite the paper:
@inproceedings{halterman2021corpus,
author = {Halterman, Andrew and Keith, Katherine A. and Sarwar, Sheikh Muhammad and O'Connor, Brendan},
title = {Corpus-Level Evaluation for Event {QA}:
The {IndiaPoliceEvents} Corpus Covering the 2002 {G}ujarat Violence},
booktitle = {{Findings of ACL}},
year = 2021}
"""
import argparse
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import average_precision_score, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, log_loss
from scipy.stats import spearmanr
def create_ranked_list(y_true, y_score):
"""
Ranks based on the scores
"""
assert len(y_true) == len(y_score)
true_with_score = [(x, y) for x, y in zip(y_true, y_score)]
ranked_list = sorted(true_with_score, key=lambda kv: -kv[1])
return ranked_list
def test_create_ranked_list():
y_true = [1, 1, 0, 0]
y_score = [0.7, 0.2, 0.5, 0.4]
assert create_ranked_list(y_true, y_score) == [(1, 0.7), (0, 0.5), (0, 0.4), (1, 0.2)]
y_true = [1, 1, 1, 0, 0]
y_score = [0.2, 0.9, 0.5, 0.3, 0.4]
assert create_ranked_list(y_true, y_score) == [(1, 0.9), (1, 0.5), (0, 0.4), (0, 0.3), (1, 0.2)]
def propRead_at_recall(y_true, y_score, recall_target=0.95):
"""
Returns:
Proportion of the corpus read to get that percentage recall
e.g. PropRead@Recall95 means proportion of the corpus read
to achieve 95% recall
Inputs:
y_true : array, length=number of documents
with integers 0, 1 indicating true binary classification
y_score : array, length=number of documents
with floats between 0.0 and 1.0 indicating the relevance scores
        recall_target : recall achievement target
            (e.g. recall_target=0.95 returns the proportion of the corpus read
            to achieve 95% recall)
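
    Example (hypothetical scores):
        y_true  = [1, 1, 0, 0]
        y_score = [0.9, 0.4, 0.8, 0.1]
        Ranked by score, the two positives fall at ranks 1 and 3, so reaching
        recall >= 0.95 requires reading the top 3 of 4 documents -> returns 3/4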
"""
assert len(y_true) == len(y_score)
#brute force way, go threshold by threshold
ranked_list = create_ranked_list(y_true, y_score)
y_true = [x[0] for x in ranked_list]
for docsRead in range(len(y_true)):
        y_pred = [1] * docsRead + [0] * (len(y_true) - docsRead)
recall = recall_score(y_true, y_pred)
if recall >= recall_target:
return docsRead/len(y_true)
#end condition, have to read all the docs
return 1.0
def test_propRead_at_recall():
y_true = [1, 1, 1, 0, 0]
y_score = [0.2, 0.9, 0.5, 0.3, 0.4]
#[(1, 0.9), (1, 0.5), (0, 0.4), (0, 0.3), (1, 0.2)]
assert propRead_at_recall(y_true, y_score, recall_target=0.95) == 1.0
y_true = [1, 1, 1, 0, 0, 0]
y_score = [0.9, 0.9, 0.9, 0.6, 0.6, 0.6]
assert propRead_at_recall(y_true, y_score, recall_target=0.95) == 3/6
y_true = [1, 1, 1, 0, 0, 0]
y_score = [0.9, 0.9, 0.5, 0.6, 0.4, 0.4]
assert propRead_at_recall(y_true, y_score, recall_target=0.95) == 4/6
y_true = [0, 1]
y_score = [0.6, 0.4]
assert propRead_at_recall(y_true, y_score, recall_target=0.95) == 2/2
y_true = [1, 0]
y_score = [0.6, 0.4]
assert propRead_at_recall(y_true, y_score, recall_target=0.95) == 1/2
def task1_sentence_classification(label2pred, label2gold):
"""
Prints F1 score, precision and recall for each label
"""
print('TASK 1 EVAL\n')
for label in LABEL_LIST:
print('CLASS=', label)
pred_array = label2pred[label]
gold_array = label2gold[label]
print('sentence level F1=', f1_score(gold_array, pred_array))
print('\t precision=', precision_score(gold_array, pred_array))
print('\t recall=', recall_score(gold_array, pred_array))
print()
def task2_document_ranking(label2pred, label2gold):
"""
Prints average precision and PropRead@RecallX for each label
"""
print('TASK 2 EVAL\n')
for label in LABEL_LIST:
print('CLASS=', label)
pred_score_array = label2pred[label]
gold_array = label2gold[label]
print('average precision=', average_precision_score(gold_array, pred_score_array))
print('PropRead@Recall95=', propRead_at_recall(gold_array, pred_score_array, recall_target=0.95))
print()
def load_docid2date():
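    """Return a dict mapping each doc_id to its publication date, read from data/final/metadata.csv ('doc_id' and 'date' columns)."""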
metadata = pd.read_csv('data/final/metadata.csv')
    docid2date = dict(zip(metadata['doc_id'], metadata['date']))
return docid2date
def aggregate_by_date(label2pred, label2gold, docid_order, docid2date):
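    """Sum gold and predicted labels per publication date; returns two dicts (gold, predicted) mapping each label to a Counter of date -> count."""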
gold_label2date2count = {label: Counter() for label in LABEL_LIST}
pred_label2date2count = {label: Counter() for label in LABEL_LIST}
for label in LABEL_LIST:
for docid, pred, gold in zip(docid_order, label2pred[label], label2gold[label]):
docid_date = docid2date[docid]
gold_label2date2count[label][docid_date] += gold
pred_label2date2count[label][docid_date] += pred
return gold_label2date2count, pred_label2date2count
def test_aggregate_by_date():
label2pred = {'KILL': [0, 1, 1]}
label2gold = {'KILL': [1, 1, 1]}
docid_order = [1, 2, 3]
docid2date = {1: '2020-01', 2: '2020-01', 3: '2020-02'}
true_gold_label2date2count = {'KILL': {'2020-01': 2, '2020-02': 1}}
true_pred_label2date2count = {'KILL': {'2020-01': 1, '2020-02': 1}}
    assert (true_gold_label2date2count, true_pred_label2date2count) == aggregate_by_date(label2pred, label2gold, docid_order, docid2date)
def metric_aggregate_spearman_rho(gold_date2count, pred_date2count):
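    """Return Spearman's rank correlation between gold and predicted per-date counts, aligned on the gold date keys."""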
assert len(gold_date2count) == len(pred_date2count)
#put data in same date order
gold_arr = []
pred_arr = []
for date in gold_date2count.keys():
gold_arr.append(gold_date2count[date])
pred_arr.append(pred_date2count[date])
#calculate spearmanr
rho, pval = spearmanr(gold_arr, pred_arr)
return rho
def test_metric_aggregate_spearman_rho():
gold_date2count = {'2020-01': 2, '2020-02': 1, '2020-03': 5}
pred_date2count = {'2020-01': 1, '2020-02': 1, '2020-03': 6}
assert np.abs(metric_aggregate_spearman_rho(gold_date2count, pred_date2count) - 0.8660254037844387) <= 1e-5
def task3_temporal_aggregates(label2pred, label2gold, docid_order):
"""
    Prints Spearman's rho between gold-standard and predicted document labels aggregated by the publication
    date of the article
"""
print('TASK 3 EVAL\n')
docid2date = load_docid2date()
gold_label2date2count, pred_label2date2count = aggregate_by_date(label2pred, label2gold, docid_order, docid2date)
for label in LABEL_LIST:
print('CLASS=', label)
        print("Spearman's rho=", metric_aggregate_spearman_rho(gold_label2date2count[label], pred_label2date2count[label]))
print()
def load_gold_file(task_eval_choice):
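    """Return the path to the gold-standard .csv file for the chosen task."""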
if task_eval_choice == 1: #task 1: sentence classification
gold_file = 'data/final/sents.csv'
elif task_eval_choice == 2: #task 2: document ranking
gold_file = 'data/final/docs.csv'
elif task_eval_choice == 3: # task 3: substantive temporal aggregates (at the doc level)
gold_file = 'data/final/docs.csv'
return gold_file
def check_prediction_file_in_correct_format(pred_data, gold_data, task_eval_choice):
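    """Raise an exception if the prediction file's length or doc_id (and, for Task 1, sent_id) ordering does not match the gold file."""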
#check that they are the same number of lines
if len(pred_data) != len(gold_data):
raise Exception("number of lines in the prediction file does not equal number of lines in the gold file")
if task_eval_choice == 1: #sentence level, unique identifier (docid, sentid)
if not np.array_equal(pred_data['doc_id'], gold_data['doc_id']) or not np.array_equal(pred_data['sent_id'], gold_data['sent_id']):
raise Exception('prediction_file (docid, sentid) order does not match gold_file')
    elif task_eval_choice in [2, 3]: #doc level, unique identifier (docid)
        if not np.array_equal(pred_data['doc_id'], gold_data['doc_id']):
            raise Exception('prediction_file (docid) order does not match gold_file')
print('confirmed prediction_file in correct format')
def format_predictions_gold(prediction_file, gold_file, task_eval_choice):
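    """Load the prediction and gold .csv files, validate them, and return per-label prediction/gold arrays plus the gold doc_id order."""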
pred_df = pd.read_csv(prediction_file)
gold_df = pd.read_csv(gold_file)
check_prediction_file_in_correct_format(pred_df, gold_df, task_eval_choice)
label2pred = {}
label2gold = {}
for label in LABEL_LIST:
        #check to make sure Tasks 1 and 3 have binary labels (0, 1) and Task 2 has scores (real values between 0.0 and 1.0)
        pred_array = pred_df[label].to_numpy()
        if task_eval_choice in [1, 3] and not np.issubdtype(pred_array.dtype, np.integer):
            raise Exception('predictions for Tasks 1 and 3 must be dtype=integer')
        elif task_eval_choice == 2 and not np.issubdtype(pred_array.dtype, np.floating):
            raise Exception('predictions for Task 2 must be dtype=floating, real-valued document-level relevance scores between 0.0 and 1.0')
        label2pred[label] = pred_array
label2gold[label] = gold_df[label].to_numpy()
docid_order = gold_df['doc_id'].to_numpy()
return label2pred, label2gold, docid_order
def go_evaluate(label2pred, label2gold, docid_order, task_eval_choice):
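    """Dispatch to the evaluation routine for the chosen task."""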
if task_eval_choice == 1:
task1_sentence_classification(label2pred, label2gold)
elif task_eval_choice == 2:
task2_document_ranking(label2pred, label2gold)
elif task_eval_choice == 3:
task3_temporal_aggregates(label2pred, label2gold, docid_order)
if __name__ == '__main__':
"""
This script assumes the gold-standard files are in a data/final/ directory.
Inputs:
args.prediction_file
The prediction file used for this evaluation script must be a .csv file (similar to the gold file)
Note: the ordering of docids, sentids is assumed to be the same in the prediction
and gold standard file (the script will throw an exception if it is not.)
Task 1: The prediction_file should contain a sentence-level binary label (0 or 1)
for the ['KILL', 'ARREST', 'ANY_ACTION', 'FAIL', 'FORCE'] columns
Task 2: The prediction_file should contain a document-level relevance score between 0.0 and 1.0
for the ['KILL', 'ARREST', 'ANY_ACTION', 'FAIL', 'FORCE'] columns
Task 3: The prediction_file should contain a document-level binary label (0 or 1)
for the ['KILL', 'ARREST', 'ANY_ACTION', 'FAIL', 'FORCE'] columns
args.task_eval_choice
The task one wishes to evaluate the predictions on
1 = Task1--Sentence Classification
2 = Task2--Document Ranking
3 = Task3--Substantive temporal aggregates
Example usage:
python evaluate.py sent_predictions.csv 1
python evaluate.py document_ranking_predictions.csv 2
python evaluate.py document_classification_predictions.csv 3
(where sent_predictions.csv, document_ranking_predictions.csv, document_classification_predictions.csv
are your own prediction files)
"""
LABEL_LIST = ['KILL', 'ARREST', 'ANY_ACTION', 'FAIL', 'FORCE']
parser = argparse.ArgumentParser()
parser.add_argument("prediction_file", help=".jsonl file", type=str)
parser.add_argument("task_eval_choice", help="Choose from 1=task1, 2=task2, 3=task3", type=int) #optional argument that if flag is made, stores as true
args = parser.parse_args()
task_eval_choice = args.task_eval_choice
if task_eval_choice not in [1, 2, 3]:
raise Exception('task_eval_choice must be in {1,2,3} corresponding to task1, task2, or task3')
gold_file = load_gold_file(task_eval_choice)
label2pred, label2gold, docid_order = format_predictions_gold(args.prediction_file, gold_file, task_eval_choice)
go_evaluate(label2pred, label2gold, docid_order, task_eval_choice)