forked from vedic-partap/negtool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_extraction.py
181 lines (168 loc) · 8.04 KB
/
feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import numpy as np
import sklearn.metrics as metrics
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM
from nltk.corpus import words
from utils import *
def extract_features_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon, mode='training'):
    """
    Extracts features for the cue classifier from the sentence dictionaries.

    Every sentence dictionary mixes integer keys (token positions) with
    string keys (sentence-level entries); only the integer keys are used
    here. A token entry is read at index 3 (word form), 4 (lemma) and
    5 (PoS tag). Tokens for which not_known_cue_word() returns a true value
    get 'not-pred-cue' set to True on their entry and yield no instance.

    Arguments:
    sentence_dicts -- list of sentence dictionaries (token entries may be
                      modified in place, see above)
    cue_lexicon -- handed to not_known_cue_word()
    affixal_cue_lexicon -- handed to not_known_cue_word() and get_affix_cue()
    mode -- when 'training', labels are extracted as well

    Returns (sentence_dicts, instances, labels) in training mode and
    (sentence_dicts, instances) otherwise, where instances is a list with one
    feature dictionary per candidate token.
    """
    instances = []
    for sent in sentence_dicts:
        # items() rather than the Python-2-only iteritems(), so the loop
        # runs unchanged on Python 2 and Python 3.
        for key, value in sent.items():
            if not isinstance(key, int):
                continue
            token = value[3].lower()
            if not_known_cue_word(token, cue_lexicon, affixal_cue_lexicon):
                value['not-pred-cue'] = True
                continue

            features = {
                'token': token,
                'lemma': value[4].lower(),
                'pos': value[5],
            }

            # Lemma of the neighbouring tokens; 'null' at the sentence edges.
            if key == 0:
                features['bw-bigram1'] = 'null'
            else:
                features['bw-bigram1'] = "%s_*" % sent[key - 1][4].lower()
            if (key + 1) not in sent:
                features['fw-bigram1'] = 'null'
            else:
                features['fw-bigram1'] = "*_%s" % sent[key + 1][4].lower()

            affix = get_affix_cue(token, affixal_cue_lexicon)
            if affix is not None:
                # NOTE(review): replace() removes every occurrence of the
                # affix string from the token, not only one -- confirm that
                # this is the intended base form.
                base = token.replace(affix, "")
                for n in (5, 4, 3, 2, 1):
                    first, second = get_character_ngrams(base, affix, n)
                    features['char-%dgram1' % n] = first
                    features['char-%dgram2' % n] = second
                features['affix'] = affix
            else:
                for n in (5, 4, 3, 2, 1):
                    features['char-%dgram1' % n] = 'null'
                    features['char-%dgram2' % n] = 'null'
                features['affix'] = 'null'
            instances.append(features)

    if mode == 'training':
        labels = extract_labels_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon)
        return sentence_dicts, instances, labels
    return sentence_dicts, instances
def extract_labels_cue(sentence_dicts, cue_lexicon, affixal_cue_lexicon):
    """
    Extracts labels for training the cue classifier.

    Tokens are visited in the iteration order of each sentence dictionary,
    and tokens for which not_known_cue_word() returns a true value are
    skipped, exactly as in extract_features_cue(), so the labels line up
    with the feature instances produced there.

    A token is labelled 1 (cue) when its position occurs in sent['cues']
    (triples whose second element is the position) or in sent['mw_cues']
    (pairs whose second element is the position); otherwise it is labelled
    -1 (non cue).

    Returns a list of integer labels.
    """
    labels = []
    for sent in sentence_dicts:
        # items() rather than the Python-2-only iteritems(), so the loop
        # runs unchanged on Python 2 and Python 3.
        for key, value in sent.items():
            if not isinstance(key, int):
                continue
            if not_known_cue_word(value[3].lower(), cue_lexicon, affixal_cue_lexicon):
                continue
            # Evaluated lazily: 'mw_cues' is only consulted when no
            # single-word cue sits at this position.
            is_cue = (
                any(cue_position == key for (cue, cue_position, cue_type) in sent['cues'])
                or any(mw_pos == key for (mw_cue, mw_pos) in sent['mw_cues'])
            )
            labels.append(1 if is_cue else -1)
    return labels
def extract_features_scope(sentence_dicts, mode='training'):
    """
    Extracts features for the scope classifier from the sentence dictionaries.

    Sentences whose 'neg' entry is false are skipped. For every remaining
    sentence and every (cue, cue_position, cue_type) triple in sent['cues'],
    one feature dictionary is produced per token (integer key), in the
    iteration order of the sentence dictionary. A token entry is read at
    index 3 (word form), 4 (lemma) and 5 (PoS tag).

    Arguments:
    sentence_dicts -- list of sentence dictionaries
    mode -- when 'training', labels are extracted as well

    Returns (sentence_dicts, instances, labels, sentence_splits) in training
    mode and (sentence_dicts, instances, sentence_splits) otherwise.
    sentence_splits holds one entry per (sentence, cue) pair: the highest
    token index seen for that sentence (i.e. -1 if it has no tokens), not
    the token count.
    """
    instances = []
    sentence_splits = []
    for sent in sentence_dicts:
        if not sent['neg']:
            continue
        graph = make_dir_graph_for_sentence(sent)
        bidir_graph = make_bidir_graph_for_sentence(sent)
        # Looked up once per sentence instead of once per token and cue:
        # the call only receives the sentence.
        # NOTE(review): assumes find_nor_index() has no side effects -- confirm.
        nor_index = find_nor_index(sent)

        for cue, cue_position, cue_type in sent['cues']:
            seq_length = -1
            # items() rather than the Python-2-only iteritems(), so the loop
            # runs unchanged on Python 2 and Python 3.
            for key, value in sent.items():
                if not isinstance(key, int):
                    continue
                features = {
                    'token': value[3],
                    'lemma': value[4],
                    'pos': value[5],
                }
                features['dir-dep-dist'] = get_shortest_path(graph, sent, cue_position, key)
                features['dep-graph-path'] = get_dep_graph_path(bidir_graph, sent, cue_position, key)

                # Signed token distance to the cue; for the cue "neither" the
                # distance is measured from the nor_index position instead
                # when that position is strictly closer to the token.
                dist = key - cue_position
                if cue == "neither" and nor_index > -1 and abs(key - nor_index) < abs(dist):
                    dist = key - nor_index

                if dist < 0:
                    # token is to the left of cue
                    features['left-cue-dist'] = 'A' if abs(dist) <= 9 else 'B'
                    features['right-cue-dist'] = 'null'
                elif dist > 0:
                    # token is to the right of cue
                    features['right-cue-dist'] = 'A' if dist <= 15 else 'B'
                    features['left-cue-dist'] = 'null'
                else:
                    features['left-cue-dist'] = '0'
                    features['right-cue-dist'] = '0'

                features['cue-type'] = cue_type
                features['cue-pos'] = sent[cue_position][5]

                # Lemma (bigram1) and PoS tag (bigram2) of the neighbouring
                # tokens; 'null' at the sentence edges.
                if key == 0:
                    features['bw-bigram1'] = 'null'
                    features['bw-bigram2'] = 'null'
                else:
                    features['bw-bigram1'] = "%s_*" % sent[key - 1][4]
                    features['bw-bigram2'] = "%s_*" % sent[key - 1][5]
                if (key + 1) not in sent:
                    features['fw-bigram1'] = 'null'
                    features['fw-bigram2'] = 'null'
                else:
                    features['fw-bigram1'] = "*_%s" % sent[key + 1][4]
                    features['fw-bigram2'] = "*_%s" % sent[key + 1][5]

                instances.append(features)
                if key > seq_length:
                    seq_length = key
            sentence_splits.append(seq_length)

    if mode == 'training':
        # The second argument is not used by extract_labels_scope().
        labels = extract_labels_scope(sentence_dicts, mode)
        return sentence_dicts, instances, labels, sentence_splits
    return sentence_dicts, instances, sentence_splits
def extract_labels_scope(sentence_dicts, config):
    """
    Extracts labels for training the scope classifier. Skips the sentences
    whose 'neg' entry is false. Label values:
    In-scope: 0
    Out of scope: 1
    Beginning of scope: 2
    Cue: 3

    For every cue of a sentence, one label is produced per token (integer
    key), in the iteration order of the sentence dictionary. A token counts
    as in scope when its position is contained in any element of
    sent['scopes'][cue_i]; scope membership is checked before the cue
    position. An in-scope token is labelled 2 only when the previous token
    was out of scope (or it is the first token for this cue); an in-scope
    token that directly follows the cue is labelled 0.

    Arguments:
    sentence_dicts -- list of sentence dictionaries
    config -- not used; kept so existing callers keep working

    Returns a list of labels.
    """
    labels = []
    for sent in sentence_dicts:
        if not sent['neg']:
            continue
        for cue_i, (cue, cue_position, cue_type) in enumerate(sent['cues']):
            # The scope of a cue is the same for all tokens, so it is looked
            # up once per cue.
            scope = sent['scopes'][cue_i]
            prev_label = 1
            # Iterating the dict yields its keys on Python 2 and Python 3
            # alike (the original used the Python-2-only iteritems()).
            for key in sent:
                if not isinstance(key, int):
                    continue
                if any(key in s for s in scope):
                    label = 2 if prev_label == 1 else 0
                elif key == cue_position:
                    label = 3
                else:
                    label = 1
                labels.append(label)
                prev_label = label
    return labels