forked from NLPatVCU/NER-OUS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
214 lines (181 loc) · 8.03 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import os
import re
from copy import copy
from medacy.pipeline_components.metamap.metamap import MetaMap
# Constant lookup tables.
# Each list holds MetaMap semantic type abbreviations; the trailing comment on
# every entry gives the abbreviation's full name.

# Semantic types treated as medical problems.
problem_list = [
    "amph",  # Amphibian
    "famg",  # Family Group
    "ffas",  # Fully Formed Anatomical Structure
    "orgm",  # Organism
    "humn",  # Human
    "rnlw",  # Regulation or Law
    "nusq",  # Nucleotide Sequence
    "eehu",  # Environmental Effect of Humans
    "sosy",  # Sign or Symptom
    "patf",  # Pathologic Function
    "dsyn",  # Disease or Syndrome
    "inpo",  # Injury or Poisoning
    "bact",  # Bacterium
    "gora",  # Governmental or Regulatory Activity
    "grpa",  # Group Attribute
    "anab",  # Anatomical Abnormality
    "neop",  # Neoplastic Process
    "cgab",  # Congenital Abnormality
]

# Semantic types treated as medical tests.
test_list = [
    "mbrt",  # Molecular Biology Research Technique
    "lbpr",  # Laboratory Procedure
    "diap",  # Diagnostic Procedure
]

# Semantic types treated as medical treatments.
treatment_list = [
    "clnd",  # Clinical Drug
    "drdd",  # Drug Delivery Device
    "edac",  # Educational Activity
    "shro",  # Self-help or Relief Organization
    "amas",  # Amino Acid Sequence
    "antb",  # Antibiotic
    "mcha",  # Machine Activity
    "lang",  # Language
    "horm",  # Hormone
]
def stripped_filename(filename):
    """
    Drop the final extension from a filename.

    Only the last extension is removed (as split off by os.path.splitext), so
    "archive.tar.gz" becomes "archive.tar". Any directory part is kept.

    :param filename: Filename (or path) to remove the extension from.
    :return: The filename without its final extension.
    """
    base_name, _extension = os.path.splitext(filename)
    return base_name
def build_metamap_semantic_dictionary(medacy_metamap_component, in_file_path):
    """
    Run a file through a Medacy MetaMap component and return its annotations.

    The component is used in three steps: map_file() on the path,
    extract_mapped_terms() on that result, then mapped_terms_to_spacy_ann()
    on the extracted terms. Whatever the last call produces is returned as-is.

    :param medacy_metamap_component: Initialized Medacy MetaMap component to use.
    :param in_file_path: Path of the file to build the semantic dictionary for.
    :return: Result of mapped_terms_to_spacy_ann(), i.e. the MetaMap annotations
        (SpaCy ANN format, judging by the method name).
    """
    component = medacy_metamap_component
    mapped_terms = component.extract_mapped_terms(component.map_file(in_file_path))
    return component.mapped_terms_to_spacy_ann(mapped_terms)
def build_word_dictionary(in_file):
    """
    Builds a dictionary of words indexed by starting position in a document. For mapping ANN annotations to CON format.

    Every value is a list laid out as:
    [start_index, text, sentence #, word #, problem st, test st, treatment st]

    A word is a maximal run of characters other than the space character ' ';
    tabs and other whitespace are NOT treated as separators. "sentence #" is the
    1-based number of the line the word was read from and "word #" is the word's
    0-based position on that line. The three semantic type flags start at 0.

    Blank lines contain no words but still advance the sentence number, and
    reading always continues to the end of the file.

    :param in_file: Open file to be read and mapped. Must support readline() and tell().
    :return: Dictionary of words indexed by starting position.
    """
    index_dict = {}
    # Offset of the first character of the line currently being processed.
    line_index_counter = 0
    sentence_counter = 1

    raw_line = in_file.readline()
    # readline() returns '' only at end of file, while a blank line still carries
    # its newline. Testing the unstripped text keeps a blank line in the middle
    # of the document from being mistaken for the end of input.
    while raw_line:
        line = raw_line.rstrip('\r\n')

        # Each match is one space-delimited word; its position within the line
        # plus the line's own offset gives the document-level start index.
        for word_counter, word_match in enumerate(re.finditer(r'[^ ]+', line)):
            start_index = word_match.start() + line_index_counter
            #[start_index, text, sentence #, word #, problem st, test st, treatment st]
            index_dict[start_index] = [start_index, word_match.group(), sentence_counter, word_counter, 0, 0, 0]

        # NOTE(review): the next line's offset is whatever tell() reports for this
        # stream; confirm it lines up with the offsets used by the annotations
        # this dictionary is later matched against.
        line_index_counter = in_file.tell()
        sentence_counter += 1
        raw_line = in_file.readline()

    return index_dict
def write_semantic_annotations_to_file(index_dict, out_file_path):
    """
    Writes semantic annotations to file for easy retrieval. CSV format. Indices below.
    sentence #, word #, problem st, test st, treatment st

    Only words with at least one truthy semantic type flag (positions 4, 5 or 6
    of the word's list) are written, one per line, in the dictionary's iteration
    order. The output file is created or truncated even when nothing qualifies.

    :param index_dict: Processed dictionary of words and annotations to write to file.
    :param out_file_path: Path to write to.
    :return: None
    """
    # The with-block guarantees the handle is closed even if a write or an
    # index lookup raises part-way through.
    with open(out_file_path, 'w') as out_file:
        for v in index_dict.values():
            if v[4] or v[5] or v[6]:
                fields = (v[2], v[3], v[4], v[5], v[6])
                out_file.write(','.join(str(field) for field in fields) + '\n')
def build_single_semantic_type_annotations(config, in_file_path, medacy_metamap_component=None):
    """
    Builds a dictionary of words indexed by starting position in a document and adds metamap annotations.
    This function does not write to file.

    Every value is a list laid out as:
    [start_index, text, sentence #, word #, problem st, test st, treatment st]

    For each MetaMap entity whose semantic type appears in problem_list,
    test_list or treatment_list, the matching flag is set to 1 on every word
    whose start index lies in [entity start, entity end). An entity whose start
    does not coincide with the start of a word is reported on stdout and skipped.

    :param config: Configuration to utilize; config['CONFIGURATION']['METAMAP_PATH'] is read
        only when no MetaMap component is supplied.
    :param in_file_path: Path to file to be read and mapped.
    :param medacy_metamap_component: Optional open instance of a Medacy MetaMap component. If not provided, will open one.
    :return: Dictionary of words indexed by starting position in a document and metamap annotations.
    """
    if medacy_metamap_component is None:
        medacy_metamap_component = MetaMap(metamap_path=config['CONFIGURATION']['METAMAP_PATH'])

    metamap_annotations = build_metamap_semantic_dictionary(medacy_metamap_component, in_file_path)

    # The with-block closes the input file even if building the dictionary raises.
    with open(in_file_path, 'r') as in_file:
        index_dict = build_word_dictionary(in_file)

    #Sort the word start offsets for easy traversal, and remember where each
    #offset sits in that order so an entity start can be located without a scan.
    key_list = sorted(index_dict.keys())
    key_positions = {key: position for position, key in enumerate(key_list)}

    # Entity values are indexed as v[0] = start offset, v[1] = end offset,
    # v[2] = semantic type abbreviation.
    for v in metamap_annotations["entities"].values():
        if v[2] in problem_list:
            mark_location = 4
        elif v[2] in test_list:
            mark_location = 5
        elif v[2] in treatment_list:
            mark_location = 6
        else:
            # Semantic type is not one we track.
            continue

        position = key_positions.get(v[0])
        if position is None:
            print("Found a non-existant index. Continuing.")
            continue

        # Flag every word that starts before the entity ends.
        while position < len(key_list) and key_list[position] < v[1]:
            index_dict[key_list[position]][mark_location] = 1
            position += 1

    return index_dict
def build_semantic_type_annotations(config):
    """
    For a directory, builds a dictionary of words indexed by starting position in a document and adds metamap annotations then converts and writes to file.
    CSV format. Indices below.
    sentence #, word #, problem st, test st, treatment st

    Keys read from config['CONFIGURATION']:
      METAMAP_PATH                   - passed to the MetaMap constructor.
      RAW_FILE_PATH                  - directory whose entries are processed.
      SEMANTIC_ANNOTATION_FILE_PATH  - output directory, joined onto the working
                                       directory in effect when this is called.
      OVERRIDE_SEMANTIC_ANNOTATIONS  - "1" regenerates outputs that already exist;
                                       only consulted when the output file is present.

    Each entry "<name>.<ext>" in the input directory produces "<name>.st" in the
    output directory. The process working directory is switched to the input
    directory while documents are processed and is always restored afterwards.

    :param config: Configuration file to utilize.
    :return: None
    """
    medacy_metamap_component = MetaMap(metamap_path=config['CONFIGURATION']['METAMAP_PATH'])
    input_directory = config['CONFIGURATION']['RAW_FILE_PATH']
    output_directory = config['CONFIGURATION']['SEMANTIC_ANNOTATION_FILE_PATH']

    # cd into the raw file directory so documents can be referenced by bare name.
    cwd = os.getcwd()
    os.chdir(input_directory)
    try:
        #Iterate over documents in the raw_file_path directory
        for document in os.listdir():
            out_file_path = os.path.join(cwd, output_directory, stripped_filename(document) + ".st")
            if not os.path.exists(out_file_path) or config['CONFIGURATION']['OVERRIDE_SEMANTIC_ANNOTATIONS'] == "1":
                index_dict = build_single_semantic_type_annotations(config, document, medacy_metamap_component)
                write_semantic_annotations_to_file(index_dict, out_file_path)
    finally:
        #Return to original path, even if a document failed to process, so the
        #caller's working directory is never left changed.
        os.chdir(cwd)