-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcombine_and_wsd_extractor_output.py
76 lines (62 loc) · 2.8 KB
/
combine_and_wsd_extractor_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import codecs
import time
import os
import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet
import re
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags beginning with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields ``None``.
    """
    # Checked in order; the first prefix that matches decides the result.
    prefix_to_pos = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return pos
    return None
# Maps the first letter of a POS tag returned by nltk.pos_tag to the WordNet
# part-of-speech constant. Tags starting with any other letter have no entry.
wordnet_tag_dict = {
    "J": wordnet.ADJ,
    "V": wordnet.VERB,
    "N": wordnet.NOUN,
    "R": wordnet.ADV,
}

# Compiled once: collapses every run of non-word characters into one space.
_NON_WORD_RE = re.compile(r'\W+')


def _format_elapsed(seconds):
    """Return *seconds* rendered as whole hours and minutes, e.g. ``1h 5m``."""
    return str(int(seconds / 3600)) + "h " + str(int((seconds % 3600) / 60)) + "m"


source_path = "path_to_source"
folders = sorted(os.listdir(source_path))
t = time.time()
token_count = 0
# When True, the next input line is dropped (the line directly following a
# "<doc" header line). The flag skips only a single line after being set.
skip = False

# The context manager guarantees the output file is flushed and closed even
# if tagging or disambiguation raises part-way through the corpus.
with codecs.open("path_to_output", "w", "UTF-8") as out_file:
    for folder_no, folder in enumerate(folders):
        folder_path = os.path.join(source_path, folder)
        files = sorted(os.listdir(folder_path))
        for file in files:
            # Decode explicitly as UTF-8 so reading does not depend on the
            # platform's default locale encoding (the output is UTF-8 too).
            # NOTE(review): assumes the extractor output is UTF-8 - confirm.
            with codecs.open(os.path.join(folder_path, file), "r", "UTF-8") as f:
                lines = f.read().splitlines()
            for line in lines:
                if skip:
                    skip = False
                    continue
                if line.startswith("<doc"):
                    skip = True
                    continue
                if line == "</doc>" or line == "":
                    continue
                for sent in line.split(". "):
                    # Lesk algorithm: disambiguate every token of the
                    # normalised sentence against the sentence's own words.
                    to_be_wsd = _NON_WORD_RE.sub(' ', sent).lower()
                    context = to_be_wsd.split()
                    for word, tag in nltk.pos_tag(nltk.word_tokenize(to_be_wsd)):
                        # Tags without a WordNet counterpart give pos=None,
                        # which makes lesk consider synsets of every POS.
                        syn = lesk(context, word, wordnet_tag_dict.get(tag[:1]))
                        if syn is not None:
                            out_file.write(syn.name().replace(".", "_"))
                        else:
                            # No sense found: emit the word with a placeholder
                            # sense suffix so the token is still represented.
                            out_file.write(word + "_x_01")
                        out_file.write(" ")
                        token_count += 1
        # Progress report, printed once per processed folder.
        elapsed = time.time() - t
        print(str(folder_no + 1) + ") Folder: " + folder
              + " Tokens Processed: " + str(token_count)
              + " Elapsed Time: " + _format_elapsed(elapsed))

elapsed = time.time() - t
print("\nAll extractor outputs are processed and combined in " + _format_elapsed(elapsed))
print("Token Count: " + str(token_count))