-
Notifications
You must be signed in to change notification settings - Fork 0
/
spanish_processing.py
151 lines (91 loc) · 3.1 KB
/
spanish_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from operator import itemgetter
import file_reader
import nltk
from nltk.tag.stanford import NERTagger
from nltk.tag.stanford import POSTagger
def spanish_pos(text,
                model='/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger',
                jar='/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'):
    """ Parts of speech tagger for Spanish.

    Args:
        text: Unicode text; tokenized naively on whitespace.
        model: Path to the Spanish POS tagger model.  Defaults to the
            original hard-coded location so existing callers keep working;
            pass an explicit path on other machines.
        jar: Path to the stanford-postagger jar.

    Returns:
        The tagger output for the whitespace-split tokens; with this old
        nltk wrapper that is a list of sentences of (word, tag) pairs —
        downstream code indexes pos_tagged[0].
    """
    # NOTE(review): nltk.tag.stanford.POSTagger is a long-deprecated API
    # (modern nltk exposes StanfordPOSTagger / the CoreNLP server) and it
    # shells out to Java, so this cannot run without the Stanford tools.
    text = text.encode('utf8')  # the old wrapper expects utf-8 bytes
    st = POSTagger(model, jar, 'utf8')
    pos_tagged = st.tag(text.split())
    return pos_tagged
def spanish_nouns(pos_tagged):
    """ Return the nouns of the first tagged sentence, most frequent first.

    Keeps only tokens tagged as proper ('np00000') or singular common
    ('nc0s000') nouns that are longer than two characters.  Ties in
    frequency are broken alphabetically.
    """
    counts = {}
    for wordpair in pos_tagged[0]:
        if wordpair[1] in ('np00000', 'nc0s000') and len(wordpair[0]) > 2:
            counts[wordpair] = counts.get(wordpair, 0) + 1
    # Single composite-key sort: frequency descending, then word ascending
    # (equivalent to an alphabetical pre-sort followed by a stable
    # sort on the count).
    ordered = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0][0]))
    return [kv[0][0] for kv in ordered]
def exclude_entities(allnouns, text):
    """ Exclude nouns already identified as entities.

    Fixes two defects in the original: only the first entity group
    (organizations) was subtracted, ignoring locations and people; and
    converting `allnouns` to a set discarded its most-frequent-first
    ordering before taking the first 20.

    Args:
        allnouns: Nouns ordered most-frequent-first (see spanish_nouns).
        text: Raw text, re-run through the NER tagger.

    Returns:
        Up to 20 nouns from `allnouns`, original order preserved, that
        do not appear in any NER entity group.
    """
    organizations, locations, people = postprocess(spanish_ner(text))
    known = set()
    for group in (organizations, locations, people):
        if group:  # defensive: tolerate a None/empty group
            known.update(group)
    return [noun for noun in allnouns if noun not in known][:20]
def spanish_ner(text,
                model='/Users/Lena/src/context/stanford-ner/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz',
                jar='/Users/Lena/src/context/stanford-ner/stanford-ner.jar'):
    """ Moves the list of words through the NER tagger.

    Args:
        text: Unicode text; tokenized naively on whitespace.
        model: Path to the Spanish AnCora NER model.  Defaults to the
            original hard-coded location so existing callers keep working;
            pass an explicit path on other machines.
        jar: Path to the stanford-ner jar.

    Returns:
        The tagger output; downstream code treats it as sentences of
        (word, tag) pairs with tags such as 'PERS', 'LUG', 'ORG'.
    """
    # NOTE(review): nltk.tag.stanford.NERTagger is a long-deprecated API
    # and requires the Stanford Java tools to be installed locally.
    text = text.encode('utf8')  # the old wrapper expects utf-8 bytes
    st = NERTagger(model, jar, 'utf8')
    tagged = st.tag(text.split())
    return tagged
def join_items(tagged, ent):
    """Joins ngrams from tagged sentences given a type of entity.

    Args:
        tagged: Sentences of (word, tag) pairs, as produced by the NER
            tagger.
        ent: Entity tag to collect (e.g. 'PERS', 'LUG', 'ORG').

    Returns:
        A list of space-joined strings, one per maximal run of
        consecutive tokens carrying the `ent` tag.
    """
    ngram_list = []
    for sentence in tagged:
        ngram = []
        for wordpair in sentence:
            if wordpair[1] == ent:
                ngram.append(wordpair)
            elif ngram:
                # Run ended: flush it and start over.
                ngram_list.append(ngram)
                ngram = []
        # BUG FIX: a run reaching the end of the sentence was previously
        # dropped because it was only flushed on seeing a non-entity tag.
        if ngram:
            ngram_list.append(ngram)
    return [' '.join(wordpair[0] for wordpair in ngram) for ngram in ngram_list]
def postprocess(tagged):
    """ Takes the output of the NER tagger and groups it by entity type.

    The original's `if 'KEY' in entities` branches were dead code — the
    keys were assigned unconditionally just above, so the None fallbacks
    could never trigger.  They are removed here; behavior is unchanged.

    Args:
        tagged: Sentences of (word, tag) pairs from the NER tagger.

    Returns:
        Tuple (organizations, locations, people) of sets of joined
        entity strings; any set may be empty.
    """
    # AnCora NER tags: ORG = organization, LUG = place, PERS = person.
    organizations = set(join_items(tagged, 'ORG'))
    locations = set(join_items(tagged, 'LUG'))
    people = set(join_items(tagged, 'PERS'))
    return organizations, locations, people
def ner(text):
    """Run Spanish NER over `text`; return (organizations, locations, people)."""
    tagged = spanish_ner(text)
    return postprocess(tagged)
def pos(text):
    """Return the top frequent nouns of `text` not already claimed as entities."""
    tagged = spanish_pos(text)
    nouns = spanish_nouns(tagged)
    return exclude_entities(nouns, text)
def main():
    """Manual smoke test; the disabled example below is Python 2 syntax."""
    # Example usage:
    #   text = file_reader.read_file('spanish_sample2.txt')
    #   print pos(text)


if __name__ == "__main__":
    main()