-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkeyword_feature.py
126 lines (95 loc) · 3.54 KB
/
keyword_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import config
import sys, pymongo, color
from collections import defaultdict, Counter
from nltk.stem.wordnet import WordNetLemmatizer
## Module-level shared state used by the functions below.
## Database handle: connect to the server at config.mongo_addr and select
## the database named config.db_name.
## NOTE(review): pymongo.Connection is the legacy client class (newer PyMongo
## releases expose MongoClient instead) -- confirm the installed version.
db = pymongo.Connection(config.mongo_addr)[config.db_name]
## Keywords to count; starts empty and is rebound in the __main__ section
## from the 'resource.WordNetAffect' collection before features are built.
keyword_list = []
## Single lemmatizer instance reused for every token in get_keyword_feature.
lmtzr = WordNetLemmatizer()
## input: udocID
## output: a dictionary of (word: occurrence)
def get_keyword_feature(udocID):
    """Count how often each known keyword occurs in one document.

    Every sentence record in ``co_sents`` whose 'udocID' equals *udocID* is
    read; its 'sent' field supplies the tokens and its 'sent_pos' field the
    matching 'token/TAG' items (both split on single spaces).  Tokens are
    lower-cased and, when ``config.lemma`` is set, lemmatized with a WordNet
    part of speech derived from the tag before being looked up in
    ``keyword_list``.

    Relies on module globals: ``co_sents`` (bound in the __main__ section),
    ``keyword_list``, ``lmtzr``, ``config`` and ``color``.

    Returns a ``Counter`` mapping keyword -> number of occurrences.

    Raises ``IndexError`` if lemmatization is enabled and the document has
    fewer POS items than tokens.
    """
    ## Build the lookup set once per call: membership in a set is O(1),
    ## whereas testing against the raw list scans it for every token.
    keywords = set(keyword_list)

    ## First letter of the Penn-style tag -> WordNet POS; anything else is
    ## treated as a noun (same mapping as the former if/elif chain).
    wordnet_pos = {'J': 'a', 'V': 'v', 'R': 'r'}

    ## find all words in the document <udocID>
    words = []
    POSs = []
    for sent_mdoc in co_sents.find({'udocID': udocID}):
        words.extend(sent_mdoc['sent'].split(' '))     # words: list of 'happy'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))  # POSs: list of 'happy/JJ'

    if config.verbose:
        ## sys.stderr.write emits the same text as the old `print >>` form
        ## and is valid syntax on both Python 2 and Python 3.
        sys.stderr.write('\t%s (%d words)\t\n' % (color.render('#' + str(udocID), 'y'), len(words)))

    use_lemma = config.lemma  # loop-invariant, read once
    keywordFeature = Counter()
    for idx, word in enumerate(words):
        word = word.lower()
        if use_lemma:
            ## assumes 'sent' and 'sent_pos' are tokenized identically so
            ## that index idx lines up -- TODO confirm against the corpus
            tag = POSs[idx].split('/').pop()
            word = lmtzr.lemmatize(word, wordnet_pos.get(tag[:1], 'n'))
        if word in keywords:
            keywordFeature[word] += 1

    return keywordFeature
def create_keyword_features(setting_id):
    """Build and store one keyword-feature record per document.

    For every emotion listed under the 'LJ40K' label in ``co_emotions``,
    all documents in ``co_docs`` carrying that emotion are processed with
    ``get_keyword_feature`` and the result is inserted into ``co_feature``
    together with the emotion, the document id and *setting_id*.

    setting_id -- string id of the settings record these features belong
                  to (looks like "5369fb11d4388c0aa4c5ca4e").

    Uses the module-level collections bound in the __main__ section.
    """
    ## list of emotions
    emotion_query = {'label': 'LJ40K'}
    emotion_names = [entry['emotion'] for entry in co_emotions.find(emotion_query)]

    for position, gold_emotion in enumerate(emotion_names):

        ## get all document with emotions <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
        matching_docs = list(co_docs.find({'emotion': gold_emotion}))

        if config.verbose:
            progress = '%d > %s ( %d docs )' % (position, color.render(gold_emotion, 'g'), len(matching_docs))
            sys.stderr.write(progress + '\n')

        for source_doc in matching_docs:
            doc_id = source_doc['udocID']
            keyword_counts = get_keyword_feature(udocID=doc_id)
            record = {
                "emotion": gold_emotion,
                "udocID": doc_id,
                "feature": keyword_counts.items(),
                "setting": setting_id  # looks like "5369fb11d4388c0aa4c5ca4e"
            }
            co_feature.insert(record)

    ## NOTE(review): index is built once, after all inserts -- confirm this
    ## matches the intended placement.
    co_feature.create_index("setting")
if __name__ == '__main__':
    ## Script entry point: bind the Mongo collections used by the functions
    ## above, parse command-line options into `config`, record the run
    ## settings, load the keyword list and build the features.

    ## select mongo collections
    co_emotions = db[config.co_emotions_name]
    co_docs = db[config.co_docs_name]
    co_pats = db[config.co_pats_name]
    co_sents = db[config.co_sents_name]
    co_keywords = db['resource.WordNetAffect']

    ## target mongo collections
    co_setting = db['features.settings']
    co_feature = db['features.keyword']

    ## input arguments
    import getopt
    import time

    add_opts = [
        ('-k', ['-k: keyword set in WordNetAffect',
                ' 0: basic',
                ' 1: extend']),
        ('--lemma', ['--lemma: use word lemma when looking for keywords'])
    ]

    ## numeric -k value -> keyword set name
    keyword_types = {0: 'basic', 1: 'extend'}

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hk:v', ['help', 'keyword_type=', 'lemma', 'verbose'])
    except getopt.GetoptError:
        config.help(config.keywordFeat_name, addon=add_opts, exit=2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            config.help(config.keywordFeat_name, addon=add_opts)
        elif opt in ('-k', '--keyword_type'):
            ## A non-integer value used to raise a bare ValueError and an
            ## unknown integer was silently ignored; both now take the same
            ## usage/exit path as an unparsable command line.
            try:
                config.keyword_type = keyword_types[int(arg.strip())]
            except (ValueError, KeyError):
                config.help(config.keywordFeat_name, addon=add_opts, exit=2)
        elif opt == '--lemma':
            ## was `opt in ('--lemma')`: parentheses without a comma are not
            ## a tuple, so that expression was a substring test on a string
            config.lemma = True
        elif opt in ('-v', '--verbose'):
            config.verbose = True

    ## build metadata describing this run
    setting = {
        "feature_name": "keyword",
        "keyword_type": config.keyword_type,
        "lemma": config.lemma
    }

    ## print confirm message
    config.print_confirm(setting.items(), bar=40, halt=True)

    ## insert metadata
    setting_id = str(co_setting.insert(setting))

    ## create keyword_list (rebinds the module-level name read by
    ## get_keyword_feature); the cursor is iterated directly
    keyword_list = [mdoc['word'] for mdoc in co_keywords.find({'type': config.keyword_type})]

    ## run
    s = time.time()
    create_keyword_features(setting_id)
    ## same text as the old `print` statement, valid on Python 2 and 3
    sys.stdout.write('Time total: %s sec\n' % (time.time() - s))