sentiment_analysis.py
# Reference Implementation: https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize.casual import casual_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
import re, string, random
import pickle
import os

stop_words = stopwords.words('english')

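# The module keeps a single module-level classifier (__classifier): it is loaded from
# sentiment_classifier.pickle at import time via loadClassifier(), and retrained from the
# NLTK twitter_samples corpus when this file is run directly (see the __main__ block below).
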
def saveClassifier(classifier):
    """Pickle the trained classifier next to this file so it can be reloaded later."""
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sentiment_classifier.pickle')
    with open(path, 'wb') as f:
        pickle.dump(classifier, f)

def loadClassifier():
    """Load a previously saved classifier, leaving __classifier as None if that fails."""
    global __classifier
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sentiment_classifier.pickle')
    try:
        with open(path, 'rb') as f:
            __classifier = pickle.load(f)
    except (OSError, pickle.UnpicklingError, EOFError):
        __classifier = None

loadClassifier()

def __getClassifier():
    return __classifier

def getSentencePositivity(sentence):
    """
    Returns positivity of the given sentence from -1.0 (very negative) to 1.0 (very positive).
    May return None if no classifier exists to perform sentiment analysis.
    """
    classifier = __getClassifier()
    if classifier is None:
        return None
    # prepare tokens for the classifier; keep "I" capitalised so POS tagging stays accurate
    tokenized = list(map(lambda x: 'I' if x == 'i' else x, casual_tokenize(sentence)))
    custom_tokens = __remove_noise(tokenized)
    # classify and read off the probability of the "Positive" label
    probdist = classifier.prob_classify({token: True for token in custom_tokens})
    pos = probdist.prob('Positive')
    normalized_pos = pos * 2 - 1  # map the [0, 1] probability onto [-1, 1]
    # handle negation: count adverbs "not"/"n't" and flip the sign with reduced magnitude
    negation_count = len(list(filter(lambda x: x[1] == 'RB' and x[0] in ("not", "n't"), pos_tag(tokenized))))
    normalized_pos *= (-0.2) ** negation_count  # invert with lower magnitude if negation is detected in sentence
    return normalized_pos

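# Example usage (illustrative only -- actual scores depend on the trained model):
#   getSentencePositivity("I love this!")        # expect a clearly positive value
#   getSentencePositivity("This is not good.")   # sign flipped and damped by the negation heuristic
#   getSentencePositivity("anything at all")     # None when sentiment_classifier.pickle is missing
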
def __remove_noise(tweet_tokens, stop_words=()):
    """Strip URLs, @-mentions, punctuation and stop words, then lemmatize each token."""
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(tweet_tokens):
        # blank out URLs and @-mentions
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        # map the Penn Treebank tag onto the WordNet part of speech the lemmatizer expects
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

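# For instance, __remove_noise(['@user', 'Loving', 'https://t.co/abc', '!'], stop_words) should
# yield something like ['loving']: the mention and URL are blanked by the regexes, the lone
# punctuation token is dropped, and the remaining word is lemmatized and lower-cased
# (the exact output depends on the POS tagger's choices).
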
def __get_all_words(cleaned_tokens_list):
    """Yield every token across all tweets (used to build a frequency distribution)."""
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def __get_tweets_for_model(cleaned_tokens_list):
    """Convert each tweet's token list into the {token: True} feature dict NLTK expects."""
    for tweet_tokens in cleaned_tokens_list:
        yield {token: True for token in tweet_tokens}

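# For example, the cleaned tokens ['great', 'day'] become {'great': True, 'day': True} -- the
# bag-of-words boolean feature format that NaiveBayesClassifier.train expects for each sample.
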
if __name__ == "__main__":
    if __classifier is not None:
        print('train classifier [t] or use previous [p]?')
    if __classifier is None or (str(input()) + ' ').lower()[0] == 't':
        print('preprocessing data...')
        # NLTK's sample tweet corpora: 5,000 positive and 5,000 negative tweets
        positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
        negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
        positive_cleaned_tokens_list = []
        negative_cleaned_tokens_list = []
        for tokens in positive_tweet_tokens:
            positive_cleaned_tokens_list.append(__remove_noise(tokens, stop_words))
        for tokens in negative_tweet_tokens:
            negative_cleaned_tokens_list.append(__remove_noise(tokens, stop_words))
        all_pos_words = __get_all_words(positive_cleaned_tokens_list)
        freq_dist_pos = FreqDist(all_pos_words)
        print(freq_dist_pos.most_common(10))
        positive_tokens_for_model = __get_tweets_for_model(positive_cleaned_tokens_list)
        negative_tokens_for_model = __get_tweets_for_model(negative_cleaned_tokens_list)
        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]
        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]
        dataset = positive_dataset + negative_dataset
        print('dataset size: %d' % len(dataset))
        print('splitting dataset...')
        # shuffle, then use 60% of the data for training and 40% for evaluation
        random.shuffle(dataset)
        data_len = len(dataset)
        train_data = dataset[:int(data_len * 0.6)]
        test_data = dataset[int(data_len * 0.6):]
        print('training classifier...')
        __classifier = NaiveBayesClassifier.train(train_data)
        print("Accuracy is:", classify.accuracy(__classifier, test_data))
        saveClassifier(__classifier)
        # show_most_informative_features prints its table itself (and returns None)
        __classifier.show_most_informative_features(10)
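# Sketch of how this module might be used elsewhere (assumes sentiment_classifier.pickle has
# already been produced by running this file directly):
#   from sentiment_analysis import getSentencePositivity
#   score = getSentencePositivity("NLTK makes sentiment analysis approachable")
#   if score is not None:
#       print('positivity: %.2f' % score)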