-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmain_file.py
188 lines (151 loc) · 7.31 KB
/
main_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from apiclient.discovery import build
from comment_threads import get_comment_threads
from search import youtube_search
import nltk
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import precision
from nltk.metrics import recall
from nltk.metrics import f_measure
import collections
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import PlaintextCorpusReader
corpus_pathpos = 'C:/Users/User/Downloads/NLP Project Files/project/project/polarity/pos'
corpus_pathneg = 'C:/Users/User/Downloads/NLP Project Files/project/project/polarity/neg'
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy
from collections import Counter
stopfile='english'
######################
# Corpus readers over the two polarity directories; the '.*' pattern selects
# every file found under each root.
pos_wordlist = PlaintextCorpusReader(corpus_pathpos, '.*')
neg_wordlist = PlaintextCorpusReader(corpus_pathneg, '.*')


def _read_reviews(reader):
    """Return one string per file in *reader*, its words joined by spaces.

    Each word is passed through ``str`` before joining, as the script has
    always done. The loop variable is named ``fileid`` so the builtin ``id``
    is not rebound at module scope.
    """
    return [
        ' '.join(str(word) for word in reader.words(fileid))
        for fileid in reader.fileids()
    ]


# One space-joined string per review file, kept in reader order.
positive_reviews = _read_reviews(pos_wordlist)
negative_reviews = _read_reviews(neg_wordlist)
# Settings handed to `build` to create the API client object.
# DEVELOPER_KEY holds a masked placeholder value in this file.
DEVELOPER_KEY = "************************"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
youtube = build(
    YOUTUBE_API_SERVICE_NAME,
    YOUTUBE_API_VERSION,
    developerKey=DEVELOPER_KEY,
)

# File ids of the movie_reviews corpus for the 'neg' and 'pos' categories.
negids, posids = movie_reviews.fileids('neg'), movie_reviews.fileids('pos')
def bag_of_words1(words):
    """Return a feature dict that maps every item of *words* to True.

    *words* may be any iterable of hashable items; duplicates collapse into
    a single key. Note that a plain string is iterated character by
    character, so pass a list or set of tokens, not a sentence.
    """
    return {word: True for word in words}
def bag_of_words(words):
    """Return a feature dict that maps every item of *words* to True.

    *words* may be any iterable of hashable items; duplicates collapse into
    a single key. Note that a plain string is iterated character by
    character, so pass a list or set of tokens, not a sentence.
    """
    return {word: True for word in words}
# Bar-chart geometry: `ind` is the x position handed to ax.bar and `width`
# is the bar width. `N` carries the same value as `ind`.
N = ind = 10
width = 0.35

# Driver state: `flag` keeps the query loop running while it equals 1 and
# `i` is the running video index.
flag, i = 1, 0

# Comment storage filled per video, plus a second (initially empty) list.
result, commentEncoded = [], []
# The prompt function is called raw_input on Python 2 and input on Python 3.
try:
    _read_line = raw_input
except NameError:
    _read_line = input


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or None when undefined.

    Both values go through ``float`` so the division never truncates. None
    is returned when the denominator is zero or when either value cannot be
    converted to a number (the element types of the statistics lists are not
    visible in this file).
    """
    try:
        return float(numerator) / float(denominator)
    except (TypeError, ValueError, ZeroDivisionError):
        return None


def _save_bar_chart(first_value, second_value, legend_labels, title, figname):
    """Draw a two-bar chart, save it to *figname* and release the figure.

    The bars are placed at the module-level ``ind`` and ``ind + width``
    positions, labelled with their integer heights, and described by
    *legend_labels* (a pair of strings).
    """
    fig, ax = plt.subplots()
    first_bars = ax.bar(ind, first_value, width, color='r')
    second_bars = ax.bar(ind + width, second_value, width, color='y')
    ax.set_ylabel('Count')
    ax.set_title(title)
    # One-element tuple: the former ('G1') was a bare string and was
    # therefore consumed character by character.
    ax.set_xticklabels(('G1',))
    ax.legend((first_bars[0], second_bars[0]), legend_labels)
    for bars in (first_bars, second_bars):
        for rect in bars:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                    '%d' % int(height),
                    ha='center', va='bottom')
    fig.savefig(figname)
    # Two figures are created per video; close each one once it is saved so
    # they do not pile up in memory.
    plt.close(fig)


def _report_metrics(classifier, labelled_feats):
    """Print accuracy plus per-label precision, recall and F-measure.

    *labelled_feats* is a list of ``(feature_dict, label)`` pairs. The index
    used here is local to this function, so it cannot disturb the caller's
    video counter.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for index, (feats, label) in enumerate(labelled_feats):
        refsets[label].add(index)
        testsets[classifier.classify(feats)].add(index)
    print("Accuracy: %s" % (nltk.classify.accuracy(classifier, labelled_feats),))
    for label, heading in (('pos', 'Positive'), ('neg', 'Negative')):
        reference, predicted = refsets[label], testsets[label]
        print("%s Precision: %s" % (heading, precision(reference, predicted)))
        print("%s Recall: %s" % (heading, recall(reference, predicted)))
        print("%s F-measure: %s" % (heading, f_measure(reference, predicted)))


def _count_comment_labels(classifier, comments, ignored_words):
    """Classify every comment and return a Counter of the predicted labels.

    Each comment is reduced to its ASCII characters, split on whitespace,
    stripped of *ignored_words* (a set) and classified as a bag of words.
    Assumes each comment is a text string, as the `.encode` call requires.
    """
    labels = []
    for comment in comments:
        # encode/decode drops non-ASCII characters and yields text again on
        # both Python 2 and Python 3.
        text = comment.encode('ascii', 'ignore').decode('ascii')
        tokens = set(text.split()) - ignored_words
        labels.append(classifier.classify(bag_of_words1(tokens)))
    return Counter(labels)


def main():
    """Train the sentiment classifier, then analyse videos for each query.

    The classifier is trained on the movie_reviews corpus and evaluated once
    against the local polarity reviews. For every query typed by the user,
    each returned video gets its statistics printed, a likes/dislikes chart
    (img<N>.png), a sentiment count of its comments and a positive/negative
    chart (image<N>.png). The loop repeats while the user answers 1.
    """
    trainfeats = (
        [(bag_of_words(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        + [(bag_of_words(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    )
    # The reviews are space-joined strings; split them back into words so
    # the features are words and not single characters.
    testfeats = (
        [(bag_of_words(review.split()), 'neg') for review in negative_reviews]
        + [(bag_of_words(review.split()), 'pos') for review in positive_reviews]
    )
    ignored_words = set(stopwords.words(stopfile))

    # Training and evaluation do not depend on the query: do both once.
    nb_classifier = NaiveBayesClassifier.train(trainfeats)
    _report_metrics(nb_classifier, testfeats)

    flag = 1
    while flag == 1:  # the user may submit another query after each run
        pos_neg_list = []  # (positive, negative) comment counts per video
        query = _read_line('enter a query word:')
        (vid_ids, vid_titles, vid_likes, vid_dislikes,
         comment_count) = youtube_search(query)
        print("no. of videos %s" % (len(vid_ids),))
        for i, vid_id in enumerate(vid_ids):
            print("\n")
            print("%d . VIDEO TITLE:  %s" % (i + 1, vid_titles[i]))
            print("no. of likes:  %s" % (vid_likes[i],))
            print("no. of dislikes:  %s" % (vid_dislikes[i],))
            print("no. of total comments:  %s" % (comment_count[i],))
            comments = get_comment_threads(youtube, vid_id)
            print("no.of comments extracted %s" % (len(comments),))
            _save_bar_chart(vid_likes[i], vid_dislikes[i],
                            ('Likes', 'Dislikes'),
                            'Likes and Dislikes Chart',
                            "img" + str(i) + ".png")

            label_counts = _count_comment_labels(nb_classifier, comments,
                                                 ignored_words)
            print(label_counts)  # total pos and neg counts for this video
            positive_count = label_counts['pos']
            negative_count = label_counts['neg']
            # A ratio prints as None when it cannot be computed (for
            # example, zero dislikes or zero negative comments).
            print("like and dislike ratio %s"
                  % (_safe_ratio(vid_likes[i], vid_dislikes[i]),))
            print("pos neg ratio %s"
                  % (_safe_ratio(positive_count, negative_count),))
            pos_neg_list.append((positive_count, negative_count))
            _save_bar_chart(positive_count, negative_count,
                            ('Positive', 'Negative'),
                            'Positive and Negative Count',
                            "image" + str(i) + ".png")
        print(pos_neg_list)
        answer = _read_line(
            "do you want to enter another query?(1 for yes and 0 for no: )")
        try:
            flag = int(answer)
        except ValueError:
            # Anything that is not a number is treated as "no".
            flag = 0


if __name__ == "__main__":
    main()