# Running.py (forked from fajri91/Search-Engine-TFIDF-with-Query-Expansion)
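# Overview (descriptive comment, not part of the original file): this script builds a
# small TF-IDF search engine over the documents in ./clean_doc_no_stem2/, adds a simple
# bigram-based query expansion step (the first and last query terms can be replaced by a
# corpus bigram containing them that occurs more than twice), ranks documents by cosine
# similarity, and prints titles and snippets read from ./doc2/.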
import os
import re
import time
from heapq import nlargest
from operator import itemgetter

import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# load stopwords
def load_stopword():
    stopwords = []
    with open('stopword') as ins:
        for line in ins:
            stopwords.append(line.replace('\n', ''))
    return stopwords
# load data: list every document file and collect the corpus vocabulary
def load_vocab():
    dictio = {}
    allFile = []
    # for each document in the dataset
    for data in os.listdir("./clean_doc_no_stem2/"):
        allFile.append(data)
        with open('./clean_doc_no_stem2/' + data, 'r') as start:
            content = start.readlines()
        sentence = " ".join(content)
        for word in sentence.split():
            dictio[word] = 1
    return allFile, list(dictio.keys())
def clean(query):
    # keep only alphanumerics, spaces, '.' and ',', lowercase, and drop stopwords
    query = re.sub('[^A-Za-z0-9 .,]+', '', query)
    query = query.lower()
    clean_query = ''
    for word in filter(None, re.split("[., ]", query)):
        if word not in stopwords:
            clean_query += (word + ' ')
    return clean_query
def init():
    # vocab = load_vocab()
    # stopwords = load_stopword()
    print('finish building TFIDF - Space')
def getContent(kalimat):
    # return the first 30 words of a document body as a result snippet
    n = 30
    ret = ''
    kalimat = re.sub('[^A-Za-z0-9 .,]+', '', kalimat)
    kalimat = kalimat.lower()
    for word in kalimat.split():
        ret += word + ' '
        n = n - 1
        if n == 0:
            break
    return ret
def getVectorTFIDF():
    # build the TF-IDF vectorizer, the training corpus, and the corpus bigrams
    cv_tfidf = TfidfVectorizer(vocabulary=vocab)
    train_set = []
    all_sentence = ''
    # documents
    for f_part in allFile:
        with open('./clean_doc_no_stem2/' + f_part, 'r') as start:
            content = start.readlines()
        sentence = " ".join(content)
        all_sentence += (' ' + sentence)
        train_set.append(sentence)
    bigram = ngrams(all_sentence.split(), 2)
    return cv_tfidf, train_set, bigram
stopwords = load_stopword()
allFile, vocab = load_vocab()
cv_tfidf, train_set, bigram = getVectorTFIDF()
tfidf_matrix_train = cv_tfidf.fit_transform(train_set)  # finds the tf-idf scores with normalization
fdist = nltk.FreqDist(bigram)  # frequency distribution of all corpus bigrams
keyNya = sorted(fdist, key=fdist.get)  # bigram tuples ordered by frequency
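
# For reference (illustrative comment, not from the original file): fdist maps each
# corpus bigram, a 2-tuple of words, to its count, e.g. fdist[('information', 'retrieval')]
# might be 12, and keyNya is the list of those bigram tuples ordered by count; the
# example bigram and count here are hypothetical.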
def search(words, nRank=7):
    # rank the corpus against the (expanded) query and print the top nRank results
    start = time.time()
    query = clean(words)
    score = {}
    cv_tfidf = TfidfVectorizer(vocabulary=vocab)
    # query expansion based on the corpus bigram frequency distribution
    final_query = ''
    qu = query.split()
    for i in range(0, len(qu)):
        if i == 0:
            # try to expand the first query word with a frequent bigram that ends with it
            ada = 0
            for key in keyNya:
                if qu[i] == key[1] and fdist[key] > 2:
                    final_query += (' ' + " ".join(key))
                    ada = 1
                    break
            if ada == 0:
                final_query += (' ' + qu[i])
        elif i == len(qu) - 1:
            # try to expand the last query word with a frequent bigram that starts with it
            ada = 0
            for key in keyNya:
                if qu[i] == key[0] and fdist[key] > 2:
                    final_query += (' ' + " ".join(key))
                    ada = 1
                    break
            if ada == 0:
                final_query += (' ' + qu[i])
        else:
            final_query += (' ' + qu[i])
    query_vector = cv_tfidf.fit_transform([final_query])
    res = cosine_similarity(query_vector, tfidf_matrix_train)
    print(time.time() - start)
    i = 0
    for f_part in allFile:
        f_part = f_part.replace('\n', '')
        score[f_part] = res[0][i]
        i += 1
    # print at most nRank of the 50 highest-scoring documents
    print('\nSEARCH RESULT\n')
    rank = 1
    for name, n in nlargest(50, score.items(), key=itemgetter(1)):
        # open the result document: its first line is the title, the body starts on line 4
        fi = open('./doc2/' + name, 'r')
        print('\n' + str(rank) + ". Title: " + fi.readline().replace('\n', ''))
        print('Weight: ' + str(n))
        fi.readline()
        fi.readline()
        print(getContent(" ".join(fi.readlines())))
        fi.close()
        rank += 1
        if rank > nRank:
            print('\n')
            break
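
# A minimal usage sketch (not part of the original script). It assumes the 'stopword'
# file and the ./clean_doc_no_stem2/ and ./doc2/ directories exist and are populated,
# since the module-level code above loads them on import; the query string is purely
# illustrative.
if __name__ == '__main__':
    init()
    search('example query terms', nRank=5)  # hypothetical query; prints the top 5 results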