# return_list.py
import pandas as pd
import re
from dadmatools.normalizer import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string
import fasttext
df = pd.read_csv("normalized_data.csv")

# Configure the DadmaTools normalizer: unify Persian characters and refine
# spacing, but keep punctuation, HTML, and stop words intact.
normalizer = Normalizer(
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=False,
    remove_html=False,
    remove_stop_word=False,
    replace_email_with="<EMAIL>",
    replace_number_with=None,
    replace_url_with="",
    replace_mobile_number_with=None,
    replace_emoji_with=None,
    replace_home_number_with=None,
)

def normalize_text(text):
    # Guard against NaN / non-string cells before normalizing.
    if pd.isnull(text) or not isinstance(text, str):
        return ""
    return normalizer.normalize(text)

columns_to_normalize = ['mavad', 'instruction']
for column in columns_to_normalize:
    df[column] = df[column].apply(normalize_text)
def preprocess_text_function(text):
    # Define the punctuation characters to be removed
    punctuations = string.punctuation
    # Remove the punctuation from the text
    text_without_punctuations = "".join([char for char in text if char not in punctuations])
    return text_without_punctuations
# TFIDF class: builds a TF-IDF index over your docs and retrieves the top-k relevant documents.
class tfidf:
    def __init__(self, docs, ngram_range, bpe_model=None):
        self._min_df = 1
        self._max_df = 0.8
        self._max_features = 3000
        self._docs = docs
        self._bpe = bpe_model
        self._ngram_range = ngram_range
        if self._bpe:
            print("we are using bpe")
            self._model_tfidf = TfidfVectorizer(
                analyzer="word",
                min_df=self._min_df,
                max_df=self._max_df,
                max_features=self._max_features,
                ngram_range=self._ngram_range,
                tokenizer=self.bpe_tokenizer,
            )
        else:
            self._model_tfidf = TfidfVectorizer(
                analyzer="word",
                min_df=self._min_df,
                max_df=self._max_df,
                max_features=self._max_features,
                ngram_range=self._ngram_range,
            )
        self._matrix = self._model_tfidf.fit_transform(docs)
        self._feature_names = self._model_tfidf.get_feature_names_out()
        # Embed every vocabulary term with a Persian fastText skip-gram model, then
        # represent each document as the TF-IDF-weighted sum of its word vectors.
        model_skipgram = fasttext.load_model('farsi-dedup-skipgram.bin')
        self._tfidf_emb_vecs = np.vstack([model_skipgram.get_word_vector(word) for word in self._feature_names])
        self._docs_emb = np.dot(self._matrix.toarray(), self._tfidf_emb_vecs)

    def bpe_tokenizer(self, text):
        # Split on the character "و" and on Latin/Persian commas.
        tokens = re.split(r'[و,،]', text)
        return tokens
    def tfidf_top_k(self, query, k=2):
        # Rank documents by cosine similarity between the query's TF-IDF vector and
        # each document's TF-IDF vector; k=-1 returns all (index, score) pairs.
        query_tfidf = self._model_tfidf.transform([query])
        doc_scores = []
        for doc in self._matrix:
            doc_scores.append(cosine_similarity(query_tfidf, doc)[0][0])
        sorted_scores = sorted(enumerate(doc_scores), key=lambda ind_score: ind_score[1], reverse=True)
        if k != -1:
            top_doc_indices = [ind for ind, score in sorted_scores[:k]]
        else:
            top_doc_indices = sorted_scores
        return top_doc_indices

    def tfidf_weighted_top_k(self, query, k=2):
        # Same ranking, but in the fastText embedding space: the query is embedded as a
        # TF-IDF-weighted sum of word vectors and compared to each document embedding.
        query_tfidf = self._model_tfidf.transform([query])
        query_emb = np.dot(query_tfidf.toarray(), self._tfidf_emb_vecs)
        doc_scores = []
        for doc in self._docs_emb:
            doc_scores.append(cosine_similarity(query_emb, doc.reshape(1, -1))[0][0])
        sorted_scores = sorted(enumerate(doc_scores), key=lambda ind_score: ind_score[1], reverse=True)
        if k != -1:
            top_doc_indices = [ind for ind, score in sorted_scores[:k]]
        else:
            top_doc_indices = sorted_scores
        return top_doc_indices
ingredients_text = df["mavad"].apply(preprocess_text_function)

def return_df(df=df):
    return df

def return_list(query, sentence=ingredients_text):
    # Build a unigram TF-IDF index over the given sentences and return the
    # indices of the 5 documents most similar to the query.
    tf_obj_word_1_1 = tfidf(docs=sentence, ngram_range=(1, 1), bpe_model=None)
    result = tf_obj_word_1_1.tfidf_top_k(query, 5)
    return result
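
# A minimal usage sketch, not part of the original module: it assumes
# "normalized_data.csv" and the fastText model "farsi-dedup-skipgram.bin" exist
# locally, and uses a hypothetical Persian ingredient query for illustration.
if __name__ == "__main__":
    sample_query = preprocess_text_function(normalize_text("گوشت و پیاز"))  # hypothetical query
    top_indices = return_list(sample_query)
    # Map the returned row indices back to the recipes in the dataframe.
    print(df.iloc[top_indices][["mavad", "instruction"]])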