-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathparser_module.py
48 lines (40 loc) · 1.62 KB
/
parser_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from document import Document
class Parse:
    """Tokenizer / normalizer for tweet documents.

    Lower-cases terms, removes English stop words, and builds a
    per-document term-frequency dictionary wrapped in a Document.
    """

    def __init__(self):
        # frozenset gives O(1) membership tests in the hot tokenize loop.
        # NOTE: NLTK's English stop-word list is entirely lower-case.
        self.stop_words = frozenset(stopwords.words('english'))

    def parse_sentence(self, text):
        """
        Tokenize ``text``, lower-case every token, and drop English stop words.

        :param text: raw text string to process.
        :return: list of lower-cased tokens with stop words removed.
        """
        text_tokens = word_tokenize(text)
        # BUG FIX: lower-case BEFORE the stop-word test. The stop-word set
        # is lower-case, so capitalized stop words ("The", "And", "IS")
        # previously slipped through the filter untouched.
        return [token for token in (w.lower() for w in text_tokens)
                if token not in self.stop_words]

    def parse_doc(self, doc_as_list):
        """
        Break a tweet record (list) into its fields and build a Document.

        :param doc_as_list: list representing the tweet; expected order:
            [tweet_id, tweet_date, full_text, url, retweet_text,
             retweet_url, quote_text, quote_url, ...].
        :return: Document object with corresponding fields.
        """
        # Unpack the first eight positional fields in one step instead of
        # eight separate index reads.
        (tweet_id, tweet_date, full_text, url,
         retweet_text, retweet_url, quote_text, quote_url) = doc_as_list[:8]

        tokenized_text = self.parse_sentence(full_text)
        doc_length = len(tokenized_text)  # length after text operations.

        # term -> frequency within this document; dict.get avoids the
        # double lookup of the previous `in term_dict.keys()` pattern.
        term_dict = {}
        for term in tokenized_text:
            term_dict[term] = term_dict.get(term, 0) + 1

        return Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict,
                        doc_length)