-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathformatting_text.py
65 lines (44 loc) · 1.45 KB
/
formatting_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from pymongo import MongoClient
import pymorphy2
import re
# Shared morphological analyzer used by norm_word and trashpos.
Morph = pymorphy2.MorphAnalyzer()

# MongoDB connection on localhost:27017; posts live in the "posts"
# collection of the "restotexts" database.
Mclient = MongoClient(host='localhost', port=27017)
Db = Mclient['restotexts']
Mposts = Db['posts']
def norm_word(word):
    """Return the normal form of the first parse Morph produces for *word*."""
    parses = Morph.parse(word)
    first_parse = parses[0]
    return first_parse.normal_form
def trashpos(word):
    """Tell whether the first parse of *word* carries a tag to filter out.

    Returns True when any grammeme listed below is contained in the tag of
    the first parse returned by Morph.parse(word), otherwise False.
    """
    unwanted = (
        'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ',
        'Apro', 'Ques', 'Prdx', 'NUMR', 'Dmns',
    )
    first_tag = Morph.parse(word)[0].tag
    for grammeme in unwanted:
        if grammeme in first_tag:
            return True
    return False
def formatting(text):
    """Keep only Cyrillic letters (А-Я, а-я, Ё, ё) and spaces, lowercased.

    Every run of other characters is replaced by a single space before the
    result is converted to lower case.
    """
    return re.sub(r"[^А-Яа-яЁё ]+", ' ', text).lower()
def del_spaces(text):
    """Collapse every run of consecutive spaces in *text* into one space."""
    return re.sub(" +", " ", text)
def text2list(text):
    """Split *text* on single spaces; adjacent spaces yield empty items."""
    pieces = text.split(sep=' ')
    return pieces
def list2text(l):
    """Join the strings in *l* into one string separated by single spaces."""
    separator = ' '
    return separator.join(l)
# Load the stop-word list: one entry per line of trash.txt. The context
# manager closes the file right after reading (and on error) instead of
# keeping it open for the whole database pass.
with open('trash.txt', 'r') as File_trash:
    Trash = File_trash.read().split('\n')

# Frozen-set copy so the per-word membership test below is O(1).
trash_words = frozenset(Trash)

# For every post that has text: clean it, normalize each token, drop tokens
# with an unwanted tag or present in the stop-word list, and store the result
# in the post's "formatted_text" field.
for post in Mposts.find():
    post_id = post.get("_id")  # not named `id`, to avoid shadowing the builtin
    text = post.get('text')
    if not text:
        continue

    words = [norm_word(word) for word in text2list(formatting(text))]

    kept = []
    for word in words:
        # NOTE(review): `word` is already normalized here, so the second
        # norm_word call looks redundant; it is kept so the filter result
        # stays exactly as before. Confirm before removing.
        if trashpos(word) or norm_word(word) in trash_words:
            # Trace output: the dropped word and the full normalized list.
            print(word.upper(), words)
        else:
            kept.append(word)

    text = del_spaces(list2text(kept))
    Mposts.update_one({"_id": post_id}, {"$set": {"formatted_text": text}})