forked from zhanzecheng/Chinese_segment_augment
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo_run.py
78 lines (67 loc) · 2.6 KB
/
demo_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
"""
# @Time : 2018/05/26 下午5:13
# @Update : 2018/09/28 上午10:30
# @Author : zhanzecheng/片刻
# @File : demo_run.py
# @Software: PyCharm
"""
import os
import jieba
from model import TrieNode
from utils import get_stopwords, load_dictionary, generate_ngram, save_model, load_model
from config import basedir
def load_data(filename, stopwords):
    """Read a text file and segment every line into words with jieba.

    Each line is stripped of surrounding whitespace, cut in jieba's accurate
    mode (``cut_all=False``), and tokens found in ``stopwords`` are dropped.

    :param filename: path of the text file to read; decoded as UTF-8
        (assumes the corpus is UTF-8 encoded -- confirm for other corpora).
    :param stopwords: container of tokens to discard (tested with ``not in``).
    :return: 2-D list, one token list per line of the file:
        [[tokens of line 1], [tokens of line 2], ..., [tokens of line n]]
    """
    # Explicit encoding: without it open() falls back to the platform default
    # (e.g. GBK or cp1252 on Windows), which garbles or rejects Chinese text.
    with open(filename, 'r', encoding='utf-8') as f:
        return [
            [tok for tok in jieba.cut(line.strip(), cut_all=False)
             if tok not in stopwords]
            for line in f
        ]
def load_data_2_root(data, trie=None, max_n=3):
    """Insert the n-grams of every tokenized sentence into a trie.

    :param data: iterable of token lists, one list per sentence (the shape
        returned by ``load_data``).
    :param trie: object exposing ``add(ngram)``. When omitted, the
        module-level ``root`` is used, so existing ``load_data_2_root(data)``
        calls behave exactly as before.
    :param max_n: value forwarded as the second argument of
        ``generate_ngram``; defaults to 3, the value previously hard-coded.
    :return: None. The trie is updated in place.
    """
    if trie is None:
        # Backward-compatible fallback; ``root`` is looked up at call time,
        # so it must have been defined at module level by then.
        trie = root
    print('------> 插入节点')
    for word_list in data:
        # Per the original author's example, with 3 the tokens
        # ['它', '是', '小', '狗'] produce
        # [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'],
        #  ['小', '狗'], ['它', '是', '小'], ['是', '小', '狗']]
        for ngram in generate_ngram(word_list, max_n):
            trie.add(ngram)
    print('------> 插入成功')
if __name__ == "__main__":
    # Demo: build (or load) the base trie, feed it a new article, report the
    # newly discovered words, and show jieba's output before/after adding them.
    root_name = basedir + "/data/root.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = basedir + '/data/dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        # Saved before the article is inserted below, so the file on disk
        # holds only the dictionary-based model.
        save_model(root, root_name)

    # Load the new article. Anchored on basedir like the other data paths so
    # the script does not depend on the current working directory
    # (assumes demo.txt sits next to dict.txt -- confirm).
    filename = basedir + '/data/demo.txt'
    data = load_data(filename, stopwords)
    # Insert the new article into the trie.
    load_data_2_root(data)

    # Keep the top 5 candidates.
    topN = 5
    result, add_word = root.find_word(topN)
    # To debug or tune the threshold, print `result`:
    # print("\n----\n", result)
    print("\n----\n", '增加了 %d 个新词, 词语和得分分别为: \n' % len(add_word))
    print('#############################')
    for word, score in add_word.items():
        print(word + ' ----> ', score)
    print('#############################')

    def _segment(sentence):
        """Return `sentence` cut by jieba, stopwords removed, '/ ' after each token."""
        return "".join(
            [(x + '/ ') for x in jieba.cut(sentence, cut_all=False) if x not in stopwords]
        )

    # Before/after comparison on a fixed sentence.
    test_sentence = '蔡英文在昨天应民进党当局的邀请,准备和陈时中一道前往世界卫生大会,和谈有关九二共识问题'
    print('添加前:')
    print(_segment(test_sentence))

    # Register every discovered word with jieba, then segment again.
    for word in add_word:
        jieba.add_word(word)
    print("添加后:")
    print(_segment(test_sentence))