-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmerge-wordlists-2.py
71 lines (56 loc) · 1.27 KB
/
merge-wordlists-2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: UTF-8 -*-
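# Merge the words from web/chinese-words.txt, web/CorpusWordlist.csv and
# web/CorpusWordPOSlist.csv into a single de-duplicated word list.
# Output: web/all-words.txt, one word per line; only words longer than one
# character are kept.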
import sys
import re
# Words in the plain-text word list are separated by tabs and newlines.
_WORD_SEPARATOR = re.compile('\n|\t')

# NOTE(review): the data files are assumed to be UTF-8, matching this
# script's own coding declaration -- confirm if any of them is GBK/GB18030.
_ENCODING = "utf-8"


def _iter_csv_words(path):
    """Yield the multi-character words found in the second column of *path*.

    Lines whose first character is '/' are skipped.  Rows with fewer than
    two comma-separated fields (blank lines, for instance) have no word
    column and are skipped as well instead of raising IndexError.
    Single-character entries are dropped.
    """
    with open(path, "r", encoding=_ENCODING) as src:
        for line in src:
            if line.startswith('/'):
                continue
            fields = line.split(',')
            if len(fields) < 2:
                continue
            word = fields[1]
            if len(word) > 1:
                yield word


def _iter_txt_words(path):
    """Yield the multi-character words of a tab/newline separated word list.

    The list is expected to contain only words longer than one character;
    any single character encountered is reported on stdout together with
    its zero-based line index and the offending line, then skipped.
    """
    with open(path, "r", encoding=_ENCODING) as src:
        for line_no, line in enumerate(src):
            for word in _WORD_SEPARATOR.split(line):
                if word == '':
                    continue
                if len(word) == 1:
                    print("char found!", line_no, line)
                    continue
                yield word


def _add_words(words, uniques):
    """Add every word to the set *uniques*; return how many were already in it.

    A word repeated within *words* itself counts as "already present" from
    its second occurrence on.
    """
    already_present = 0
    for word in words:
        if word in uniques:
            already_present += 1
        else:
            uniques.add(word)
    return already_present


def main(wordlist_csv="web/CorpusWordlist.csv",
         pos_csv="web/CorpusWordPOSlist.csv",
         words_txt="web/chinese-words.txt",
         out_path="web/all-words.txt"):
    """Merge three word lists into one de-duplicated file.

    Args:
        wordlist_csv: CSV whose columns are id, word, occurrence count,
            frequency (%), cumulative frequency (%),
            e.g. ``1,的,744863,7.7946,7.7946``.
        pos_csv: CSV whose columns are id, word, part-of-speech tag,
            part-of-speech name, occurrence count, frequency, cumulative
            frequency, e.g. ``1,的,u,助词,744863,7.7946,7.7946``.
        words_txt: plain-text list of words separated by tabs/newlines.
        out_path: destination file; receives one word per line.

    After merging the second and the third input, the number of their
    entries that were already known is printed.  The output is written in
    sorted order so that repeated runs produce identical files (plain set
    iteration order varies between interpreter runs).
    """
    uniques = set()

    # The first list only seeds the set; nothing is reported for it.
    _add_words(_iter_csv_words(wordlist_csv), uniques)

    common = _add_words(_iter_csv_words(pos_csv), uniques)
    print(common, "common words found")

    common = _add_words(_iter_txt_words(words_txt), uniques)
    print(common, "common words found")

    with open(out_path, "w", encoding=_ENCODING) as out:
        out.writelines(word + '\n' for word in sorted(uniques))


if __name__ == "__main__":
    main()