-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmerge-all-freqs.py
55 lines (42 loc) · 1.23 KB
/
merge-all-freqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: UTF-8 -*-
# Word-level source: CorpusWordlist + CorpusWordPOSlist --> merged-freqs.txt
# Its total size is the sum of the two lists: 8643287 words + 8989573 words.
size1 = sum((8643287, 8989573))
# Char-level source: CUHK-rel-freqs + WHK-rel-freqs --> char-rel-freq.txt
# Its total size is the sum of the two lists: 11825308 chars + 2943956 chars.
size2 = sum((11825308, 2943956))
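# Combine the word frequencies (web/merged-freqs.txt) & the character
# frequencies (web/char-rel-freq-ZH.txt).
# output: web/all-freqs.txt -- words and single characters together,
# sorted by descending frequency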
import sys
import re
def load_word_freqs(path):
    """Read a word-frequency file into a dict.

    Each data line has the form "word<space>freq\\n". Lines whose first
    character is '/' are skipped, as are blank lines. If a word appears
    more than once, the last occurrence wins.

    NOTE(review): the file is opened as UTF-8 to match this script's
    declared source coding -- confirm the data files use that encoding.

    Args:
        path: path of the word-frequency file.

    Returns:
        dict mapping each word (first space-separated field) to its
        frequency (second field, parsed as float).

    Raises:
        IndexError / ValueError: on a non-blank line that does not contain
            a parsable second field.
    """
    freqs = {}
    with open(path, "r", encoding="utf-8") as src:
        for line in src:
            if line.startswith('/') or not line.strip():
                continue
            fields = line.split(' ')
            # float() tolerates the trailing '\n' left on the last field.
            freqs[fields[0]] = float(fields[1])
    return freqs


def merge_char_freqs(freqs, path, word_size, char_size):
    """Fold a character-frequency file into ``freqs`` in place.

    Each data line has the form "char,freq\\n". Lines whose first character
    is '/' are skipped, as are blank lines. A character already present in
    ``freqs`` is replaced by the size-weighted average of both frequencies;
    a new character is stored as if its frequency in the word corpus were 0.

    NOTE(review): entries that occur only in the word file are left at
    their original value (not rescaled by word_size / (word_size +
    char_size)) -- confirm this asymmetry is intended.

    Args:
        freqs: dict of existing frequencies; mutated by this call.
        path: path of the character-frequency file.
        word_size: size of the corpus behind the values already in ``freqs``.
        char_size: size of the corpus behind the file at ``path``.

    Returns:
        Number of lines whose character was already present in ``freqs``.

    Raises:
        IndexError / ValueError: on a non-blank line that does not contain
            a parsable second field.
    """
    total = word_size + char_size
    common = 0
    with open(path, "r", encoding="utf-8") as src:
        for line in src:
            if line.startswith('/') or not line.strip():
                continue
            fields = line.split(',')
            char = fields[0]
            char_freq = float(fields[1])  # trailing '\n' is accepted by float()
            if char in freqs:
                common += 1
                weighted = freqs[char] * word_size + char_freq * char_size
            else:
                weighted = char_freq * char_size
            freqs[char] = weighted / total
    return common


def write_freqs(freqs, path):
    """Write ``freqs`` to ``path`` as "entry<space>freq" lines.

    Entries are ordered by descending frequency; ties keep the dict's
    insertion order because ``sorted`` is stable.

    Args:
        freqs: dict mapping entry to frequency.
        path: path of the output file (overwritten).
    """
    ordered = sorted(freqs, key=freqs.get, reverse=True)
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            entry + " " + str(freqs[entry]) + '\n' for entry in ordered
        )


def main():
    """Merge word and character frequencies and write web/all-freqs.txt.

    Uses the module-level corpus sizes ``size1`` (word corpus) and ``size2``
    (character corpus) as the weights.
    """
    freqs = load_word_freqs("web/merged-freqs.txt")
    common = merge_char_freqs(freqs, "web/char-rel-freq-ZH.txt", size1, size2)
    print(len(freqs), "unique chars / words")
    print(common, "common chars found")
    write_freqs(freqs, "web/all-freqs.txt")


if __name__ == "__main__":
    main()