-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsave_full_topics_per_doc_dist.py
54 lines (43 loc) · 2.12 KB
/
save_full_topics_per_doc_dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import json
import pickle
import numpy as np
from collections import Counter
# Load the corpus. Only its length (the number of documents) is used below.
# The encoding is pinned to UTF-8, the encoding JSON is specified to use,
# instead of relying on the platform's default locale encoding.
with open('read_cases_manualATM_text_list_bverfg230107.json', 'r', encoding='utf-8') as f:
    read_cases_manualATM_text_list = json.load(f)

# Topic count; it is also embedded in the input and output file names.
num_topics = 200
docs_count = len(read_cases_manualATM_text_list)
print('docs_count:', docs_count)

# NOTE: despite the ".json" suffix, this file is read with pickle, not json.
# SECURITY: unpickling can execute arbitrary code, so only run this script on
# an assignment file that comes from a trusted source.
with open('WardNJU_Z_assignment_num_topics=' + str(num_topics) + '.json', 'rb') as f:
    Z_assignment = pickle.load(f)

# Per-document cut-off handed to print_topics_per_doc; set to num_topics here.
topN = num_topics
def print_topics_per_doc(topN=topN):
    """Build the empirical topic distribution of every document.

    For each document index ``m`` in ``range(docs_count)``, the entries of
    ``Z_assignment[m]`` are counted. Each count is divided by the document's
    total number of entries and rounded to two decimals, and the ``topN``
    entries with the largest counts are kept.

    Despite its name, this function prints nothing; it only returns the
    result.

    Args:
        topN: Maximum number of topics kept per document, largest count
            first.

    Returns:
        A dict mapping each document index to a dict
        ``{str(topic): rounded share}``. The shares are NumPy float scalars.
        Because of the rounding and the cut-off, they need not sum to 1.
    """
    topics_prob_per_doc_all = {}  # document index -> {topic as str: share}
    for m in range(docs_count):
        # Count once and reuse: keys() and values() of one Counter are
        # aligned, so position i in both arrays refers to the same topic.
        topic_counts = Counter(Z_assignment[m])
        z_keys = np.array(list(topic_counts.keys()))
        z_counts = np.array(list(topic_counts.values()))

        # The total does not change inside the comprehension; computing it
        # once avoids re-summing the whole array for every topic.
        total = sum(z_counts)
        z_probs = np.array([round(z_count / total, 2) for z_count in z_counts])

        # Positions of the topN largest counts, in descending count order.
        top_indices = z_counts.argsort()[::-1][:topN]

        topics_prob_per_doc_all[m] = {
            str(key): prob
            for key, prob in zip(z_keys[top_indices], z_probs[top_indices])
        }
    return topics_prob_per_doc_all
topics_prob_per_doc_all = print_topics_per_doc(topN=topN)

# All output files share this stem; building it once keeps the three paths
# below from drifting apart.
output_stem = (
    'WardNJU_topics_per_doc_topN=' + str(topN)
    + '_num_topics=' + str(num_topics)
)

# NOTE: the ".json" file written here is a pickle, not JSON. The name is kept
# unchanged so that existing consumers of the file still find it.
with open(output_stem + '.json', 'wb') as f:
    pickle.dump(topics_prob_per_doc_all, f)

# Read the pickle straight back, so that what is dumped as text and printed
# below is exactly what was persisted.
with open(output_stem + '.json', 'rb') as f:
    topics_prob_per_doc_all = pickle.load(f)

# Human-readable dump: the str() of the nested dict, with an explicit encoding
# so the file does not depend on the platform's locale.
with open(output_stem + '.txt', 'w', encoding='utf-8') as f:
    f.write(str(topics_prob_per_doc_all))

print('topics_prob_per_doc_all:', topics_prob_per_doc_all)