-
Notifications
You must be signed in to change notification settings - Fork 87
/
Copy pathexport-csv.py
executable file
·68 lines (62 loc) · 2.58 KB
/
export-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
"""
Export the top terms and top document IDs of topic models generated by NMF to comma-delimited (CSV) files.
"""
import os.path
import logging as log
from optparse import OptionParser
import unsupervised.nmf, unsupervised.rankings
# --------------------------------------------------------------
def _write_rankings_csv(out_path, labels, rows):
    """Write one ranking table to ``out_path`` as a UTF-8 encoded CSV file.

    The header row is ``Rank`` followed by one column per entry in ``labels``.
    Each item of ``rows`` holds the cell values for one rank position (one
    value per topic) and is written prefixed with its 1-based rank.
    """
    # Imported locally so this block does not depend on a new file-level import.
    import csv

    # newline="" lets the csv module own the line endings; "\n" keeps the
    # terminator that the previous hand-written output used.
    with open(out_path, "w", encoding="utf-8", newline="") as fout:
        writer = csv.writer(fout, lineterminator="\n")
        writer.writerow(["Rank"] + [str(label) for label in labels])
        for rank, row in enumerate(rows, start=1):
            # csv.writer quotes cells containing commas, quotes or newlines,
            # which the previous ",%s" formatting silently corrupted.
            writer.writerow([rank] + [str(cell) for cell in row])


def main():
    """Export top terms and top document IDs for each NMF results file.

    Command line: ``[options] results_file1 results_file2 ...``

    Options:
        -t / --top     number of top terms and documents to write (default 20)
        -o / --outdir  base output directory (default: current directory)

    For every results file two CSV files are written to the output directory:
    ``<base>_top<N>_terms.csv`` and ``<base>_top<N>_docids.csv``, where
    ``<base>`` is the input file name without its extension and ``<N>`` is the
    requested ``--top`` value. The number of rows actually written is capped
    by the available ranking size / number of documents.

    Exits via ``parser.error`` (status 2) when no results file is given.
    """
    parser = OptionParser(usage="usage: %prog [options] results_file1 results_file2 ...")
    parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms and documents to write (default=20)", default=20)
    parser.add_option("-o", "--outdir", action="store", type="string", dest="dir_out", help="base output directory (default is current directory)", default=None)
    (options, args) = parser.parse_args()
    if not args:
        parser.error("Must specify at least one topic modeling results file produced by NMF")
    log.basicConfig(level=log.INFO, format='%(message)s')

    # Output directory for CSV files
    dir_out = os.getcwd() if options.dir_out is None else options.dir_out

    # Load each cached ranking set
    for in_path in args:
        (doc_ids, terms, term_rankings, partition, W, H, labels) = unsupervised.nmf.load_nmf_results(in_path)
        log.info("- Loaded model with %d topics from %s", len(term_rankings), in_path)
        base_name = os.path.splitext(os.path.basename(in_path))[0]

        # Write the top terms for each topic
        m = unsupervised.rankings.term_rankings_size(term_rankings)
        actual_top = min(options.top, m)
        term_path = os.path.join(dir_out, "%s_top%d_terms.csv" % (base_name, options.top))
        log.info("Writing top %d terms to %s", options.top, term_path)
        term_rows = [[ranking[pos] for ranking in term_rankings] for pos in range(actual_top)]
        _write_rankings_csv(term_path, labels, term_rows)

        # Write the top document IDs for each topic
        doc_rankings = unsupervised.nmf.generate_doc_rankings(W)
        actual_top = min(options.top, len(doc_ids))
        doc_path = os.path.join(dir_out, "%s_top%d_docids.csv" % (base_name, options.top))
        log.info("Writing top %d document IDs to %s", options.top, doc_path)
        # Each document ranking holds indices into doc_ids, not the IDs themselves.
        doc_rows = [[doc_ids[ranking[pos]] for ranking in doc_rankings] for pos in range(actual_top)]
        _write_rankings_csv(doc_path, labels, doc_rows)
# --------------------------------------------------------------
# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()