# export-csv.py

#!/usr/bin/env python
"""
Export topic modeling results generated by NMF, to a comma-delimited file.
"""
import csv
import logging as log
import os.path
from optparse import OptionParser

import unsupervised.nmf
import unsupervised.rankings

# --------------------------------------------------------------

def _write_rankings_csv(out_path, labels, rows):
	"""
	Write a table of ranked entries to a UTF-8 encoded, comma-delimited file.

	The first line is the header "Rank" followed by one column per label. Each
	following line holds a 1-based rank and then the entries of that row.
	Entries containing commas, quotes or line breaks are quoted by the csv
	module, so the resulting file always stays parseable.

	out_path -- path of the file to create (an existing file is overwritten)
	labels -- column labels, written after the leading "Rank" column
	rows -- iterable of sequences; the i-th row is written with rank i+1
	"""
	# newline="" together with an explicit "\n" terminator gives the same line
	# endings on every platform and stops the text layer from rewriting them.
	with open(out_path, "w", encoding="utf-8", newline="") as fout:
		writer = csv.writer(fout, lineterminator="\n")
		writer.writerow(["Rank"] + list(labels))
		for rank, row in enumerate(rows, start=1):
			writer.writerow([rank] + list(row))

def main():
	"""
	Command-line entry point.

	For every NMF results file given as a positional argument, two CSV files
	are written to the output directory: "<base>_top<N>_terms.csv" with the
	top-ranked terms of each topic and "<base>_top<N>_docids.csv" with the
	top-ranked document IDs of each topic. <base> is the input file name
	without its extension and <N> is the requested --top value. The number of
	rows actually written may be smaller when fewer entries are available.

	Exits through the option parser's error handler when no results file is
	given.
	"""
	parser = OptionParser(usage="usage: %prog [options]  results_file1 results_file2 ...")
	parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms and documents to write (default=20)", default=20)
	parser.add_option("-o","--outdir", action="store", type="string", dest="dir_out", help="base output directory (default is current directory)", default=None)
	(options, args) = parser.parse_args()
	if not args:
		parser.error( "Must specify at least one topic modeling results file produced by NMF" )
	log.basicConfig(level=log.INFO, format='%(message)s')

	# Output directory for CSV files
	dir_out = os.getcwd() if options.dir_out is None else options.dir_out

	# Load each cached ranking set
	for in_path in args:
		(doc_ids, _terms, term_rankings, _partition, W, _H, labels) = unsupervised.nmf.load_nmf_results( in_path )
		log.info( "- Loaded model with %d topics from %s", len(term_rankings), in_path )
		base_name = os.path.splitext( os.path.split(in_path)[-1] )[0]

		# Write top terms for each topic
		m = unsupervised.rankings.term_rankings_size( term_rankings )
		actual_top = min(options.top, m)
		term_path = os.path.join(dir_out, "%s_top%d_terms.csv" % (base_name, options.top))
		log.info( "Writing top %d terms to %s", options.top, term_path )
		# Row at position pos holds the pos-th term of every topic's ranking
		term_rows = ( [ranking[pos] for ranking in term_rankings] for pos in range(actual_top) )
		_write_rankings_csv( term_path, labels, term_rows )

		# Write top document IDs for each topic
		doc_rankings = unsupervised.nmf.generate_doc_rankings( W )
		actual_top = min(options.top, len(doc_ids))
		doc_path = os.path.join(dir_out, "%s_top%d_docids.csv" % (base_name, options.top))
		log.info( "Writing top %d document IDs to %s", options.top, doc_path )
		# Ranking entries are used as indices into doc_ids to obtain the IDs
		doc_rows = ( [doc_ids[ranking[pos]] for ranking in doc_rankings] for pos in range(actual_top) )
		_write_rankings_csv( doc_path, labels, doc_rows )

# --------------------------------------------------------------

# Run the exporter only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
	main()