# Review preamble (text before the first "diff --git" header is not part of any hunk).
#
# Purpose of this patch, as far as the hunks below show:
#   * nerpa.py: imports the new `postprocessing` module, creates an `origin_file` dict in
#     run(), records `origin_file[cur_antismash_out] = args.seqs` after the antiSMASH call,
#     passes the dict into get_antismash_v3_compatible_input_paths() and
#     create_predictions_by_antiSAMSHout(), and calls postprocessing.postproccesing()
#     just before log.finish().
#   * get_antismash_v3_compatible_input_paths(): gains an `origin_file` parameter; each
#     converted antiSMASH v5 path is mapped to the origin of its v5 input (or to the v5
#     path itself), and every v3 path not yet in the dict is mapped to itself.
#   * src/nerpa_pipeline/postprocessing.py (new file):
#       - originPathInReport(): reads <output_dir>/report.csv, inserts "ContigId" and
#         "PartId" columns after "PredictionFileName", replaces each row's prediction path
#         with origin_file[path], and rewrites the file in place.
#       - originPathInDetailReport(): for every *.match file in <output_dir>/details,
#         replaces lines whose text (minus the last character) is a key of origin_file with
#         "<origin>\t<contig id>\t<part id>" and rewrites the file in place.
#       - postproccesing(): runs the two functions above in that order.
#   * src/nerpa_pipeline/predictions_preprocessor.py: shebang changed to python3; the
#     `origin_file` dict is threaded through create_predictions_by_antiSAMSHout(),
#     gen_predictions() and gen_prediction_for_one_orfs_part(), which registers every
#     written prediction file under the origin of the directory two levels above
#     `input_file_name`.
#
# NOTE(review): in this copy the line breaks inside the hunks appear to be collapsed into
#   spaces; the patch presumably has to be restored from the original commit before it
#   can be applied.
# NOTE(review): `origin_file` is only filled in the `else` branch of run(); the
#   `args.predictions is not None` branch shown here does not touch it, while
#   postproccesing() is called unconditionally. originPathInReport() indexes
#   `origin_file[curPath]` without a fallback, so this looks like a KeyError whenever
#   report.csv has data rows in that mode -- confirm against the full nerpa.py.
# NOTE(review): contig/part ids are taken with split('_')[-4] and split('_')[-1] on the
#   file name; a name with fewer than four '_'-separated fields raises IndexError.
# NOTE(review): originPathInReport() reads lines[0] unconditionally (IndexError on an
#   empty report.csv) and lines[0].index("PredictionFileName") raises ValueError if the
#   column is absent.
# NOTE(review): originPathInDetailReport() matches on line[:-1], i.e. it assumes every
#   line ends with a newline; the rewritten line gets its newline only because `partid`
#   is cut from the unstripped line. A final line without a trailing newline would lose
#   its last character in the lookup.
# NOTE(review): gen_prediction_for_one_orfs_part() looks up
#   origin_file['/'.join(input_file_name.split('/')[:-2])] without a fallback; this
#   presumably relies on the antiSMASH directory string being byte-identical to the key
#   stored earlier (no trailing slash, '/' separators) -- verify.
# NOTE(review): report.csv is opened for the csv module without newline=''; the csv
#   documentation recommends newline='' for both reading and writing.
# NOTE(review): the function name `postproccesing` is misspelled, but consistently in
#   both the definition and the call site.
#
diff --git a/nerpa.py b/nerpa.py index 943b0a9..d9bcd9f 100755 --- a/nerpa.py +++ b/nerpa.py @@ -15,6 +15,7 @@ site.addsitedir(nerpa_init.python_modules_dir) import predictions_preprocessor +import postprocessing import nerpa_utils import handle_rban import logger @@ -202,7 +203,7 @@ def copy_prediction_list(args, main_out_dir): return new_prediction_path -def get_antismash_v3_compatible_input_paths(listing_fpath, list_of_paths, output_dir, log): +def get_antismash_v3_compatible_input_paths(listing_fpath, list_of_paths, output_dir, log, origin_file): ''' Parses all antiSMASH-related options, detects all relevant output dirs (either with .json [aS v.5] or with ./txt/ & ./nrpspks_predictions_txt [aS v.3], @@ -270,7 +271,12 @@ def _is_antiSMASHv5_path(path): converted_antiSMASH_v5_paths = convert_antiSMASH_v5(antiSMASHv5_paths + ['-o', converted_antiSMASH_v5_outputs_dir, '-m', 'hybrid']) antiSMASHv3_paths += converted_antiSMASH_v5_paths + for (covant5, origin5) in zip(converted_antiSMASH_v5_paths, antiSMASHv5_paths): + origin_file[covant5] = origin_file[origin5] if origin5 in origin_file else origin5 log.info("\n======= Done with Preprocessing antiSMASH v5 inputs") + for path3 in antiSMASHv3_paths: + if path3 not in origin_file: + origin_file[path3] = path3 return antiSMASHv3_paths @@ -281,6 +287,8 @@ def run(args, log): log.set_up_file_handler(output_dir) log.start() + origin_file = {} + if args.predictions is not None: path_predictions = copy_prediction_list(args, output_dir) else: @@ -301,10 +309,11 @@ def run(args, log): '--cpus', str(args.threads), args.seqs] nerpa_utils.sys_call(command, log, cwd=output_dir) antismash_out_dirs.append(cur_antismash_out) + origin_file[cur_antismash_out] = args.seqs path_predictions = predictions_preprocessor.create_predictions_by_antiSAMSHout(get_antismash_v3_compatible_input_paths( listing_fpath=args.antismash_out, list_of_paths=antismash_out_dirs, - output_dir=output_dir, log=log), output_dir, log) + output_dir=output_dir, 
log=log, origin_file=origin_file), output_dir, log, origin_file) input_configs_dir = args.configs_dir if args.configs_dir else nerpa_init.configs_dir current_configs_dir = os.path.join(output_dir, "configs") @@ -345,6 +354,9 @@ def run(args, log): log.info("RESULTS:") log.info("Main report is saved to " + os.path.join(output_dir, 'report.csv'), indent=1) log.info("Detailed reports are saved to " + output_dir, indent=1) + + postprocessing.postproccesing(output_dir, origin_file) + log.finish() diff --git a/src/nerpa_pipeline/postprocessing.py b/src/nerpa_pipeline/postprocessing.py new file mode 100644 index 0000000..16a0d78 --- /dev/null +++ b/src/nerpa_pipeline/postprocessing.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +import csv +import os + +def originPathInReport(output_dir, origin_file): + lines = [] + with open(os.path.join(output_dir, "report.csv"), "r") as f: + csv_reader = csv.reader(f) + for row in csv_reader: + lines.append(row) + + pos_id = lines[0].index("PredictionFileName") + lines[0] = lines[0][:pos_id + 1] + ["ContigId", "PartId"] + lines[0][pos_id + 1:] + for i in range(1, len(lines)): + curPath = lines[i][pos_id] + orgPath = origin_file[curPath] + ctgid = curPath.split('/')[-1].split('_')[-4] + partid = curPath.split('/')[-1].split('_')[-1] + + lines[i] = lines[i][:pos_id] + [orgPath, ctgid, partid] + lines[i][pos_id + 1:] + + with open(os.path.join(output_dir, "report.csv"), "w") as fw: + writer = csv.writer(fw, delimiter=',') + for line in lines: + writer.writerow(line) + + +def originPathInDetailReport(output_dir, origin_file): + for filename in os.listdir(os.path.join(output_dir, "details")): + if filename.endswith('.match'): + lines = [] + with open(os.path.join(output_dir, "details", filename)) as f: + for line in f: + if line[:-1] in origin_file: + orgPath = origin_file[line[:-1]] + ctgid = line.split('/')[-1].split('_')[-4] + partid = line.split('/')[-1].split('_')[-1] + lines.append(orgPath + "\t" + ctgid + "\t" + partid) + else: + 
lines.append(line) + + with open(os.path.join(output_dir, "details", filename), "w") as fw: + for line in lines: + fw.write(line) + + +def postproccesing(output_dir, origin_file): + originPathInReport(output_dir, origin_file) + originPathInDetailReport(output_dir, origin_file) \ No newline at end of file diff --git a/src/nerpa_pipeline/predictions_preprocessor.py b/src/nerpa_pipeline/predictions_preprocessor.py index 90fdf5e..f93c68e 100644 --- a/src/nerpa_pipeline/predictions_preprocessor.py +++ b/src/nerpa_pipeline/predictions_preprocessor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os import shutil @@ -40,7 +40,7 @@ def gen_prediction_dict(orf_part, input_file_name, dirname): def gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix, - current_part, predictions_info_list, base_antismashout_name): + current_part, predictions_info_list, base_antismashout_name, origin_file): prediction_dict = gen_prediction_dict(orf_part, input_file_name, base_antismashout_name) output_str = "" @@ -53,17 +53,18 @@ def gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix, with open(output_file, 'w') as wf: wf.write(output_str) current_part += 1 + origin_file[output_file] = origin_file['/'.join(input_file_name.split('/')[:-2])] predictions_info_list.append(output_file) return current_part def gen_predictions(bgc_orfs_parts, input_file_name, output_prefix, current_part, - predictions_info_list, base_antismashout_name): + predictions_info_list, base_antismashout_name, origin_file): for orf_part in bgc_orfs_parts: - current_part = gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix, current_part, predictions_info_list, base_antismashout_name) + current_part = gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix, current_part, predictions_info_list, base_antismashout_name, origin_file) return current_part -def create_predictions_by_antiSAMSHout(antismashouts, 
outdir, log): +def create_predictions_by_antiSAMSHout(antismashouts, outdir, log, origin_file): log.info("Start create predictions by antiSMASH") dir_for_predictions = os.path.join(outdir, "predictions") @@ -111,7 +112,7 @@ def create_predictions_by_antiSAMSHout(antismashouts, outdir, log): #shutil.copyfile(os.path.join(nrpspred_dir, filename), os.path.join(dir_for_predictions, base_antismashout_name + "_" + base_pred_name)) gen_predictions(parts, os.path.join(nrpspred_dir, filename), os.path.join(dir_for_predictions, base_antismashout_name + "_" + base_pred_name)[:-4], - 0, predictions_info_list, dirname) + 0, predictions_info_list, dirname, origin_file) f = open(predictions_info_file, 'w') for line in predictions_info_list: