Replace prediction path with origin path in reports #10

Open · wants to merge 1 commit into base: master
16 changes: 14 additions & 2 deletions nerpa.py
@@ -15,6 +15,7 @@
 site.addsitedir(nerpa_init.python_modules_dir)
 
 import predictions_preprocessor
+import postprocessing
 import nerpa_utils
 import handle_rban
 import logger
@@ -202,7 +203,7 @@ def copy_prediction_list(args, main_out_dir):
     return new_prediction_path
 
 
-def get_antismash_v3_compatible_input_paths(listing_fpath, list_of_paths, output_dir, log):
+def get_antismash_v3_compatible_input_paths(listing_fpath, list_of_paths, output_dir, log, origin_file):
     '''
     Parses all antiSMASH-related options,
     detects all relevant output dirs (either with .json [aS v.5] or with ./txt/ & ./nrpspks_predictions_txt [aS v.3],
@@ -270,7 +271,12 @@ def _is_antiSMASHv5_path(path):
         converted_antiSMASH_v5_paths = convert_antiSMASH_v5(antiSMASHv5_paths +
                                                             ['-o', converted_antiSMASH_v5_outputs_dir, '-m', 'hybrid'])
         antiSMASHv3_paths += converted_antiSMASH_v5_paths
+        for (covant5, origin5) in zip(converted_antiSMASH_v5_paths, antiSMASHv5_paths):
+            origin_file[covant5] = origin_file[origin5] if origin5 in origin_file else origin5
     log.info("\n======= Done with Preprocessing antiSMASH v5 inputs")
+    for path3 in antiSMASHv3_paths:
+        if path3 not in origin_file:
+            origin_file[path3] = path3
 
     return antiSMASHv3_paths
 
@@ -281,6 +287,8 @@ def run(args, log):
     log.set_up_file_handler(output_dir)
     log.start()
 
+    origin_file = {}
+
     if args.predictions is not None:
         path_predictions = copy_prediction_list(args, output_dir)
     else:
@@ -301,10 +309,11 @@ def run(args, log):
                        '--cpus', str(args.threads), args.seqs]
            nerpa_utils.sys_call(command, log, cwd=output_dir)
            antismash_out_dirs.append(cur_antismash_out)
+           origin_file[cur_antismash_out] = args.seqs
 
        path_predictions = predictions_preprocessor.create_predictions_by_antiSAMSHout(get_antismash_v3_compatible_input_paths(
            listing_fpath=args.antismash_out, list_of_paths=antismash_out_dirs,
-           output_dir=output_dir, log=log), output_dir, log)
+           output_dir=output_dir, log=log, origin_file=origin_file), output_dir, log, origin_file)
 
    input_configs_dir = args.configs_dir if args.configs_dir else nerpa_init.configs_dir
    current_configs_dir = os.path.join(output_dir, "configs")
@@ -345,6 +354,9 @@ def run(args, log):
     log.info("RESULTS:")
     log.info("Main report is saved to " + os.path.join(output_dir, 'report.csv'), indent=1)
     log.info("Detailed reports are saved to " + output_dir, indent=1)
+
+    postprocessing.postproccesing(output_dir, origin_file)
+
     log.finish()
 
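
The origin_file dictionary introduced above threads through the whole pipeline so every derived artifact can be traced back to the input the user supplied. A minimal sketch of the pattern outside the pipeline, with hypothetical paths and a hypothetical record_origin helper:

```python
# Sketch of the origin-tracking pattern used in nerpa.py; the paths and the
# record_origin helper are invented for illustration.
origin_file = {}

def record_origin(derived_path, source_path):
    # Resolve transitively: if the source is itself a derived artifact,
    # inherit its origin so every path maps back to the original input.
    origin_file[derived_path] = origin_file.get(source_path, source_path)

record_origin("out/converted/sample", "in/antismash_v5/sample")             # v5 -> v3 conversion
record_origin("out/predictions/sample_ctg1_part0", "out/converted/sample")  # prediction file
print(origin_file["out/predictions/sample_ctg1_part0"])                     # in/antismash_v5/sample
```

This mirrors the `origin_file[covant5] = origin_file[origin5] if origin5 in origin_file else origin5` line above: each new mapping resolves through any existing one, so chains never have to be walked at lookup time.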
49 changes: 49 additions & 0 deletions src/nerpa_pipeline/postprocessing.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import csv
+import os
+
+def originPathInReport(output_dir, origin_file):
+    lines = []
+    with open(os.path.join(output_dir, "report.csv"), "r") as f:
+        csv_reader = csv.reader(f)
+        for row in csv_reader:
+            lines.append(row)
+
+    pos_id = lines[0].index("PredictionFileName")
+    lines[0] = lines[0][:pos_id + 1] + ["ContigId", "PartId"] + lines[0][pos_id + 1:]
+    for i in range(1, len(lines)):
+        curPath = lines[i][pos_id]
+        orgPath = origin_file[curPath]
+        ctgid = curPath.split('/')[-1].split('_')[-4]
+        partid = curPath.split('/')[-1].split('_')[-1]
+
+        lines[i] = lines[i][:pos_id] + [orgPath, ctgid, partid] + lines[i][pos_id + 1:]
+
+    with open(os.path.join(output_dir, "report.csv"), "w", newline='') as fw:
+        writer = csv.writer(fw, delimiter=',')
+        for line in lines:
+            writer.writerow(line)
+
+
+def originPathInDetailReport(output_dir, origin_file):
+    for filename in os.listdir(os.path.join(output_dir, "details")):
+        if filename.endswith('.match'):
+            lines = []
+            with open(os.path.join(output_dir, "details", filename)) as f:
+                for line in f:
+                    if line.rstrip('\n') in origin_file:
+                        orgPath = origin_file[line.rstrip('\n')]
+                        ctgid = line.rstrip('\n').split('/')[-1].split('_')[-4]
+                        partid = line.rstrip('\n').split('/')[-1].split('_')[-1]
+                        lines.append(orgPath + "\t" + ctgid + "\t" + partid + "\n")
+                    else:
+                        lines.append(line)
+
+            with open(os.path.join(output_dir, "details", filename), "w") as fw:
+                for line in lines:
+                    fw.write(line)
+
+
+def postproccesing(output_dir, origin_file):
+    originPathInReport(output_dir, origin_file)
+    originPathInDetailReport(output_dir, origin_file)
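
To see the effect of originPathInReport on report.csv, here is a self-contained toy run of the same column surgery on an in-memory CSV; the header, score value, file-name layout, and origin path are all invented for illustration:

```python
import csv
import io

# Toy demonstration of the column insertion done by originPathInReport,
# run on an in-memory CSV instead of report.csv (all values invented).
src = "Score,PredictionFileName\n0.9,out/predictions/sample_ctg1_nrpspredictor2_codes_part0\n"
rows = list(csv.reader(io.StringIO(src)))

pos = rows[0].index("PredictionFileName")
rows[0] = rows[0][:pos + 1] + ["ContigId", "PartId"] + rows[0][pos + 1:]
for row in rows[1:]:
    name = row[pos].split('/')[-1]                     # sample_ctg1_nrpspredictor2_codes_part0
    ctgid, partid = name.split('_')[-4], name.split('_')[-1]
    row[pos:pos + 1] = ["input.fasta", ctgid, partid]  # origin path replaces the prediction path

out = io.StringIO()
csv.writer(out, lineterminator='\n').writerows(rows)
print(out.getvalue())
# Score,PredictionFileName,ContigId,PartId
# 0.9,input.fasta,ctg1,part0
```

Note that the `_`-splitting assumes prediction file names always end in `<contig>_<predictor>_<codes>_part<N>`; a name with a different layout would raise an IndexError or silently pick the wrong tokens.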
13 changes: 7 additions & 6 deletions src/nerpa_pipeline/predictions_preprocessor.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import sys
 import os
 import shutil
@@ -40,7 +40,7 @@ def gen_prediction_dict(orf_part, input_file_name, dirname):
 
 
 def gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix,
-                                     current_part, predictions_info_list, base_antismashout_name):
+                                     current_part, predictions_info_list, base_antismashout_name, origin_file):
     prediction_dict = gen_prediction_dict(orf_part, input_file_name, base_antismashout_name)
 
     output_str = ""
@@ -53,17 +53,18 @@ def gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix,
         with open(output_file, 'w') as wf:
             wf.write(output_str)
         current_part += 1
+        origin_file[output_file] = origin_file['/'.join(input_file_name.split('/')[:-2])]
         predictions_info_list.append(output_file)
 
     return current_part
 
 def gen_predictions(bgc_orfs_parts, input_file_name, output_prefix, current_part,
-                    predictions_info_list, base_antismashout_name):
+                    predictions_info_list, base_antismashout_name, origin_file):
     for orf_part in bgc_orfs_parts:
-        current_part = gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix, current_part, predictions_info_list, base_antismashout_name)
+        current_part = gen_prediction_for_one_orfs_part(orf_part, input_file_name, output_prefix, current_part, predictions_info_list, base_antismashout_name, origin_file)
     return current_part
 
-def create_predictions_by_antiSAMSHout(antismashouts, outdir, log):
+def create_predictions_by_antiSAMSHout(antismashouts, outdir, log, origin_file):
     log.info("Start create predictions by antiSMASH")
 
     dir_for_predictions = os.path.join(outdir, "predictions")
@@ -111,7 +112,7 @@ def create_predictions_by_antiSAMSHout(antismashouts, outdir, log):
             #shutil.copyfile(os.path.join(nrpspred_dir, filename), os.path.join(dir_for_predictions, base_antismashout_name + "_" + base_pred_name))
             gen_predictions(parts, os.path.join(nrpspred_dir, filename),
                             os.path.join(dir_for_predictions, base_antismashout_name + "_" + base_pred_name)[:-4],
-                            0, predictions_info_list, dirname)
+                            0, predictions_info_list, dirname, origin_file)
 
     f = open(predictions_info_file, 'w')
     for line in predictions_info_list:
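
The key used for the origin lookup above is the antiSMASH output directory, recovered by dropping the last two components (the predictions subdirectory and the file name) from the prediction input path. A short illustration with a made-up path:

```python
# How gen_prediction_for_one_orfs_part derives its origin_file lookup key
# (the path is invented; real ones come from the antiSMASH output dirs).
input_file_name = "out/antismash/sample/nrpspks_predictions_txt/ctg1_nrpspredictor2_codes.txt"
antismash_dir = '/'.join(input_file_name.split('/')[:-2])
print(antismash_dir)  # out/antismash/sample -- the key nerpa.py stored in origin_file
```

Splitting on '/' matches the rest of this code but assumes POSIX-style paths; applying os.path.dirname twice would be the portable equivalent.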