Test data variola update kao #194

Merged 3 commits on Apr 9, 2024
3,094 changes: 3,094 additions & 0 deletions assets/sample_fastas/variola/NC_001611.fasta

Large diffs are not rendered by default.

3,106 changes: 3,106 additions & 0 deletions assets/sample_fastas/variola/PP405578.fasta

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions assets/sample_fastas/variola/VARV_RZ10_3587.fasta

This file was deleted.

2 changes: 0 additions & 2 deletions assets/sample_fastas/variola/VARV_RZ10_3587_2.fasta

This file was deleted.

Binary file modified assets/sample_metadata/VARV_metadata_Sample_Run_1.xlsx
Binary file not shown.
26 changes: 19 additions & 7 deletions bin/repeatmasker_liftoff.py
@@ -23,11 +23,22 @@ def get_args():
parser.add_argument("--refgff", type=str, help="Reference GFF to gather the ITR attributes and sample ID \n", required=True)
parser.add_argument("--fasta", type=str, help="FASTA file for sample \n", required=True)
parser.add_argument("--outdir", type=str, default=".", help="Output directory, default is current directory")
parser.add_argument("--sample_name", type=str, default=".", help="Sample name")

args = parser.parse_args()

return args

def count_rows_starting_with_comment(file_path):
count = 0
with open(file_path, 'r') as file:
for line in file:
if line.startswith('#'):
count += 1
else:
break # Stop counting once a line is encountered that doesn't start with '#'
return count

def annotation_main():
""" Main function for calling the annotation transfer pipeline
"""
@@ -44,9 +55,10 @@ def annotation_main():
headerList = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

#####GATHER REF INFO#####

#load in repeatmasker gff, skipping commented lines that don't belong in the dataframe
ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
#ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, skiprows=count_rows_starting_with_comment(args.refgff))

#gather ref sample id
ref_id=ref_gff['seq_id'][0]
#gather index of attributes for first and second ITRs; needed for repeatmasker ITR attributes
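
Worth noting on the comment='#' to skiprows switch above: pandas applies comment='#' to the remainder of any line, not just lines that start with '#', so a GFF row with a literal '#' inside its attributes column would be silently truncated mid-row; skiprows only drops the leading header lines. A minimal sketch of the difference (illustrative only, not part of this diff):

import io
import pandas as pd

# one header line plus one data row with a literal '#' inside the attributes column
gff = "##gff-version 3\nNC_001611\tRepeatMasker\trepeat_region\t1\t100\t.\t+\t.\tID=ITR#1\n"

# comment='#' truncates the data row at the embedded '#' -> attributes become 'ID=ITR'
truncated = pd.read_csv(io.StringIO(gff), delimiter='\t', header=None, comment='#')

# skiprows drops only the leading header line and keeps the row intact -> 'ID=ITR#1'
intact = pd.read_csv(io.StringIO(gff), delimiter='\t', header=None, skiprows=1)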
@@ -63,18 +75,18 @@
#samp_name=repMannotation_prep.sample_info()[0]
#repMannotation_prep.repM_prep_main()

LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, samp_name, args.outdir)
LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, args.sample_name, args.outdir)
#LOannotation_prep.LO_prep_main()
#repMannotation_prep.sample_info()
new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, samp_name, args.outdir)
new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, args.sample_name, args.outdir)

new_gff.concat_LO_RM()

#####CREATE TBL FILE#####
main_util=MainUtility()
main_util.gff2tbl(
samp_name=samp_name,
gff_loc=f"{args.outdir}/{samp_name}_reformatted.gff",
gff_loc=f"{args.outdir}/{args.sample_name}_reformatted.gff",
tbl_output=f"{args.outdir}/"
)

@@ -121,7 +133,7 @@ def sample_info(self):

def cleanup_repeat_masker_gff(self):
#load in repeatmasker gff, skipping the leading comment lines that don't belong in the dataframe
rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.repeatMGFF))
#correct repeat region labels; repeatmasker labels repeat regions as dispersed_repeat
rem_gff['type'] = rem_gff['type'].replace({'dispersed_repeat': 'repeat_region'}, regex=True)

@@ -213,7 +225,7 @@ def LO_prep_main(self):
fields_to_drop = ['coverage', 'sequence_ID', 'matches_ref_protein', 'valid_ORF', 'valid_ORFs', 'extra_copy_number',
'copy_num_ID', 'pseudogene', 'partial_mapping', 'low_identity']
#load in liftoff gff with same headers as Repeatmasker, skipping commented lines that don't belong in the dataframe
lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.liftoffGFF))

#run function to find and drop fields in attributes
lo_gff['attributes']=lo_gff['attributes'].apply(lambda row : self.fix_attributes(fields_to_drop, row))
8 changes: 5 additions & 3 deletions modules/local/concat_gffs/main.nf
@@ -21,13 +21,15 @@ process CONCAT_GFFS {

input:
path ref_gff_path
path repeatmasker_gff
path liftoff_gff
//path repeatmasker_gff
//path liftoff_gff
tuple val(meta), path(repeatmasker_gff), path(liftoff_gff)
tuple val(meta), path(fasta_path), path(fastq_1), path(fastq_2)

script:
"""
repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path
echo "repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path --sample_name $meta.id"
repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path --sample_name $meta.id
"""

output:
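
The input refactor above moves the two GFF paths into one tuple keyed by a meta map, so each sample's RepeatMasker and Liftoff files arrive together and the script can use $meta.id as the sample name. A minimal sketch of the pattern (hypothetical process and file names, not from this repo):

process EXAMPLE {
    input:
    tuple val(meta), path(gff_a), path(gff_b)

    script:
    """
    echo "sample: $meta.id -> $gff_a $gff_b"
    """
}

workflow {
    ch = Channel.of( [ [id:'NC_001611'], file('rm.gff'), file('lo.gff') ] )
    EXAMPLE(ch)
}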
4 changes: 2 additions & 2 deletions modules/local/liftoff_cli_annotation/main.nf
@@ -12,7 +12,7 @@ process LIFTOFF_CLI {
'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0' :
'quay.io/biocontainers/liftoff:1.6.3--pyhdfd78af_0'}"

publishDir "$params.output_dir/repeatmasker_liftoff_outputs", mode: "copy", overwrite: params.overwrite_output,
publishDir "$params.output_dir/liftoff", mode: "copy", overwrite: params.overwrite_output,
saveAs: { filename ->
filename.indexOf('.fasta') > 0 ? "fasta/${filename}":
filename.indexOf('.txt') > 0 ? "errors/${filename}":
@@ -26,7 +26,7 @@ process LIFTOFF_CLI {

script:
"""
liftoff -g $ref_gff_path -o ${fasta.baseName}_liftoff-orig.gff \
liftoff -g $ref_gff_path -o ${fasta.baseName}.liftoff-orig.gff \
-u $params.lift_unmapped_features_file_name \
-a $params.lift_coverage_threshold -s $params.lift_child_feature_align_threshold \
-d $params.lift_distance_scaling_factor -flank $params.lift_flank -p $params.lift_parallel_processes \
22 changes: 19 additions & 3 deletions subworkflows/local/repeatmasker_liftoff.nf
@@ -27,15 +27,31 @@ workflow REPEATMASKER_LIFTOFF {
params.ref_fasta_path,
params.ref_gff_path
)

repeatmasker_gff_ch = REPEATMASKER.out.gff.collect().flatten()
.map {
meta = [:]
meta['id'] = it.getSimpleName()
[ meta, it ]
}

liftoff_gff_ch = LIFTOFF_CLI.out.gff.collect().flatten()
.map {
meta = [:]
meta['id'] = it.getSimpleName()
[ meta, it ]
}

concat_gffs_ch = repeatmasker_gff_ch.join(liftoff_gff_ch) // meta, repeatmasker_gff, liftoff_gff

// concat gffs
CONCAT_GFFS (
params.ref_gff_path,
REPEATMASKER.out.gff,
LIFTOFF_CLI.out.gff,
concat_gffs_ch,
fasta
)

emit:
fasta = LIFTOFF_CLI.out.fasta
gff = CONCAT_GFFS.out.gff
}
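
join() pairs items on the first element of each tuple, which is why both channels above build their meta maps the same way from getSimpleName(). A toy example of the behaviour (illustrative values only):

workflow {
    rm = Channel.of( [ [id:'s1'], 'rm_s1.gff' ], [ [id:'s2'], 'rm_s2.gff' ] )
    lo = Channel.of( [ [id:'s1'], 'lo_s1.gff' ], [ [id:'s2'], 'lo_s2.gff' ] )

    // matching first elements (the meta maps) merge into one tuple per sample
    rm.join(lo).view()   // e.g. [[id:s1], rm_s1.gff, lo_s1.gff]
}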
3 changes: 0 additions & 3 deletions subworkflows/local/submission.nf
@@ -23,7 +23,6 @@ workflow INITIAL_SUBMISSION {
// submit the files to database of choice (after fixing config and getting wait time)
if ( params.genbank && params.sra ){ // genbank and sra
// submit the files to database of choice (after fixing config and getting wait time)
submission_ch.view()
SUBMISSION_FULL ( submission_ch, submission_config )

// actual process to initiate wait
@@ -39,7 +38,6 @@
.map {
it -> [it[0], it[1], it[3], it[4]]
}
submission_ch.view()
SUBMISSION_SRA ( submission_ch, submission_config )

// actual process to initiate wait
@@ -55,7 +53,6 @@
.map {
it -> [it[0], it[1], it[2], it[5]]
}
submission_ch.view()
SUBMISSION_GENBANK ( submission_ch, submission_config )

// actual process to initiate wait
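
The .map calls above slim submission_ch down to the fields each submission path needs (the debug .view() calls are simply removed). A generic sketch of that tuple projection (placeholder tuple contents, not the pipeline's real layout):

workflow {
    Channel.of( [ 'meta', 'config', 'fasta', 'fastq_1', 'fastq_2', 'gff' ] )
        .map { it -> [ it[0], it[1], it[3], it[4] ] }   // keep what SRA-only needs
        .view()                                         // [meta, config, fastq_1, fastq_2]
}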