Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added option to NuQCJob to annotate filtered fastq. #155

Merged
merged 4 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions sequence_processing_pipeline/NuQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
wall_time_limit, jmem, fastp_path, minimap2_path,
samtools_path, modules_to_load, qiita_job_id,
max_array_length, known_adapters_path, movi_path, gres_value,
pmls_path, bucket_size=8, length_limit=100, cores_per_task=4):
pmls_path, bucket_size=8, length_limit=100, cores_per_task=4,
additional_fastq_tags=None):
charles-cowart marked this conversation as resolved.
Show resolved Hide resolved
"""
Submit a slurm job where the contents of fastq_root_dir are processed
using fastp, minimap2, and samtools. Human-genome sequences will be
Expand All @@ -69,6 +70,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
:param bucket_size: the size in GB of each bucket to process
:param length_limit: reads shorter than this will be discarded.
:param cores_per_task: Number of CPU cores per node to request.
:param additional_fastq_tags: An optional list of fastq tags to
preserve during filtering.
"""
super().__init__(fastq_root_dir,
output_path,
Expand Down Expand Up @@ -96,6 +99,12 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
self.movi_path = movi_path
self.gres_value = gres_value
self.pmls_path = pmls_path
self.additional_fastq_tags = additional_fastq_tags

if self.additional_fastq_tags is not None:
if not isinstance(self.additional_fastq_tags, list):
raise ValueError("'additional_fastq_tags' must be of type "
"list or None")

# for projects that use sequence_processing_pipeline as a dependency,
# jinja_env must be set to sequence_processing_pipeline's root path,
Expand Down Expand Up @@ -401,6 +410,14 @@ def _generate_mmi_filter_cmds(self, working_dir):

cores_to_allocate = int(self.cores_per_task / 2)

tags = ""
if self.additional_fastq_tags is not None:
if len(self.additional_fastq_tags) > 0:
# Pass along the tags for any known metadata types that the fastq
# files may have been annotated with, using samtools' -T option.
# Samtools will safely ignore tags that are not present.
tags = " -T %s" % ','.join(self.additional_fastq_tags)

for count, mmi_db_path in enumerate(self.mmi_file_paths):
if count == 0:
# prime initial state with unfiltered file and create first of
Expand All @@ -416,9 +433,10 @@ def _generate_mmi_filter_cmds(self, working_dir):
input = tmp_file1
output = tmp_file2

cmds.append(f"minimap2 -2 -ax sr -t {cores_to_allocate} "
cmds.append(f"minimap2 -2 -ax sr -y -t {cores_to_allocate} "
f"{mmi_db_path} {input} -a | samtools fastq -@ "
f"{cores_to_allocate} -f 12 -F 256 > {output}")
f"{cores_to_allocate} -f 12 -F 256{tags} > "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f"{cores_to_allocate} -f 12 -F 256{tags} > "
f"{cores_to_allocate} -f 12 -F 256 {tags} > "

f"{output}")

# rename the latest tmp file to the final output filename.
cmds.append(f"mv {output} {final_output}")
Expand Down
38 changes: 38 additions & 0 deletions sequence_processing_pipeline/tests/data/seqs.interleaved.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/1 BX:Z:TAGACACGAAGGTTGGAC
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/2 BX:Z:TAGACACGAAGGTTGGAC
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FF,FFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/1 BX:Z:AAAGATGAGGGCAGTTAA
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/2 BX:Z:AAAGATGAGGGCAGTTAA
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/1 BX:Z:TGGGGGTCGTAACACGAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/2 BX:Z:TGGGGGTCGTAACACGAA
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFF::FFFFF,FFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/1 BX:Z:CGAGGCAGACTTGAATGC
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFF:FFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/2 BX:Z:CGAGGCAGACTTGAATGC
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFF:FFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/1 BX:Z:CAGACACGTAGGTGGGAC
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/2 BX:Z:CAGACACGTAGGTGGGAC
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/2
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FF,FFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/1
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/2
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/2
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFF::FFFFF,FFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/1
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFF:FFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/2
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFF:FFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/2
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
50 changes: 44 additions & 6 deletions sequence_processing_pipeline/tests/test_NuQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -2093,11 +2093,11 @@ def test_generate_mmi_filter_cmds(self):
obs = job._generate_mmi_filter_cmds("/my_work_dir")

exp = [
"minimap2 -2 -ax sr -t 2 db_path/mmi_1.db /my_work_dir/seqs."
"minimap2 -2 -ax sr -y -t 2 db_path/mmi_1.db /my_work_dir/seqs."
"interleaved.fastq -a | samtools fastq -@ 2 -f 12 -F 256 > "
"/my_work_dir/foo",
"minimap2 -2 -ax sr -t 2 db_path/mmi_2.db /my_work_dir/foo -a | "
"samtools fastq -@ 2 -f 12 -F 256 > /my_work_dir/bar",
"minimap2 -2 -ax sr -y -t 2 db_path/mmi_2.db /my_work_dir/foo -a"
" | samtools fastq -@ 2 -f 12 -F 256 > /my_work_dir/bar",
"mv /my_work_dir/bar /my_work_dir/seqs.interleaved.filter_"
"alignment.fastq",
"[ -e /my_work_dir/foo ] && rm /my_work_dir/foo",
Expand All @@ -2106,9 +2106,47 @@ def test_generate_mmi_filter_cmds(self):

exp = "\n".join(exp)

print(obs)
print("###")
print(exp)
self.assertEqual(obs, exp)

def test_generate_mmi_filter_cmds_w_annotate_fastq(self):
    """Check the generated filter commands when fastq tags are requested.

    A NuQCJob is created with two database paths and
    additional_fastq_tags=['BX']; the string returned by
    _generate_mmi_filter_cmds() is then compared against the expected
    commands, in which every 'samtools fastq' call carries '-T BX'.
    """
    mmi_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"]

    nuqc_job = NuQCJob(
        self.fastq_root_path,
        self.output_path,
        self.good_sample_sheet_path,
        mmi_db_paths,
        "queue_name",
        1,
        1440,
        "8",
        "fastp",
        "minimap2",
        "samtools",
        [],
        self.qiita_job_id,
        1000,
        "",
        self.movi_path,
        self.gres_value,
        self.pmls_path,
        additional_fastq_tags=['BX'],
    )

    observed = nuqc_job._generate_mmi_filter_cmds("/my_work_dir")

    # One filtering command per database, followed by the rename of the
    # last output to the final filename and removal of both temp files.
    expected = "\n".join([
        "minimap2 -2 -ax sr -y -t 2 db_path/mmi_1.db /my_work_dir/seqs."
        "interleaved.fastq -a | samtools fastq -@ 2 -f 12 -F 256 -T BX > "
        "/my_work_dir/foo",
        "minimap2 -2 -ax sr -y -t 2 db_path/mmi_2.db /my_work_dir/foo -a"
        " | samtools fastq -@ 2 -f 12 -F 256 -T BX > /my_work_dir/bar",
        "mv /my_work_dir/bar /my_work_dir/seqs.interleaved.filter_"
        "alignment.fastq",
        "[ -e /my_work_dir/foo ] && rm /my_work_dir/foo",
        "[ -e /my_work_dir/bar ] && rm /my_work_dir/bar",
    ])

    self.assertEqual(observed, expected)

Expand Down
2 changes: 1 addition & 1 deletion sequence_processing_pipeline/tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from tempfile import TemporaryDirectory
import gzip
import os
from os.path import join
from sequence_processing_pipeline.Commands import (split_similar_size_bins,
demux)
import io
from os.path import join


class CommandTests(unittest.TestCase):
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@
],
entry_points={
'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli'
':demux', ], })
':demux'],
})
Loading