Revamp of metagenomics rules
Add support for krakenuniq and kaiju, replacing kraken and diamond in the
default pipelines.

Both rely on customized krakenuniq/kaiju packages for added functionality,
such as loading the database from a pipe (krakenuniq) and printing out all
taxon hits (kaiju).

Krona is fixed to use the hit counts in report files directly as scores,
instead of re-counting the individual read classifications (which takes a
long time).
yesimon committed Mar 29, 2019
1 parent c2e518a commit ddff459
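For illustration, the report-driven Krona call, as it now appears in the task
diffs below (the sample basename is a placeholder):

    # Build a Krona chart directly from a krakenuniq summary report, using
    # its hit counts as scores; no per-read tallying. 'taxonomy' is the
    # extracted Krona taxonomy directory; 'sample1' is a hypothetical name.
    metagenomics.py krona \
        sample1.krakenuniq-summary_report.txt \
        taxonomy \
        sample1.krona.html \
        --noRank --noHits --inputType krakenuniq \
        --loglevel=DEBUG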
Showing 33 changed files with 1,116 additions and 1,301 deletions.
507 changes: 153 additions & 354 deletions metagenomics.py

Large diffs are not rendered by default.
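The "database from pipe" functionality mentioned in the commit message lives
in this collapsed metagenomics.py diff. A minimal sketch of the general
technique, with placeholder file names (the customized krakenuniq package may
do this differently):

    # Stream an lz4-compressed database archive through a named pipe so a
    # pipe-aware consumer can start reading before decompression finishes.
    DB_DIR=$(mktemp -d)
    DB_PIPE=$DB_DIR/db_pipe
    mkfifo "$DB_PIPE"
    lz4 -d -c krakenuniq_db.tar.lz4 > "$DB_PIPE" &  # decompress in background
    tar -x -C "$DB_DIR" -f "$DB_PIPE"               # consumer reads the stream
    wait                                            # reap the background lz4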

8 changes: 8 additions & 0 deletions pipes/WDL/dx-defaults-classify_kaiju.json
@@ -0,0 +1,8 @@
{
"classify_kaiju.kaiju.kaiju_db_lz4":
"dx://file-FVYQqP006zFF064QBGf022X1",
"classify_kaiju.kaiju.krona_taxonomy_db_tgz":
"dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb",
"classify_kaiju.kaiju.ncbi_taxonomy_db_tgz":
"dx://file-F8KgJK009y3Qgy3FF1791Vgq"
}
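Once the workflow is built on DNAnexus, these defaults could be supplied at
launch; a hypothetical invocation (the workflow name and flag usage are
assumptions, not part of this commit):

    # Launch classify_kaiju with the default inputs above.
    dx run classify_kaiju -f pipes/WDL/dx-defaults-classify_kaiju.json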
6 changes: 0 additions & 6 deletions pipes/WDL/dx-defaults-classify_kraken.json

This file was deleted.

6 changes: 6 additions & 0 deletions pipes/WDL/dx-defaults-classify_krakenuniq.json
@@ -0,0 +1,6 @@
{
"classify_krakenuniq.krakenuniq.krakenuniq_db_tar_lz4":
"dx://file-FVYQqP006zFF064QBGf022X1",
"classify_krakenuniq.krakenuniq.krona_taxonomy_db_tgz":
"dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
}
6 changes: 3 additions & 3 deletions pipes/WDL/dx-defaults-demux_plus.json
@@ -15,8 +15,8 @@
   "demux_plus.trim_clip_db":
   "dx://file-BXF0vYQ0QyBF509G9J12g927",

-  "demux_plus.kraken.kraken_db_tar_lz4":
-  "dx://file-F8QF3bj0xf5gZv76BG4QgpF4",
-  "demux_plus.kraken.krona_taxonomy_db_tgz":
+  "demux_plus.krakenuniq.krakenuniq_db_tar_lz4":
+  "dx://file-FVYQqP006zFF064QBGf022X1",
+  "demux_plus.krakenuniq.krona_taxonomy_db_tgz":
   "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
 }
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/classify_kaiju.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics
workflow classify_kaiju {
call metagenomics.kaiju
}
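A minimal local smoke test of the new one-task workflow, assuming Cromwell is
available (the inputs file name is a placeholder):

    # Run the workflow locally with Cromwell.
    java -jar cromwell.jar run pipes/WDL/workflows/classify_kaiju.wdl \
        --inputs classify_kaiju_inputs.json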
5 changes: 0 additions & 5 deletions pipes/WDL/workflows/classify_kraken.wdl

This file was deleted.

5 changes: 5 additions & 0 deletions pipes/WDL/workflows/classify_krakenuniq.wdl
@@ -0,0 +1,5 @@
import "tasks_metagenomics.wdl" as metagenomics
workflow classify_krakenuniq {
call metagenomics.krakenuniq
}
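Likewise, the workflow can be statically validated before building, e.g. with
womtool (an assumption, not part of this commit; the tasks_metagenomics.wdl
import may need to be staged alongside the file):

    # Check the WDL syntax and its import.
    java -jar womtool.jar validate pipes/WDL/workflows/classify_krakenuniq.wdl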
14 changes: 5 additions & 9 deletions pipes/WDL/workflows/demux_metag.wdl
@@ -25,18 +25,14 @@ workflow demux_metag {
       assembler = "spades",
       reads_unmapped_bam = deplete.cleaned_bam
   }
-  call metagenomics.diamond_contigs as diamond {
-    input:
-      contigs_fasta = spades.contigs_fasta,
-      reads_unmapped_bam = deplete.cleaned_bam,
-      krona_taxonomy_db_tar_lz4 = krona_taxonomy_db_tgz
-  }
 }

-call metagenomics.kraken as kraken {
+call metagenomics.krakenuniq as kraken {
   input:
     reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
 }
+call metagenomics.kaiju as kaiju {
+  input:
+    reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
+    krona_taxonomy_db_tgz = krona_taxonomy_db_tgz
+}

}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/demux_plus.wdl
@@ -35,7 +35,7 @@ workflow demux_plus {
   }
 }

-call metagenomics.kraken as kraken {
+call metagenomics.krakenuniq as krakenuniq {
   input:
     reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams
 }
124 changes: 45 additions & 79 deletions pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
@@ -1,17 +1,12 @@
-# TO DO:
-# kraken_build (input & output tarballs)
-# diamond, bwa, etc
-task kraken {
+task krakenuniq {
   Array[File] reads_unmapped_bam
-  File kraken_db_tar_lz4
-  File krona_taxonomy_db_tgz
+  File krakenuniq_db_tar_lz4   # krakenuniq/{database.kdb,taxonomy}
+  File krona_taxonomy_db_tgz   # taxonomy/taxonomy.tab

   parameter_meta {
-    kraken_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type
+    krakenuniq_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type
     krona_taxonomy_db_tgz : "stream" # for DNAnexus, until WDL implements the File| type
-    #reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type
+    reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type
   }

command {
@@ -24,7 +19,7 @@ task kraken

     # decompress DB to $DB_DIR
     read_utils.py extract_tarball \
-      ${kraken_db_tar_lz4} $DB_DIR \
+      ${krakenuniq_db_tar_lz4} $DB_DIR \
       --loglevel=DEBUG
     read_utils.py extract_tarball \
       ${krona_taxonomy_db_tgz} . \
@@ -33,17 +28,17 @@

     # prep input and output file names
     OUT_READS=fnames_outreads.txt
     OUT_REPORTS=fnames_outreports.txt
-    OUT_BASENAME=basenames_reads.txt
+    OUT_BASENAME=basenames_reports.txt
     for bam in ${sep=' ' reads_unmapped_bam}; do
-      echo "$(basename $bam .bam).kraken-reads" >> $OUT_BASENAME
-      echo "$(basename $bam .bam).kraken-reads.txt.gz" >> $OUT_READS
-      echo "$(basename $bam .bam).kraken-summary_report.txt" >> $OUT_REPORTS
+      echo "$(basename $bam .bam).krakenuniq-reads.txt.gz" >> $OUT_READS
+      echo "$(basename $bam .bam).krakenuniq" >> $OUT_BASENAME
+      echo "$(basename $bam .bam).krakenuniq-summary_report.txt" >> $OUT_REPORTS
     done

     # execute on all inputs and outputs serially, but with a single
     # database load into ram
-    metagenomics.py kraken \
-      $DB_DIR \
+    metagenomics.py krakenuniq \
+      $DB_DIR/krakenuniq \
       ${sep=' ' reads_unmapped_bam} \
       --outReads `cat $OUT_READS` \
       --outReport `cat $OUT_REPORTS` \
@@ -54,21 +49,18 @@

     # run single-threaded krona on up to nproc samples at once
     parallel -I ,, \
       "metagenomics.py krona \
-        ,,.txt.gz \
+        ,,-summary_report.txt \
         taxonomy \
-        ,,.html \
-        --noRank --noHits \
+        ,,.krona.html \
+        --noRank --noHits --inputType krakenuniq \
         --loglevel=DEBUG" \
       ::: `cat $OUT_BASENAME`
-    # run single-threaded gzip on up to nproc samples at once
-    parallel -I ,, "tar czf ,,.krona.tar.gz ,,.html*" ::: `cat $OUT_BASENAME`
   }

   output {
-    Array[File] kraken_classified_reads = glob("*.kraken-reads.txt.gz")
-    Array[File] kraken_summary_report = glob("*.kraken-summary_report.txt")
-    Array[File] krona_report_html = glob("*.kraken-reads.html")
-    Array[File] krona_report_tgz = glob("*.kraken-reads.krona.tar.gz")
+    Array[File] krakenuniq_classified_reads = glob("*.krakenuniq-reads.txt.gz")
+    Array[File] krakenuniq_summary_report = glob("*.krakenuniq-summary_report.txt")
+    Array[File] krona_report_html = glob("*.krakenuniq.krona.html")
     String viralngs_version = "viral-ngs_version_unknown"
   }

@@ -105,13 +97,10 @@ task krona
         ${input_basename}.html \
         --noRank --noHits \
         --loglevel=DEBUG
-
-    tar czf ${input_basename}.krona.tar.gz ${input_basename}.html*
   }

   output {
-    File krona_report_html = "${input_basename}.html"
-    File krona_report_tgz = "${input_basename}.krona.tar.gz"
+    File krona_report_html = "${input_basename}.html"
     String viralngs_version = "viral-ngs_version_unknown"
   }

@@ -182,19 +171,18 @@ task filter_bam_to_taxa

 }

-task diamond_contigs {
-  File contigs_fasta
+task kaiju {
   File reads_unmapped_bam
-  File diamond_db_lz4
-  File diamond_taxonomy_db_tar_lz4
-  File krona_taxonomy_db_tar_lz4
+  File kaiju_db_lz4            # <something>.fmi
+  File ncbi_taxonomy_db_tgz    # taxonomy/{nodes.dmp, names.dmp}
+  File krona_taxonomy_db_tgz   # taxonomy/taxonomy.tab

-  String contigs_basename = basename(contigs_fasta, ".fasta")
+  String input_basename = basename(reads_unmapped_bam, ".bam")

   parameter_meta {
-    diamond_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
-    diamond_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
-    krona_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
+    kaiju_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
+    ncbi_taxonomy_db_tgz : "stream"
+    krona_taxonomy_db_tgz : "stream"
   }

command {
@@ -203,61 +191,41 @@
     if [ -d /mnt/tmp ]; then
       TMPDIR=/mnt/tmp
     fi
-    DIAMOND_TAXDB_DIR=$(mktemp -d)
+    DB_DIR=$(mktemp -d)

-    # find 90% memory
-    mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+    lz4 -d ${kaiju_db_lz4} $DB_DIR/kaiju.fmi

-    # decompress DBs to /mnt/db
-    cat ${diamond_db_lz4} | lz4 -d > $TMPDIR/diamond_db.dmnd &
     read_utils.py extract_tarball \
-      ${diamond_taxonomy_db_tar_lz4} $DIAMOND_TAXDB_DIR \
-      --loglevel=DEBUG &
-    wait
-    read_utils.py extract_tarball \
-      ${krona_taxonomy_db_tar_lz4} . \
-      --loglevel=DEBUG & # we don't need this until later
+      ${ncbi_taxonomy_db_tgz} $DB_DIR \
+      --loglevel=DEBUG

-    # classify contigs
-    metagenomics.py diamond_fasta \
-      ${contigs_fasta} \
-      $TMPDIR/diamond_db.dmnd \
-      $DIAMOND_TAXDB_DIR/taxonomy/ \
-      ${contigs_basename}.diamond.fasta \
-      --memLimitGb $mem_in_gb \
+    read_utils.py extract_tarball \
+      ${krona_taxonomy_db_tgz} . \
       --loglevel=DEBUG

-    # map reads to contigs & create kraken-like read report
-    bwa index ${contigs_basename}.diamond.fasta
-    metagenomics.py align_rna \
+    # classify contigs
+    metagenomics.py kaiju \
       ${reads_unmapped_bam} \
-      ${contigs_basename}.diamond.fasta \
-      $DIAMOND_TAXDB_DIR/taxonomy/ \
-      ${contigs_basename}.diamond.summary_report.txt \
-      --outReads ${contigs_basename}.diamond.reads.txt.gz \
-      --dupeReads ${contigs_basename}.diamond.reads_w_dupes.txt.gz \
-      --outBam ${contigs_basename}.diamond.bam \
+      $DB_DIR/kaiju.fmi \
+      $DB_DIR/taxonomy \
+      ${input_basename}.kaiju.report.txt \
+      --outReads ${input_basename}.kaiju.reads.gz \
       --loglevel=DEBUG

     # run krona
-    wait # for krona_taxonomy_db_tgz to download and extract
     metagenomics.py krona \
-      ${contigs_basename}.diamond.reads.txt.gz \
+      ${input_basename}.kaiju.report.txt \
       taxonomy \
-      ${contigs_basename}.diamond.html \
+      ${input_basename}.kaiju.html \
+      --inputType kaiju \
       --noRank --noHits \
       --loglevel=DEBUG
-    tar czf ${contigs_basename}.diamond.krona.tar.gz ${contigs_basename}.diamond.html*
   }

   output {
-    File diamond_contigs = "${contigs_basename}.diamond.fasta"
-    File reads_mapped_to_contigs = "${contigs_basename}.diamond.bam"
-    File diamond_summary_report = "${contigs_basename}.diamond.summary_report.txt"
-    File diamond_classified_reads = "${contigs_basename}.diamond.reads.txt.gz"
-    File diamond_classified_reads_w_dupes = "${contigs_basename}.diamond.reads_w_dupes.txt.gz"
-    File krona_report_html = "${contigs_basename}.diamond.html"
-    File krona_report_tgz = "${contigs_basename}.diamond.krona.tar.gz"
+    File kaiju_report = "${input_basename}.kaiju.report.txt"
+    File kaiju_reads = "${input_basename}.kaiju.reads.gz"
+    File krona_report_html = "${input_basename}.kaiju.html"
     String viralngs_version = "viral-ngs_version_unknown"
   }

@@ -268,5 +236,3 @@ task diamond_contigs
     dx_instance_type: "mem3_ssd1_x16"
   }
}
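The krakenuniq task above fans Krona out across samples with GNU parallel's
-I replacement-string idiom; a standalone sketch with hypothetical sample
basenames:

    # One krona job per sample, up to nproc at once; parallel substitutes
    # each basename from the ::: list for ',,'.
    printf '%s\n' sampleA.krakenuniq sampleB.krakenuniq > basenames_reports.txt
    parallel -I ,, \
        "metagenomics.py krona \
            ,,-summary_report.txt \
            taxonomy \
            ,,.krona.html \
            --noRank --noHits --inputType krakenuniq" \
        ::: `cat basenames_reports.txt`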


23 changes: 5 additions & 18 deletions pipes/config.yaml
@@ -50,32 +50,24 @@ trim_clip_db: "s3://sabeti-public-dbs/trim_clip/contaminants.fasta"
 # Can be a remote path to a fasta file (s3://, gs://, etc.).
 spikeins_db: "s3://sabeti-public-dbs/spikeins/ercc_spike-ins.fasta"

-# Directory of kraken database for metagenomics identification
+# Directory of krakenuniq database for metagenomics identification
 # For pre-built hosted databases, you may download:
-# - https://storage.googleapis.com/sabeti-public/meta_dbs/kraken_viral_diversity.tar.gz
-# Can be a remote path dirname (s3://, gs://, etc.).
-kraken_db: "s3://sabeti-public-dbs/kraken"
+# Can be a remote path dirname (s3://, gs://, etc.).
+krakenuniq_db: "s3://sabeti-public-dbs/krakenuniq"

-# Path of DIAMOND database file for metagenomics identification
+# Path of Kaiju database file for metagenomics identification
 # For pre-built hosted databases, you may download:
-# - https://storage.googleapis.com/sabeti-public/meta_dbs/nr.dmnd.gz
 # Can be a remote path prefix (s3://, gs://, etc.).
-diamond_db: "s3://sabeti-public-dbs/diamond/nr-20170529"
+kaiju_db: "s3://sabeti-public-dbs/kaiju/nr/nr.fmi"

 # Directory of the krona database for generating html reports.
 # For pre-built hosted databases, you may download:
 # - https://storage.googleapis.com/sabeti-public/meta_dbs/krona_taxonomy_20160502.tar.gz
 # Can be a remote path dirname (s3://, gs://, etc.).
 krona_db: "s3://sabeti-public-dbs/krona"

-# BWA index prefix for metagenomic alignment.
-# This contains the full human reference genome, multiple diverse sequences covering human
-# pathogenic viruses, as well as LSU and SSU rRNA sequences from SILVA.
-# For pre-built hosted databases, you may download:
-# - https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4
-# Can be a remote path prefix (s3://, gs://, etc.).
-align_rna_db: "s3://sabeti-public-dbs/rna_bwa/human_viral_rrna"

# Directory of the NCBI taxonomy dmp files
# For pre-packaged hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/taxonomy.tar.lz4
@@ -226,11 +218,6 @@ vphaser_max_bins: 10
 # allele freq > 0.005. If false, no filtering is performed.
 vcf_merge_naive_filter: false

-# Strategy for kraken classification for multiple inputs. 'single' results in 1 kraken process for
-# each input bam, while 'multiple' coalesces multiple input classifications into a single kraken
-# run.
-kraken_execution: single
-

# Random seed, for non-deterministic steps. 0 means use current time.
random_seed: 0
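To stage the hosted databases referenced above for a local run, something
like the following would work (aws-cli usage is an assumption; the bucket
paths are the ones configured here):

    # Mirror the krakenuniq and krona database directories, and fetch the
    # single-file kaiju index.
    aws s3 sync s3://sabeti-public-dbs/krakenuniq db/krakenuniq
    aws s3 sync s3://sabeti-public-dbs/krona db/krona
    aws s3 cp s3://sabeti-public-dbs/kaiju/nr/nr.fmi db/kaiju/nr.fmi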

