diff --git a/metagenomics.py b/metagenomics.py
index 9d77fe3d7..2356d078c 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -7,7 +7,6 @@
__author__ = "yesimon@broadinstitute.org"
import argparse
-import codecs
import collections
import csv
import gzip
@@ -21,7 +20,6 @@
import re
import shutil
import sys
-import subprocess
import tempfile
import json
@@ -33,8 +31,7 @@
import util.cmd
import util.file
import util.misc
-import tools.bwa
-import tools.diamond
+import tools.kaiju
import tools.kraken
import tools.krona
import tools.picard
@@ -246,7 +243,7 @@ def sam_lca(db, sam_file, output=None, top_percent=10, unique_only=True):
if mapped_segs:
tax_id = process_sam_hits(db, mapped_segs, top_percent)
if tax_id is None:
- log.warn('Query: {} has no valid taxonomy paths.'.format(query_name))
+ log.warning('Query: {} has no valid taxonomy paths.'.format(query_name))
if unique_only:
continue
else:
@@ -365,7 +362,7 @@ def coverage_lca(query_ids, parents, lca_percent=100):
while query_id != 1:
path.append(query_id)
if parents.get(query_id, 0) == 0:
- log.warn('Parent for query id: {} missing'.format(query_id))
+ log.warning('Parent for query id: {} missing'.format(query_id))
break
query_id = parents[query_id]
if query_id == 1:
@@ -595,7 +592,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade
with open_or_gzopen(input_path, 'rt') as f, \
open_or_gzopen(output_path, 'wt') as out_f:
if header:
- out_f.write(next(f))
+            out_f.write(f.readline())  # cannot use next(f) under Python 2: it enables read-ahead buffering on the file object
for line in f:
parts = line.split(sep)
taxid = int(parts[taxid_column])
@@ -729,33 +726,31 @@ def kraken_dfs(db, lines, taxa_hits, total_hits, taxid, level):
return cum_hits
-def parser_kraken(parser=argparse.ArgumentParser()):
+def parser_krakenuniq(parser=argparse.ArgumentParser()):
parser.add_argument('db', help='Kraken database directory.')
parser.add_argument('inBams', nargs='+', help='Input unaligned reads, BAM format.')
parser.add_argument('--outReports', nargs='+', help='Kraken summary report output file. Multiple filenames space separated.')
parser.add_argument('--outReads', nargs='+', help='Kraken per read classification output file. Multiple filenames space separated.')
- parser.add_argument('--lockMemory', action='store_true', default=False, help='Lock kraken database in RAM. Requires high ulimit -l.')
parser.add_argument(
'--filterThreshold', default=0.05, type=float, help='Kraken filter threshold (default %(default)s)'
)
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
- util.cmd.attach_main(parser, kraken, split_args=True)
+ util.cmd.attach_main(parser, krakenuniq, split_args=True)
return parser
-def kraken(db, inBams, outReports=None, outReads=None, lockMemory=False, filterThreshold=None, threads=None):
+def krakenuniq(db, inBams, outReports=None, outReads=None, filterThreshold=None, threads=None):
'''
- Classify reads by taxon using Kraken
+ Classify reads by taxon using KrakenUniq
'''
assert outReads or outReports, ('Either --outReads or --outReport must be specified.')
- kraken_tool = tools.kraken.Kraken()
- kraken_tool.pipeline(db, inBams, outReports=outReports, outReads=outReads, lockMemory=lockMemory,
- filterThreshold=filterThreshold, numThreads=threads)
-__commands__.append(('kraken', parser_kraken))
-
+ kuniq_tool = tools.kraken.KrakenUniq()
+ kuniq_tool.pipeline(db, inBams, out_reports=outReports, out_reads=outReads,
+ filter_threshold=filterThreshold, num_threads=threads)
+__commands__.append(('krakenuniq', parser_krakenuniq))
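+# Illustrative usage (hypothetical file names; krakenuniq_db is a KrakenUniq database directory):
+#   metagenomics.py krakenuniq /path/to/krakenuniq_db sample1.bam sample2.bam \
+#       --outReports sample1.krakenuniq-summary_report.txt sample2.krakenuniq-summary_report.txt \
+#       --outReads sample1.krakenuniq-reads.txt.gz sample2.krakenuniq-reads.txt.gz \
+#       --filterThreshold 0.05 --threads 8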
def parser_krona(parser=argparse.ArgumentParser()):
- parser.add_argument('inTsv', help='Input tab delimited file.')
+    parser.add_argument('inReport', help='Input classification report file (interpreted according to --inputType; tab-delimited by default).')
parser.add_argument('db', help='Krona taxonomy database directory.')
parser.add_argument('outHtml', help='Output html report.')
parser.add_argument('--queryColumn', help='Column of query id. (default %(default)s)', type=int, default=2)
@@ -764,263 +759,106 @@ def parser_krona(parser=argparse.ArgumentParser()):
parser.add_argument('--magnitudeColumn', help='Column of magnitude. (default %(default)s)', type=int, default=None)
parser.add_argument('--noHits', help='Include wedge for no hits.', action='store_true')
parser.add_argument('--noRank', help='Include no rank assignments.', action='store_true')
+    parser.add_argument('--inputType', help='Format of the input report; determines how it is parsed. (default %(default)s)', default='tsv', choices=['tsv', 'krakenuniq', 'kaiju'])
util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
util.cmd.attach_main(parser, krona, split_args=True)
return parser
-def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=None, magnitudeColumn=None, noHits=None, noRank=None):
+def krona(inReport, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=None, magnitudeColumn=None, noHits=None, noRank=None,
+ inputType=None):
'''
Create an interactive HTML report from a tabular metagenomic report
'''
krona_tool = tools.krona.Krona()
- if inTsv.endswith('.gz'):
- tmp_tsv = util.file.mkstempfname('.tsv')
- with gzip.open(inTsv, 'rb') as f_in:
- with open(tmp_tsv, 'wb') as f_out:
- shutil.copyfileobj(f_in, f_out)
- to_import = [tmp_tsv]
- else:
- to_import = [inTsv]
- root_name = os.path.basename(inTsv)
-
- krona_tool.import_taxonomy(
- db,
- to_import,
- outHtml,
- query_column=queryColumn,
- taxid_column=taxidColumn,
- score_column=scoreColumn,
- magnitude_column=magnitudeColumn,
- root_name=root_name,
- no_hits=noHits,
- no_rank=noRank
- )
-
- if inTsv.endswith('.gz'):
- # Cleanup tmp .tsv files
- for tmp_tsv in to_import:
- os.unlink(tmp_tsv)
-__commands__.append(('krona', parser_krona))
-
-def parser_diamond(parser=argparse.ArgumentParser()):
- parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
- parser.add_argument('db', help='Diamond database directory.')
- parser.add_argument('taxDb', help='Taxonomy database directory.')
- parser.add_argument('outReport', help='Output taxonomy report.')
- parser.add_argument('--outReads', help='Output LCA assignments for each read.')
- util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
- util.cmd.attach_main(parser, diamond, split_args=True)
- return parser
-def diamond(inBam, db, taxDb, outReport, outReads=None, threads=None):
- '''
- Classify reads by the taxon of the Lowest Common Ancestor (LCA)
- '''
- # do not convert this to samtools bam2fq unless we can figure out how to replicate
- # the clipping functionality of Picard SamToFastq
- picard = tools.picard.SamToFastqTool()
- with util.file.fifo(2) as (fastq_pipe, diamond_pipe):
- s2fq = picard.execute(
- inBam,
- fastq_pipe,
- interleave=True,
- illuminaClipping=True,
- JVMmemory=picard.jvmMemDefault,
- background=True,
+ if inputType == 'tsv':
+ root_name = os.path.basename(inReport)
+ if inReport.endswith('.gz'):
+ tmp_tsv = util.file.mkstempfname('.tsv')
+ with gzip.open(inReport, 'rb') as f_in:
+ with open(tmp_tsv, 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ to_import = [tmp_tsv]
+ else:
+ to_import = [inReport]
+
+ krona_tool.import_taxonomy(
+ db,
+ to_import,
+ outHtml,
+ query_column=queryColumn,
+ taxid_column=taxidColumn,
+ score_column=scoreColumn,
+ magnitude_column=magnitudeColumn,
+ root_name=root_name,
+ no_hits=noHits,
+ no_rank=noRank
)
- diamond_tool = tools.diamond.Diamond()
- taxonmap = join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz')
- taxonnodes = join(taxDb, 'nodes.dmp')
- rutils = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'read_utils.py')
- cmd = '{read_utils} join_paired_fastq --outFormat fasta /dev/stdout {fastq}'.format(
- read_utils=rutils, fastq=fastq_pipe)
-
- cmd += ' | {} blastx --out {output} --outfmt 102 --sallseqid'.format(
- diamond_tool.install_and_get_path(), output=diamond_pipe)
- cmd += ' --threads {threads}'.format(threads=util.misc.sanitize_thread_count(threads))
- cmd += ' --db {db} --taxonmap {taxonmap} --taxonnodes {taxonnodes}'.format(
- db=db,
- taxonmap=taxonmap,
- taxonnodes=taxonnodes)
-
- if outReads is not None:
- # Interstitial save of stdout to output file
- cmd += ' | tee >(pigz --best > {out})'.format(out=outReads)
-
- diamond_ps = subprocess.Popen(cmd, shell=True, executable='/bin/bash')
-
- tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)
-
- with open(diamond_pipe) as lca_p:
- hits = taxa_hits_from_tsv(lca_p)
- with open(outReport, 'w') as f:
- for line in kraken_dfs_report(tax_db, hits):
- print(line, file=f)
-
- s2fq.wait()
- assert s2fq.returncode == 0
- diamond_ps.wait()
- assert diamond_ps.returncode == 0
-__commands__.append(('diamond', parser_diamond))
-
-
-def parser_diamond_fasta(parser=argparse.ArgumentParser()):
- parser.add_argument('inFasta', help='Input sequences, FASTA format, optionally gzip compressed.')
- parser.add_argument('db', help='Diamond database file.')
- parser.add_argument('taxDb', help='Taxonomy database directory.')
- parser.add_argument('outFasta', help='Output sequences, same as inFasta, with taxid|###| prepended to each sequence identifier.')
- parser.add_argument('--memLimitGb', type=float, default=None, help='approximate memory usage in GB')
- util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
- util.cmd.attach_main(parser, diamond_fasta, split_args=True)
- return parser
-def diamond_fasta(inFasta, db, taxDb, outFasta, threads=None, memLimitGb=None):
- '''
- Classify fasta sequences by the taxon of the Lowest Common Ancestor (LCA)
- '''
-
- with util.file.tmp_dir() as tmp_dir:
- # run diamond blastx on fasta sequences
- cmd = [tools.diamond.Diamond().install_and_get_path(), 'blastx',
- '-q', inFasta,
- '--db', db,
- '--outfmt', '102', # tsv: query name, taxid of LCA, e-value
- '--salltitles',# to recover the entire fasta sequence name
- '--sensitive', # this is necessary for longer reads or contigs
- '--algo', '1', # for small query files
- '--threads', str(util.misc.sanitize_thread_count(threads)),
- '--taxonmap', os.path.join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz'),
- '--taxonnodes', os.path.join(taxDb, 'nodes.dmp'),
- '--tmpdir', tmp_dir,
- ]
- if memLimitGb:
- cmd.extend(['--block-size', str(round(memLimitGb / 5.0, 1))])
- log.debug(' '.join(cmd))
- diamond_p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-
- # read the output report and load into an in-memory map
- # of sequence ID -> tax ID (there shouldn't be that many sequences)
- seq_to_tax = {}
- for line in diamond_p.stdout:
- row = line.decode('UTF-8').rstrip('\n\r').split('\t')
- tax = row[1] if row[1] != '0' else '32644' # diamond returns 0 if unclassified, but the proper taxID for that is 32644
- seq_to_tax[row[0]] = tax
- if diamond_p.poll():
- raise subprocess.CalledProcessError(diamond_p.returncode, cmd)
-
- # copy inFasta to outFasta while prepending taxid|###| to all sequence names
- log.debug("transforming {} to {}".format(inFasta, outFasta))
- with util.file.open_or_gzopen(inFasta, 'rt') as inf:
- with util.file.open_or_gzopen(outFasta, 'wt') as outf:
- for seq in Bio.SeqIO.parse(inf, 'fasta'):
- taxid = seq_to_tax.get(seq.id, '32644') # default to "unclassified"
- for text_line in util.file.fastaMaker([(
- '|'.join('taxid', taxid, seq.id),
- str(seq.seq))]):
- outf.write(text_line)
-
-__commands__.append(('diamond_fasta', parser_diamond_fasta))
-
-
-def parser_build_diamond_db(parser=argparse.ArgumentParser()):
- parser.add_argument('protein_fastas', nargs='+', help='Input protein fasta files')
- parser.add_argument('db', help='Output Diamond database file')
- util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
- util.cmd.attach_main(parser, build_diamond_db, split_args=True)
- return parser
-def build_diamond_db(protein_fastas, db, threads=None):
- tool.diamond.Diamond().build(db, protein_fastas, options={'threads':str(util.misc.sanitize_thread_count(threads))})
-__commands__.append(('build_diamond_db', parser_build_diamond_db))
+ if inReport.endswith('.gz'):
+ # Cleanup tmp .tsv files
+ for tmp_tsv in to_import:
+ os.unlink(tmp_tsv)
+
+ elif inputType == 'krakenuniq':
+ krakenuniq = tools.kraken.KrakenUniq()
+ report = krakenuniq.read_report(inReport)
+ with util.file.tempfname() as fn:
+ with open(fn, 'w') as to_import:
+ for taxid, (tax_reads, cov) in report.items():
+ print('{}\t{}\t{}'.format(taxid, tax_reads, cov), file=to_import)
+ krona_tool.import_taxonomy(
+ db, [fn], outHtml,
+ taxid_column=1, magnitude_column=2,
+ score_column=3,
+ no_hits=True, no_rank=True
+ )
+ # Rename "Avg. score" to "Est. genome coverage"
+ html_lines = util.file.slurp_file(outHtml).split('\n')
+ with util.file.tempfname() as fn:
+ with open(fn, 'w') as new_report:
+ for line in html_lines:
+ if 'score' in line:
+ line = line.replace('Avg. score', 'Est. genome coverage')
+ print(line, file=new_report)
+ os.rename(fn, outHtml)
+ return
+ elif inputType == 'kaiju':
+ kaiju = tools.kaiju.Kaiju()
+ report = kaiju.read_report(inReport)
+ with util.file.tempfname() as fn:
+ with open(fn, 'w') as to_import:
+ for taxid, reads in report.items():
+ print('{}\t{}'.format(taxid, reads), file=to_import)
+ krona_tool.import_taxonomy(
+ db, [fn], outHtml,
+ taxid_column=1, magnitude_column=2,
+ no_hits=True, no_rank=True
+ )
+ return
+ else:
+ raise NotImplementedError
+__commands__.append(('krona', parser_krona))
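+# Illustrative usage for each --inputType (hypothetical paths and file names):
+#   metagenomics.py krona hits.tsv /path/to/krona_db report.html
+#   metagenomics.py krona sample.krakenuniq-summary_report.txt /path/to/krona_db \
+#       sample.krakenuniq.krona.html --noRank --noHits --inputType krakenuniq
+#   metagenomics.py krona sample.kaiju.report.txt /path/to/krona_db \
+#       sample.kaiju.html --noRank --noHits --inputType kaiju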
-def parser_align_rna_metagenomics(parser=argparse.ArgumentParser()):
+def parser_kaiju(parser=argparse.ArgumentParser()):
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
- parser.add_argument('db', help='Bwa index prefix.')
+ parser.add_argument('db', help='Kaiju database .fmi file.')
parser.add_argument('taxDb', help='Taxonomy database directory.')
parser.add_argument('outReport', help='Output taxonomy report.')
- parser.add_argument('--dupeReport', help='Generate report including duplicates.')
- parser.add_argument(
- '--sensitive',
- dest='sensitive',
- action="store_true",
- help='Use sensitive instead of default BWA mem options.'
- )
- parser.add_argument('--outBam', help='Output aligned, indexed BAM file. Default is to write to temp.')
parser.add_argument('--outReads', help='Output LCA assignments for each read.')
- parser.add_argument('--dupeReads', help='Output LCA assignments for each read including duplicates.')
- parser.add_argument(
- '--JVMmemory',
- default=tools.picard.PicardTools.jvmMemDefault,
- help='JVM virtual memory size (default: %(default)s)'
- )
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
- util.cmd.attach_main(parser, align_rna_metagenomics, split_args=True)
+ util.cmd.attach_main(parser, kaiju, split_args=True)
return parser
-def align_rna_metagenomics(
- inBam,
- db,
- taxDb,
- outReport,
- dupeReport=None,
- outBam=None,
- dupeReads=None,
- outReads=None,
- sensitive=None,
- JVMmemory=None,
- threads=None,
- picardOptions=None,
-):
+def kaiju(inBam, db, taxDb, outReport, outReads=None, threads=None):
'''
- Align to metagenomics bwa index, mark duplicates, and generate LCA report
+ Classify reads by the taxon of the Lowest Common Ancestor (LCA)
'''
- picardOptions = picardOptions if picardOptions else []
-
- bwa = tools.bwa.Bwa()
- samtools = tools.samtools.SamtoolsTool()
-
- bwa_opts = ['-a']
- if sensitive:
- bwa_opts += '-k 12 -A 1 -B 1 -O 1 -E 1'.split()
-
- # TODO: Use bwa.mem's min_score_to_filter argument to decrease false
- # positives in the output. Currently, it works by summing the alignment
- # score across all alignments output by bwa for each query (reads in a
- # pair, supplementary, and secondary alignments). This is not reasonable
- # for reads with secondary alignments because it will be easier for those
- # reads/queries to exceed the threshold given by the value of the argument.
- # In this context, bwa is called using '-a' as an option and its output
- # will likely include many secondary alignments. One option is to add
- # another argument to bwa.mem, similar to min_score_to_filter, that sets a
- # threshold on the alignment score of output alignments but only filters on
- # a per-alignment level (i.e., not by summing alignment scores across all
- # alignments for each query).
-
- aln_bam = util.file.mkstempfname('.bam')
- bwa.mem(inBam, db, aln_bam, options=bwa_opts)
-
- tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)
-
- if dupeReport:
- aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam')
- samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=threads)
- sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outReads=dupeReads, unique_only=False)
- os.unlink(aln_bam_sorted)
- aln_bam_deduped = outBam if outBam else util.file.mkstempfname('.align_deduped.bam')
- opts = list(picardOptions)
- dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
- pic = tools.picard.MarkDuplicatesTool()
- pic.execute([aln_bam], aln_bam_deduped, dupe_removal_out_metrics, picardOptions=opts, JVMmemory=JVMmemory)
-
-
- os.unlink(aln_bam)
- aln_bam_dd_sorted = util.file.mkstempfname('.bam')
- samtools.sort(aln_bam_deduped, aln_bam_dd_sorted, args=['-n'], threads=threads)
- sam_lca_report(tax_db, aln_bam_dd_sorted, outReport=outReport, outReads=outReads)
-
- if not outBam:
- os.unlink(aln_bam_deduped)
-__commands__.append(('align_rna', parser_align_rna_metagenomics))
+ kaiju_tool = tools.kaiju.Kaiju()
+ kaiju_tool.classify(db, taxDb, inBam, output_report=outReport, output_reads=outReads, num_threads=threads)
+__commands__.append(('kaiju', parser_kaiju))
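+# Illustrative usage (hypothetical paths; db is the Kaiju .fmi index and taxDb a directory
+# holding the NCBI nodes.dmp and names.dmp files):
+#   metagenomics.py kaiju sample.bam /path/to/kaiju/nr.fmi /path/to/taxonomy \
+#       sample.kaiju.report.txt --outReads sample.kaiju.reads.gz --threads 8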
def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=None):
@@ -1039,8 +877,6 @@ def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=No
print(line, file=f)
-
-
def parser_metagenomic_report_merge(parser=argparse.ArgumentParser()):
parser.add_argument(
"metagenomic_reports",
@@ -1115,10 +951,8 @@ def metagenomic_report_merge(metagenomic_reports, out_kraken_summary, kraken_db,
-def kraken_library_ids(library):
- '''Parse gi/accession from ids of fasta files in library directory. '''
- library_taxids = set()
- library_gis = set()
+def fasta_library_accessions(library):
+ '''Parse accession from ids of fasta files in library directory. '''
library_accessions = set()
for dirpath, dirnames, filenames in os.walk(library, followlinks=True):
for filename in filenames:
@@ -1127,32 +961,20 @@ def kraken_library_ids(library):
filepath = os.path.join(dirpath, filename)
for seqr in SeqIO.parse(filepath, 'fasta'):
name = seqr.name
- # Search for encoded taxid
- mo = re.search('kraken:taxid\|(\d+)\|', name)
- if mo:
- taxid = int(mo.group(1))
- library_taxids.add(taxid)
- continue
- # Search for gi
- mo = re.search('gi\|(\d+)\|', name)
- if mo:
- gi = int(mo.group(1))
- library_gis.add(gi)
- continue
# Search for accession
- mo = re.search('([A-Z]+\d+\.\d+)', name)
+ mo = re.search('([A-Z]+_?\d+\.\d+)', name)
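+                # e.g. matches RefSeq-style "NC_002549.1" as well as accessions without an underscore such as "AF086833.2"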
if mo:
accession = mo.group(1)
library_accessions.add(accession)
- return library_taxids, library_gis, library_accessions
+ return library_accessions
-class KrakenBuildError(Exception):
- '''Error while building kraken database.'''
+class KrakenUniqBuildError(Exception):
+ '''Error while building KrakenUniq database.'''
def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
parser.add_argument('in_bam', help='Input bam file.')
- parser.add_argument('read_IDs_to_tax_IDs', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
+ parser.add_argument('read_IDs_to_tax_IDs', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.')
parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified')
parser.add_argument('nodes_dmp', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
parser.add_argument('names_dmp', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/')
@@ -1168,27 +990,22 @@ def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()):
)
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True)
- return parser
-
-def filter_bam_to_taxa( in_bam,
- read_IDs_to_tax_IDs,
- out_bam,
- nodes_dmp,
- names_dmp,
- tax_names=None,
- tax_ids=None,
- omit_children=False,
- read_id_col=1,
- tax_id_col=2,
- JVMmemory=None ):
+ return parser
+
+def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
+ nodes_dmp, names_dmp,
+ tax_names=None, tax_ids=None,
+ omit_children=False,
+ read_id_col=1, tax_id_col=2,
+ JVMmemory=None):
"""
- Filter an (already classified) input bam file to only include reads that have been mapped to specified
- taxonomic IDs or scientific names. This requires a classification file, as produced
+ Filter an (already classified) input bam file to only include reads that have been mapped to specified
+ taxonomic IDs or scientific names. This requires a classification file, as produced
by tools such as Kraken, as well as the NCBI taxonomy database.
"""
tax_ids = set(tax_ids) if tax_ids else set()
tax_names = tax_names or []
- # use TaxonomyDb() class above and tree traversal/collection functions above
+ # use TaxonomyDb() class above and tree traversal/collection functions above
db = TaxonomyDb(nodes_path=nodes_dmp, names_path=names_dmp, load_nodes=True, load_names=True)
db.children = parents_to_children(db.parents)
@@ -1249,10 +1066,10 @@ def filter_bam_to_taxa( in_bam,
# if we found reads matching the taxNames requested,
if read_ids_written > 0:
# filter the input bam to include only these
- tools.picard.FilterSamReadsTool().execute(in_bam,
- False,
- temp_read_list,
- out_bam,
+ tools.picard.FilterSamReadsTool().execute(in_bam,
+ False,
+ temp_read_list,
+ out_bam,
JVMmemory=JVMmemory)
else:
# otherwise, "touch" the output bam to contain the
@@ -1267,7 +1084,7 @@ def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
parser.add_argument('--jsonOut', dest="json_out", type=argparse.FileType('w'), help='The path to a json file containing the relevant parsed summary data in json format.')
parser.add_argument('--csvOut', dest="csv_out", type=argparse.FileType('w'), help='The path to a csv file containing sample-specific counts.')
parser.add_argument('--taxHeading', nargs="+", dest="tax_headings", help='The taxonomic heading to analyze (default: %(default)s). More than one can be specified.', default="Viruses")
- parser.add_argument('--taxlevelFocus', dest="taxlevel_focus", help='The taxonomic heading to summarize (totals by Genus, etc.) (default: %(default)s).', default="species",
+ parser.add_argument('--taxlevelFocus', dest="taxlevel_focus", help='The taxonomic heading to summarize (totals by Genus, etc.) (default: %(default)s).', default="species",
choices=["species", "genus", "family", "order", "class", "phylum", "kingdom", "superkingdom"])
parser.add_argument('--topN', type=int, dest="top_n_entries", help='Only include the top N most abundant taxa by read count (default: %(default)s)', default=100)
parser.add_argument('--countThreshold', type=int, dest="count_threshold", help='Minimum number of reads to be included (default: %(default)s)', default=1)
@@ -1281,13 +1098,13 @@ def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
def taxlevel_summary(summary_files_in, json_out, csv_out, tax_headings, taxlevel_focus, top_n_entries, count_threshold, no_hist, zero_fill, include_root):
"""
Aggregates taxonomic abundance data from multiple Kraken-format summary files.
- It is intended to report information on a particular taxonomic level (--taxlevelFocus; ex. 'species'),
- within a higher-level grouping (--taxHeading; ex. 'Viruses'). By default, when --taxHeading
- is at the same level as --taxlevelFocus a summary with lines for each sample is emitted.
+ It is intended to report information on a particular taxonomic level (--taxlevelFocus; ex. 'species'),
+ within a higher-level grouping (--taxHeading; ex. 'Viruses'). By default, when --taxHeading
+ is at the same level as --taxlevelFocus a summary with lines for each sample is emitted.
Otherwise, a histogram is returned. If per-sample information is desired, --noHist can be specified.
In per-sample data, the suffix "-pt" indicates percentage, so a value of 0.02 is 0.0002 of the total number of reads for the sample.
- If --topN is specified, only the top N most abundant taxa are included in the histogram count or per-sample output.
- If a number is specified for --countThreshold, only taxa with that number of reads (or greater) are included.
+ If --topN is specified, only the top N most abundant taxa are included in the histogram count or per-sample output.
+ If a number is specified for --countThreshold, only taxa with that number of reads (or greater) are included.
Full data returned via --jsonOut (filtered by --topN and --countThreshold), whereas -csvOut returns a summary.
"""
@@ -1295,7 +1112,7 @@ def taxlevel_summary(summary_files_in, json_out, csv_out, tax_headings, taxlevel
same_level = False
Abundance = collections.namedtuple("Abundance", "percent,count")
-
+
def indent_len(in_string):
return len(in_string)-len(in_string.lstrip())
@@ -1315,11 +1132,11 @@ def indent_len(in_string):
csv.register_dialect('kraken_report', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
fieldnames = ["pct_of_reads","num_reads","reads_exc_children","rank","NCBI_tax_ID","sci_name"]
row = next(csv.DictReader([line.strip().rstrip('\n')], fieldnames=fieldnames, dialect="kraken_report"))
-
+
indent_of_line = indent_len(row["sci_name"])
# remove leading/trailing whitespace from each item
row = { k:v.strip() for k, v in row.items()}
-
+
# rows are formatted like so:
# 0.00 16 0 D 10239 Viruses
#
@@ -1330,7 +1147,7 @@ def indent_len(in_string):
# row["NCBI_tax_ID"] NCBI taxonomy ID
# row["sci_name"] indented scientific name
- # if the root-level bins (root, unclassified) should be included, do so, but bypass normal
+ # if the root-level bins (root, unclassified) should be included, do so, but bypass normal
# stateful parsing logic since root does not have a distinct rank level
if row["sci_name"].lower() in ["root","unclassified"] and include_root:
sample_root_summary[row["sci_name"]] = collections.OrderedDict()
@@ -1344,7 +1161,7 @@ def indent_len(in_string):
if indent_of_selection == -1:
if row["sci_name"].lower() in tax_headings_copy:
tax_headings_copy.remove(row["sci_name"].lower())
-
+
should_process = True
indent_of_selection = indent_of_line
currently_being_processed = row["sci_name"]
@@ -1353,15 +1170,15 @@ def indent_len(in_string):
same_level = True
if row["rank"] == "-":
log.warning("Non-taxonomic parent level selected")
-
+
if should_process:
# skip "-" rank levels since they do not occur at the sample level
# otherwise include the taxon row if the rank matches the desired level of focus
- if (row["rank"] != "-" and rank_code(taxlevel_focus) == row["rank"]):
+ if (row["rank"] != "-" and rank_code(taxlevel_focus) == row["rank"]):
if int(row["num_reads"])>=count_threshold:
sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]))
-
+
for k,taxa in sample_summary.items():
sample_summary[k] = collections.OrderedDict(sorted(taxa.items(), key=lambda item: (item[1][1]) , reverse=True)[:top_n_entries])
@@ -1370,8 +1187,8 @@ def indent_len(in_string):
"\"{name}\" with {reads} reads ({percent:.2%} of total); "
"included since >{threshold} read{plural}".format(
f=f,
- heading=k,
- level=taxlevel_focus,
+ heading=k,
+ level=taxlevel_focus,
name=list(sample_summary[k].items())[0][0],
reads=list(sample_summary[k].items())[0][1].count,
percent=list(sample_summary[k].items())[0][1].percent/100.0,
@@ -1417,12 +1234,12 @@ def indent_len(in_string):
for sample, taxa in samples.items():
sample_dict = {}
sample_dict["sample"] = sample
- for heading,taxon in taxa.items():
+ for heading,taxon in taxa.items():
for entry in taxon.keys():
sample_dict[entry+"-pt"] = taxon[entry].percent
sample_dict[entry+"-ct"] = taxon[entry].count
writer.writerow(sample_dict)
-
+
csv_out.close()
@@ -1455,29 +1272,28 @@ def indent_len(in_string):
__commands__.append(('taxlevel_summary', parser_kraken_taxlevel_summary))
-def parser_kraken_build(parser=argparse.ArgumentParser()):
- parser.add_argument('db', help='Kraken database output directory.')
+def parser_krakenuniq_build(parser=argparse.ArgumentParser()):
+    parser.add_argument('db', help='KrakenUniq database output directory.')
parser.add_argument('--library', help='Input library directory of fasta files. If not specified, it will be read from the "library" subdirectory of "db".')
parser.add_argument('--taxonomy', help='Taxonomy db directory. If not specified, it will be read from the "taxonomy" subdirectory of "db".')
parser.add_argument('--subsetTaxonomy', action='store_true', help='Subset taxonomy based on library fastas.')
- parser.add_argument('--minimizerLen', type=int, help='Minimizer length (kraken default: 15)')
- parser.add_argument('--kmerLen', type=int, help='k-mer length (kraken default: 31)')
+ parser.add_argument('--minimizerLen', type=int, help='Minimizer length (krakenuniq default: 15)')
+ parser.add_argument('--kmerLen', type=int, help='k-mer length (krakenuniq default: 31)')
parser.add_argument('--maxDbSize', type=int, help='Maximum db size in GB (will shrink if too big)')
parser.add_argument('--clean', action='store_true', help='Clean by deleting other database files after build')
parser.add_argument('--workOnDisk', action='store_true', help='Work on disk instead of RAM. This is generally much slower unless the "db" directory lives on a RAM disk.')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
- util.cmd.attach_main(parser, kraken_build, split_args=True)
+ util.cmd.attach_main(parser, krakenuniq_build, split_args=True)
return parser
-def kraken_build(db, library, taxonomy=None, subsetTaxonomy=None,
- threads=None, workOnDisk=False,
- minimizerLen=None, kmerLen=None, maxDbSize=None, clean=False):
+def krakenuniq_build(db, library, taxonomy=None, subsetTaxonomy=None,
+ threads=None, workOnDisk=False,
+ minimizerLen=None, kmerLen=None, maxDbSize=None, clean=False):
'''
- Builds a kraken database from library directory of fastas and taxonomy db
- directory. The --subsetTaxonomy option allows shrinking the taxonomy to only
- include taxids associated with the library folders. For this to work, the
- library fastas must have the standard id names such as `>NC1234.1`
- accessions, `>gi|123456789|ref|XXXX||`, or custom kraken name
- `>kraken:taxid|1234|`.
+ Builds a krakenuniq database from library directory of fastas and taxonomy
+ db directory. The --subsetTaxonomy option allows shrinking the taxonomy to
+ only include taxids associated with the library folders. For this to work,
+ the library fastas must have the standard accession id names such as
+ `>NC1234.1` or `>NC_01234.1`.
Setting the --minimizerLen (default: 16) small, such as 10, will drastically
shrink the db size for small inputs, which is useful for testing.
@@ -1503,45 +1319,28 @@ def kraken_build(db, library, taxonomy=None, subsetTaxonomy=None,
taxonomy_exists = os.path.exists(taxonomy_dir)
if taxonomy:
if taxonomy_exists:
- raise KrakenBuildError('Output db directory already contains taxonomy directory {}'.format(taxonomy_dir))
+ raise KrakenUniqBuildError('Output db directory already contains taxonomy directory {}'.format(taxonomy_dir))
if subsetTaxonomy:
- kraken_taxids, kraken_gis, kraken_accessions = kraken_library_ids(library)
-
- whitelist_taxid_f = util.file.mkstempfname()
- with open(whitelist_taxid_f, 'wt') as f:
- for taxid in kraken_taxids:
- print(taxid, file=f)
-
- whitelist_gi_f = util.file.mkstempfname()
- with open(whitelist_gi_f, 'wt') as f:
- for gi in kraken_gis:
- print(gi, file=f)
+ accessions = fasta_library_accessions(library)
whitelist_accession_f = util.file.mkstempfname()
with open(whitelist_accession_f, 'wt') as f:
- for accession in kraken_accessions:
+ for accession in accessions:
print(accession, file=f)
# Context-managerize eventually
taxonomy_tmp = tempfile.mkdtemp()
- subset_taxonomy(taxonomy, taxonomy_tmp, whitelistTaxidFile=whitelist_taxid_f,
- whitelistGiFile=whitelist_gi_f, whitelistAccessionFile=whitelist_accession_f)
+ subset_taxonomy(taxonomy, taxonomy_tmp, whitelistAccessionFile=whitelist_accession_f)
shutil.move(taxonomy_tmp, taxonomy_dir)
else:
os.symlink(os.path.abspath(taxonomy), taxonomy_dir)
- for fn in os.listdir(os.path.join(taxonomy_dir, 'accession2taxid')):
- if not fn.endswith('accession2taxid'):
- continue
-
- os.symlink(os.path.join(taxonomy_dir, 'accession2taxid', fn),
- os.path.join(taxonomy_dir, fn))
else:
if not taxonomy_exists:
raise FileNotFoundError('Taxonomy directory {} not found'.format(taxonomy_dir))
if subsetTaxonomy:
- raise KrakenBuildError('Cannot subset taxonomy if already in db folder')
+ raise KrakenUniqBuildError('Cannot subset taxonomy if already in db folder')
- kraken_tool = tools.kraken.Kraken()
+ krakenuniq_tool = tools.kraken.KrakenUniq()
options = {'--build': None}
if threads:
options['--threads'] = threads
@@ -1553,11 +1352,11 @@ def kraken_build(db, library, taxonomy=None, subsetTaxonomy=None,
options['--max-db-size'] = maxDbSize
if workOnDisk:
options['--work-on-disk'] = None
- kraken_tool.build(db, options=options)
+ krakenuniq_tool.build(db, options=options)
if clean:
- kraken_tool.execute('kraken-build', db, '', options={'--clean': None})
-__commands__.append(('kraken_build', parser_kraken_build))
+ krakenuniq_tool.execute('krakenuniq-build', db, '', options={'--clean': None})
+__commands__.append(('krakenuniq_build', parser_krakenuniq_build))
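+# Illustrative build invocation (hypothetical paths; a small --minimizerLen such as 10 keeps test databases small):
+#   metagenomics.py krakenuniq_build /path/to/new_db \
+#       --library /path/to/library_fastas --taxonomy /path/to/ncbi_taxonomy \
+#       --subsetTaxonomy --minimizerLen 10 --clean --threads 16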
diff --git a/pipes/WDL/dx-defaults-classify_kaiju.json b/pipes/WDL/dx-defaults-classify_kaiju.json
new file mode 100644
index 000000000..1616c0376
--- /dev/null
+++ b/pipes/WDL/dx-defaults-classify_kaiju.json
@@ -0,0 +1,8 @@
+{
+ "classify_kaiju.kaiju.kaiju_db_lz4":
+ "dx://file-FVYQqP006zFF064QBGf022X1",
+ "classify_kaiju.kaiju.krona_taxonomy_db_tgz":
+ "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb",
+ "classify_kaiju.kaiju.ncbi_taxonomy_db_tgz":
+ "dx://file-F8KgJK009y3Qgy3FF1791Vgq"
+}
diff --git a/pipes/WDL/dx-defaults-classify_kraken.json b/pipes/WDL/dx-defaults-classify_kraken.json
deleted file mode 100644
index be31e4ca5..000000000
--- a/pipes/WDL/dx-defaults-classify_kraken.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "classify_kraken.kraken.kraken_db_tar_lz4":
- "dx://file-F8QF3bj0xf5gZv76BG4QgpF4",
- "classify_kraken.kraken.krona_taxonomy_db_tgz":
- "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
-}
diff --git a/pipes/WDL/dx-defaults-classify_krakenuniq.json b/pipes/WDL/dx-defaults-classify_krakenuniq.json
new file mode 100644
index 000000000..eabf57fc7
--- /dev/null
+++ b/pipes/WDL/dx-defaults-classify_krakenuniq.json
@@ -0,0 +1,6 @@
+{
+ "classify_krakenuniq.krakenuniq.krakenuniq_db_tar_lz4":
+ "dx://file-FVYQqP006zFF064QBGf022X1",
+ "classify_krakenuniq.krakenuniq.krona_taxonomy_db_tgz":
+ "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
+}
diff --git a/pipes/WDL/dx-defaults-demux_plus.json b/pipes/WDL/dx-defaults-demux_plus.json
index a50a8c985..1b25a153d 100644
--- a/pipes/WDL/dx-defaults-demux_plus.json
+++ b/pipes/WDL/dx-defaults-demux_plus.json
@@ -15,8 +15,8 @@
"demux_plus.trim_clip_db":
"dx://file-BXF0vYQ0QyBF509G9J12g927",
- "demux_plus.kraken.kraken_db_tar_lz4":
- "dx://file-F8QF3bj0xf5gZv76BG4QgpF4",
- "demux_plus.kraken.krona_taxonomy_db_tgz":
+ "demux_plus.krakenuniq.krakenuniq_db_tar_lz4":
+ "dx://file-FVYQqP006zFF064QBGf022X1",
+ "demux_plus.krakenuniq.krona_taxonomy_db_tgz":
"dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb"
}
diff --git a/pipes/WDL/workflows/classify_kaiju.wdl b/pipes/WDL/workflows/classify_kaiju.wdl
new file mode 100644
index 000000000..5fe139e34
--- /dev/null
+++ b/pipes/WDL/workflows/classify_kaiju.wdl
@@ -0,0 +1,5 @@
+import "tasks_metagenomics.wdl" as metagenomics
+
+workflow classify_kaiju {
+ call metagenomics.kaiju
+}
diff --git a/pipes/WDL/workflows/classify_kraken.wdl b/pipes/WDL/workflows/classify_kraken.wdl
deleted file mode 100644
index 50a72d6f0..000000000
--- a/pipes/WDL/workflows/classify_kraken.wdl
+++ /dev/null
@@ -1,5 +0,0 @@
-import "tasks_metagenomics.wdl" as metagenomics
-
-workflow classify_kraken {
- call metagenomics.kraken
-}
diff --git a/pipes/WDL/workflows/classify_krakenuniq.wdl b/pipes/WDL/workflows/classify_krakenuniq.wdl
new file mode 100644
index 000000000..8a664f345
--- /dev/null
+++ b/pipes/WDL/workflows/classify_krakenuniq.wdl
@@ -0,0 +1,5 @@
+import "tasks_metagenomics.wdl" as metagenomics
+
+workflow classify_krakenuniq {
+ call metagenomics.krakenuniq
+}
diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl
index 02170aa0a..70128d7bf 100644
--- a/pipes/WDL/workflows/demux_metag.wdl
+++ b/pipes/WDL/workflows/demux_metag.wdl
@@ -25,18 +25,14 @@ workflow demux_metag {
assembler = "spades",
reads_unmapped_bam = deplete.cleaned_bam
}
- call metagenomics.diamond_contigs as diamond {
- input:
- contigs_fasta = spades.contigs_fasta,
- reads_unmapped_bam = deplete.cleaned_bam,
- krona_taxonomy_db_tar_lz4 = krona_taxonomy_db_tgz
- }
}
- call metagenomics.kraken as kraken {
+ call metagenomics.krakenuniq as kraken {
+ input:
+ reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
+ }
+ call metagenomics.kaiju as kaiju {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
- krona_taxonomy_db_tgz = krona_taxonomy_db_tgz
}
-
}
diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl
index b1f388c39..d2a6e0c2c 100644
--- a/pipes/WDL/workflows/demux_plus.wdl
+++ b/pipes/WDL/workflows/demux_plus.wdl
@@ -35,7 +35,7 @@ workflow demux_plus {
}
}
- call metagenomics.kraken as kraken {
+ call metagenomics.krakenuniq as krakenuniq {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams
}
diff --git a/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl b/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
index 194255e24..6d43044ca 100644
--- a/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
+++ b/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl
@@ -1,17 +1,12 @@
-
-# TO DO:
-# kraken_build (input & output tarballs)
-# diamond, bwa, etc
-
-task kraken {
+task krakenuniq {
Array[File] reads_unmapped_bam
- File kraken_db_tar_lz4
- File krona_taxonomy_db_tgz
+ File krakenuniq_db_tar_lz4 # krakenuniq/{database.kdb,taxonomy}
+ File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
parameter_meta {
- kraken_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type
+ krakenuniq_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type
krona_taxonomy_db_tgz : "stream" # for DNAnexus, until WDL implements the File| type
- #reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type
+ reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type
}
command {
@@ -24,7 +19,7 @@ task kraken {
# decompress DB to $DB_DIR
read_utils.py extract_tarball \
- ${kraken_db_tar_lz4} $DB_DIR \
+ ${krakenuniq_db_tar_lz4} $DB_DIR \
--loglevel=DEBUG
read_utils.py extract_tarball \
${krona_taxonomy_db_tgz} . \
@@ -33,17 +28,17 @@ task kraken {
# prep input and output file names
OUT_READS=fnames_outreads.txt
OUT_REPORTS=fnames_outreports.txt
- OUT_BASENAME=basenames_reads.txt
+ OUT_BASENAME=basenames_reports.txt
for bam in ${sep=' ' reads_unmapped_bam}; do
- echo "$(basename $bam .bam).kraken-reads" >> $OUT_BASENAME
- echo "$(basename $bam .bam).kraken-reads.txt.gz" >> $OUT_READS
- echo "$(basename $bam .bam).kraken-summary_report.txt" >> $OUT_REPORTS
+ echo "$(basename $bam .bam).krakenuniq-reads.txt.gz" >> $OUT_READS
+ echo "$(basename $bam .bam).krakenuniq" >> $OUT_BASENAME
+ echo "$(basename $bam .bam).krakenuniq-summary_report.txt" >> $OUT_REPORTS
done
# execute on all inputs and outputs serially, but with a single
# database load into ram
- metagenomics.py kraken \
- $DB_DIR \
+ metagenomics.py krakenuniq \
+ $DB_DIR/krakenuniq \
${sep=' ' reads_unmapped_bam} \
--outReads `cat $OUT_READS` \
--outReport `cat $OUT_REPORTS` \
@@ -54,21 +49,18 @@ task kraken {
# run single-threaded krona on up to nproc samples at once
parallel -I ,, \
"metagenomics.py krona \
- ,,.txt.gz \
+ ,,-summary_report.txt \
taxonomy \
- ,,.html \
- --noRank --noHits \
+ ,,.krona.html \
+ --noRank --noHits --inputType krakenuniq \
--loglevel=DEBUG" \
::: `cat $OUT_BASENAME`
- # run single-threaded gzip on up to nproc samples at once
- parallel -I ,, "tar czf ,,.krona.tar.gz ,,.html*" ::: `cat $OUT_BASENAME`
}
output {
- Array[File] kraken_classified_reads = glob("*.kraken-reads.txt.gz")
- Array[File] kraken_summary_report = glob("*.kraken-summary_report.txt")
- Array[File] krona_report_html = glob("*.kraken-reads.html")
- Array[File] krona_report_tgz = glob("*.kraken-reads.krona.tar.gz")
+ Array[File] krakenuniq_classified_reads = glob("*.krakenuniq-reads.txt.gz")
+ Array[File] krakenuniq_summary_report = glob("*.krakenuniq-summary_report.txt")
+ Array[File] krona_report_html = glob("*.krakenuniq.krona.html")
String viralngs_version = "viral-ngs_version_unknown"
}
@@ -105,13 +97,10 @@ task krona {
${input_basename}.html \
--noRank --noHits \
--loglevel=DEBUG
-
- tar czf ${input_basename}.krona.tar.gz ${input_basename}.html*
}
output {
- File krona_report_html = "${input_basename}.html"
- File krona_report_tgz = "${input_basename}.krona.tar.gz"
+ File krona_report_html = "${input_basename}.html"
String viralngs_version = "viral-ngs_version_unknown"
}
@@ -182,19 +171,18 @@ task filter_bam_to_taxa {
}
-task diamond_contigs {
- File contigs_fasta
+task kaiju {
File reads_unmapped_bam
- File diamond_db_lz4
- File diamond_taxonomy_db_tar_lz4
- File krona_taxonomy_db_tar_lz4
+ File kaiju_db_lz4 # .fmi
+ File ncbi_taxonomy_db_tgz # taxonomy/{nodes.dmp, names.dmp}
+ File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
- String contigs_basename = basename(contigs_fasta, ".fasta")
+ String input_basename = basename(reads_unmapped_bam, ".bam")
parameter_meta {
- diamond_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
- diamond_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
- krona_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
+ kaiju_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type
+ ncbi_taxonomy_db_tgz : "stream"
+ krona_taxonomy_db_tgz : "stream"
}
command {
@@ -203,61 +191,41 @@ task diamond_contigs {
if [ -d /mnt/tmp ]; then
TMPDIR=/mnt/tmp
fi
- DIAMOND_TAXDB_DIR=$(mktemp -d)
+ DB_DIR=$(mktemp -d)
- # find 90% memory
- mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90`
+ lz4 -d ${kaiju_db_lz4} $DB_DIR/kaiju.fmi
- # decompress DBs to /mnt/db
- cat ${diamond_db_lz4} | lz4 -d > $TMPDIR/diamond_db.dmnd &
read_utils.py extract_tarball \
- ${diamond_taxonomy_db_tar_lz4} $DIAMOND_TAXDB_DIR \
- --loglevel=DEBUG &
- wait
- read_utils.py extract_tarball \
- ${krona_taxonomy_db_tar_lz4} . \
- --loglevel=DEBUG & # we don't need this until later
+ ${ncbi_taxonomy_db_tgz} $DB_DIR \
+ --loglevel=DEBUG
- # classify contigs
- metagenomics.py diamond_fasta \
- ${contigs_fasta} \
- $TMPDIR/diamond_db.dmnd \
- $DIAMOND_TAXDB_DIR/taxonomy/ \
- ${contigs_basename}.diamond.fasta \
- --memLimitGb $mem_in_gb \
+ read_utils.py extract_tarball \
+ ${krona_taxonomy_db_tgz} . \
--loglevel=DEBUG
- # map reads to contigs & create kraken-like read report
- bwa index ${contigs_basename}.diamond.fasta
- metagenomics.py align_rna \
+    # classify reads
+ metagenomics.py kaiju \
${reads_unmapped_bam} \
- ${contigs_basename}.diamond.fasta \
- $DIAMOND_TAXDB_DIR/taxonomy/ \
- ${contigs_basename}.diamond.summary_report.txt \
- --outReads ${contigs_basename}.diamond.reads.txt.gz \
- --dupeReads ${contigs_basename}.diamond.reads_w_dupes.txt.gz \
- --outBam ${contigs_basename}.diamond.bam \
+ $DB_DIR/kaiju.fmi \
+ $DB_DIR/taxonomy \
+ ${input_basename}.kaiju.report.txt \
+ --outReads ${input_basename}.kaiju.reads.gz \
--loglevel=DEBUG
# run krona
- wait # for krona_taxonomy_db_tgz to download and extract
metagenomics.py krona \
- ${contigs_basename}.diamond.reads.txt.gz \
+ ${input_basename}.kaiju.report.txt \
taxonomy \
- ${contigs_basename}.diamond.html \
+ ${input_basename}.kaiju.html \
+ --inputType kaiju \
--noRank --noHits \
--loglevel=DEBUG
- tar czf ${contigs_basename}.diamond.krona.tar.gz ${contigs_basename}.diamond.html*
}
output {
- File diamond_contigs = "${contigs_basename}.diamond.fasta"
- File reads_mapped_to_contigs = "${contigs_basename}.diamond.bam"
- File diamond_summary_report = "${contigs_basename}.diamond.summary_report.txt"
- File diamond_classified_reads = "${contigs_basename}.diamond.reads.txt.gz"
- File diamond_classified_reads_w_dupes = "${contigs_basename}.diamond.reads_w_dupes.txt.gz"
- File krona_report_html = "${contigs_basename}.diamond.html"
- File krona_report_tgz = "${contigs_basename}.diamond.krona.tar.gz"
+ File kaiju_report = "${input_basename}.kaiju.report.txt"
+ File kaiju_reads = "${input_basename}.kaiju.reads.gz"
+ File krona_report_html = "${input_basename}.kaiju.html"
String viralngs_version = "viral-ngs_version_unknown"
}
@@ -268,5 +236,3 @@ task diamond_contigs {
dx_instance_type: "mem3_ssd1_x16"
}
}
-
-
diff --git a/pipes/config.yaml b/pipes/config.yaml
index 3178c6df8..82483df71 100644
--- a/pipes/config.yaml
+++ b/pipes/config.yaml
@@ -50,17 +50,17 @@ trim_clip_db: "s3://sabeti-public-dbs/trim_clip/contaminants.fasta"
# Can be a remote path to a fasta file (s3://, gs://, etc.).
spikeins_db: "s3://sabeti-public-dbs/spikeins/ercc_spike-ins.fasta"
-# Directory of kraken database for metagenomics identification
+# Directory of krakenuniq database for metagenomics identification
# For pre-built hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/kraken_viral_diversity.tar.gz
-# Can be a remote path dirname (s3://, gs://, etc.).
-kraken_db: "s3://sabeti-public-dbs/kraken"
+# Can be a remote path dirname (s3://, gs://, etc.).
+krakenuniq_db: "s3://sabeti-public-dbs/krakenuniq"
-# Path of DIAMOND database file for metagenomics identification
+# Path of Kaiju database file for metagenomics identification
# For pre-built hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/nr.dmnd.gz
# Can be a remote path prefix (s3://, gs://, etc.).
-diamond_db: "s3://sabeti-public-dbs/diamond/nr-20170529"
+kaiju_db: "s3://sabeti-public-dbs/kaiju/nr/nr.fmi"
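+# Note that the Kaiju database is a single FM-index (.fmi) file, so this setting points at the
+# file itself rather than a directory.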
# Directory of the krona database for generating html reports.
# For pre-built hosted databases, you may download:
@@ -68,14 +68,6 @@ diamond_db: "s3://sabeti-public-dbs/diamond/nr-20170529"
# Can be a remote path dirname (s3://, gs://, etc.).
krona_db: "s3://sabeti-public-dbs/krona"
-# BWA index prefix for metagenomic alignment.
-# This contains the full human reference genome, multiple diverse sequences covering human
-# pathogenic viruses, as well as LSU and SSU rRNA sequences from SILVA.
-# For pre-built hosted databases, you may download:
-# - https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4
-# Can be a remote path prefix (s3://, gs://, etc.).
-align_rna_db: "s3://sabeti-public-dbs/rna_bwa/human_viral_rrna"
-
# Directory of the NCBI taxonomy dmp files
# For pre-packaged hosted databases, you may download:
# - https://storage.googleapis.com/sabeti-public/meta_dbs/taxonomy.tar.lz4
@@ -226,11 +218,6 @@ vphaser_max_bins: 10
# allele freq > 0.005. If false, no filtering is performed.
vcf_merge_naive_filter: false
-# Strategy for kraken classification for multiple inputs. 'single' results in 1 kraken process for
-# each input bam, while 'multiple' coalesces multiple input classifications into a single kraken
-# run.
-kraken_execution: single
-
# Random seed, for non-deterministic steps. 0 means use current time.
random_seed: 0
diff --git a/pipes/rules/metagenomics.rules b/pipes/rules/metagenomics.rules
index 3baf5a3e1..8d82bf8a2 100644
--- a/pipes/rules/metagenomics.rules
+++ b/pipes/rules/metagenomics.rules
@@ -6,204 +6,88 @@ samples = list(read_samples_file(config.get("samples_metagenomics")))
rule all_metagenomics:
input:
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.raw.{method}.report"), sample=samples, method=['kraken', 'rna_bwa', 'rna_bwa.nodupes']),
+ "{sample}.raw.{method}.report"), sample=samples, method=['krakenuniq']),
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.raw.{method}.krona.html"), sample=samples, method=['kraken', 'rna_bwa', 'rna_bwa_nodupes'])
+ "{sample}.raw.{method}.krona.html"), sample=samples, method=['krakenuniq'])
rule all_metagenomics_host_depleted:
input:
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.cleaned.{method}.report"), sample=samples, method=['kraken', 'rna_bwa']),
+ "{sample}.cleaned.{method}.report"), sample=samples, method=['krakenuniq']),
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.cleaned.{method}.krona.html"), sample=samples, method=['kraken', 'rna_bwa'])
-
-
-method_props = {
- 'diamond': {
- 'reads_ext': 'diamond.lca.gz',
- },
- 'kraken': {
- 'reads_ext': 'kraken.reads.gz',
- },
- 'rna_bwa': {
- 'reads_ext': 'rna_bwa.lca.gz',
- },
- 'rna_bwa_nodupes': {
- 'reads_ext': 'rna_bwa.lca_nodupes.gz',
- }
-}
+ "{sample}.cleaned.{method}.krona.html"), sample=samples, method=['krakenuniq'])
+
taxfiles = [
- 'names.dmp',
- 'nodes.dmp',
- 'merged.dmp'
- ]
-
-taxfiles_kraken = [
- 'names.dmp',
- 'nodes.dmp'
+ 'names.dmp',
+ 'nodes.dmp',
]
taxfiles_krona = [
- 'taxonomy.tab',
+ 'taxonomy.tab',
]
-rna_bwa_ext = ['sa', 'bwt', 'amb', 'sa', 'ann', 'pac']
-rule diamond:
+rule kaiju:
input:
- bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"),
- diamond_db = objectify_remote(expand("{path_prefix}.{ext}", path_prefix=config["diamond_db"], ext=["dmnd"])),
- taxonomy_db = objectify_remote(expand("{path_prefix}/{taxfile}", path_prefix=config["taxonomy_db"], taxfile=taxfiles))
+ bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"),
+ kaiju_db = objectify_remote(config['kaiju_db']),
+ taxonomy_db = objectify_remote(expand("{path_prefix}/{taxfile}", path_prefix=config['taxonomy_db'], taxfile=taxfiles))
output:
- report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.diamond.report"),
- reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.diamond.lca.gz")
+ report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kaiju.report"),
+ reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kaiju.reads.gz")
resources:
threads = int(config.get("number_of_threads", 1)),
- mem_mb = 120*1000
+ mem_mb = 50*1000
params:
- UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00')
+ UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=12:00:00')
run:
- diamond_db_prefix = os.path.splitext(strip_protocol(config["diamond_db"], relative=True))[0]
taxonomy_db_prefix = strip_protocol(config["taxonomy_db"], relative=True)
- shell("{config[bin_dir]}/metagenomics.py diamond {input.bam} "+diamond_db_prefix+" "+taxonomy_db_prefix+" {output.report} --outReads {output.reads} --threads {resources.threads}")
+ shell("{config[bin_dir]}/metagenomics.py kaiju {input.bam} {input.kaiju_db} "+taxonomy_db_prefix+" {output.report} --outReads {output.reads} --threads {resources.threads}")
-rule align_rna:
+
+rule krakenuniq:
input:
- bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"),
- rna_bwa_db = objectify_remote(expand("{path_prefix}.{ext}", path_prefix=config["align_rna_db"], ext=rna_bwa_ext)),
- taxonomy_db = objectify_remote(expand("{path_prefix}/{taxfile}", path_prefix=config["taxonomy_db"], taxfile=taxfiles))
+ bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam")
output:
- report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.report"),
- nodupes_report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.nodupes.report"),
- bam = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.bam"),
- lca = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca.gz"),
- nodupes_lca = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca_nodupes.gz")
+ report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.krakenuniq.report"),
+ reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.krakenuniq.reads.gz")
resources:
threads = int(config.get("number_of_threads", 1)),
- mem_mb = 7*1000
+ mem_mb = 120*1000
params:
UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00')
- run:
- rna_bwa_path_prefix = strip_protocol(config["align_rna_db"], relative=True)
- taxonomy_db_prefix = strip_protocol(config["taxonomy_db"], relative=True)
- shell("{config[bin_dir]}/metagenomics.py align_rna {input.bam} "+rna_bwa_path_prefix+" "+taxonomy_db_prefix+" {output.nodupes_report} --dupeReport {output.report} --outBam {output.bam} --outReads {output.nodupes_lca} --dupeReads {output.lca} --JVMmemory {resources.mem_mb}m --threads {resources.threads}")
-
-if config['kraken_execution'] == 'multiple':
- kraken_samples = []
- all_kraken_inputs = {}
- all_kraken_reports = {}
- all_kraken_reads = {}
- for adjective in ('raw', 'cleaned'):
- for sample in samples:
- kraken_report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.kraken.report".format(
- sample=sample, adjective=adjective))
- kraken_input = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam".format(
- sample=sample, adjective=adjective))
- try:
- itime = os.path.getmtime(kraken_input)
- otime = os.path.getmtime(kraken_report)
- except OSError:
- kraken_samples.append(sample)
- continue
- if itime > otime:
- kraken_samples.append(sample)
- if not kraken_samples:
- kraken_samples = samples.copy()
-
- all_kraken_inputs[adjective] = expand(
- os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"),
- sample=kraken_samples, adjective=adjective)
- all_kraken_reports[adjective] = expand(
- os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.kraken.report"),
- sample=kraken_samples, adjective=adjective)
- all_kraken_reads[adjective] = expand(
- os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.kraken.reads.gz"),
- sample=kraken_samples, adjective=adjective)
-
- kraken_shell = textwrap.dedent("""\
- {config[bin_dir]}/metagenomics.py kraken {params.kraken_db} {input.bams} --outReads {output.reads} --outReports {output.reports} \
- {params.lock_memory} --threads {resources.threads}
- """)
-
- rule kraken_multiple_raw:
- input:
- bams = all_kraken_inputs['raw']
- output:
- reports = all_kraken_reports['raw'],
- reads = all_kraken_reads['raw']
- resources:
- threads = int(config.get("number_of_threads", 1)),
- mem_mb = 120*1000
- params:
- UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00'),
- kraken_db = config['kraken_db'],
- lock_memory = ' --lockMemory' if config.get('kraken_lock_memory') else ''
- shell:
- kraken_shell
-
- rule kraken_multiple_cleaned:
- input:
- bams = all_kraken_inputs['cleaned']
- output:
- reports = all_kraken_reports['cleaned'],
- reads = all_kraken_reads['cleaned']
- resources:
- threads = int(config.get("number_of_threads", 1)),
- mem_mb = 120*1000
- params:
- UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00'),
- kraken_db = config['kraken_db'],
- lock_memory = ' --lockMemory' if config.get('kraken_lock_memory') else ''
- shell:
- kraken_shell
-
-else:
- rule kraken:
- input:
- bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam")
- output:
- report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kraken.report"),
- reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kraken.reads.gz")
- resources:
- threads = int(config.get("number_of_threads", 1)),
- mem_mb = 120*1000
- params:
- UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00'),
- kraken_db = config['kraken_db'],
- lock_memory = ' --lockMemory' if config.get('kraken_lock_memory') else ''
- shell:
- """
- {config[bin_dir]}/metagenomics.py kraken {params.kraken_db} {input.bam} --outReads {output.reads} --outReports {output.report} \
- {params.lock_memory} --threads {resources.threads}
- """
-
-
-rule all_kraken:
+ shell:
+ """
+ {config[bin_dir]}/metagenomics.py krakenuniq {config[krakenuniq_db]} {input.bam} --outReads {output.reads} --outReports {output.report} \
+ --threads {resources.threads}
+ """
+
+
+rule all_krakenuniq:
input:
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.raw.kraken.report"), sample=samples),
+ "{sample}.raw.krakenuniq.report"), sample=samples),
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.raw.kraken.krona.html"), sample=samples)
+ "{sample}.raw.krakenuniq.krona.html"), sample=samples)
-rule all_kraken_host_depleted:
+rule all_krakenuniq_host_depleted:
input:
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.cleaned.kraken.report"), sample=samples),
+ "{sample}.cleaned.krakenuniq.report"), sample=samples),
expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"],
- "{sample}.cleaned.kraken.krona.html"), sample=samples)
+ "{sample}.cleaned.krakenuniq.krona.html"), sample=samples)
rule krona_import_taxonomy:
input:
- tsv = lambda wildcards: os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], \
- ".".join([wildcards.sample, wildcards.adjective, method_props[wildcards.method]['reads_ext']])),
- krona_db = objectify_remote(expand("{path_prefix}/{kronafile}", path_prefix=config["krona_db"], kronafile=taxfiles_krona))#["taxonomy.tab", "gi_taxid.dat"]))
+ tsv = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.{method}.report"),
+ krona_db = objectify_remote(expand("{path_prefix}/{kronafile}", path_prefix=config["krona_db"], kronafile=taxfiles_krona))
output:
- os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.{method,kraken|diamond|rna_bwa|rna_bwa_nodupes}.krona.html")
+ os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.{method,krakenuniq|kaiju}.krona.html")
resources:
mem_mb=32*1000
run:
krona_db_prefix = strip_protocol(config["krona_db"], relative=True)
- shell("{config[bin_dir]}/metagenomics.py krona {input.tsv} "+krona_db_prefix+" {output} --noRank")
+ shell("{config[bin_dir]}/metagenomics.py krona {input.tsv} "+krona_db_prefix+" {output} --noRank --inputType {wildcards.method}")
diff --git a/requirements-conda.txt b/requirements-conda.txt
index 10a0b8204..2415d8459 100644
--- a/requirements-conda.txt
+++ b/requirements-conda.txt
@@ -6,7 +6,8 @@ cd-hit-auxtools=4.6.8
diamond=0.9.10
kmc=3.1.1rc1
gatk=3.8
-kraken=1.0.0_fork3
+kaiju=1.6.3_yesimon
+krakenuniq=0.5.7_yesimon
krona=2.7
last=876
lbzip2=2.5
diff --git a/test/input/TestMetagenomicsSimple/db/library/krakenuniq.map b/test/input/TestMetagenomicsSimple/db/library/krakenuniq.map
new file mode 100644
index 000000000..5724240e4
--- /dev/null
+++ b/test/input/TestMetagenomicsSimple/db/library/krakenuniq.map
@@ -0,0 +1,5 @@
+NC_002549.1 186538
+NC_004161.1 186539
+NC_006432.1 186540
+NC_014373.1 565995
+NC_014372.1 186541
\ No newline at end of file
diff --git a/test/input/TestMetagenomicsViralMix/db/library/krakenuniq.map b/test/input/TestMetagenomicsViralMix/db/library/krakenuniq.map
new file mode 100644
index 000000000..070cd373d
--- /dev/null
+++ b/test/input/TestMetagenomicsViralMix/db/library/krakenuniq.map
@@ -0,0 +1,7 @@
+NC_002058.3 138950
+NC_002692.1 12253
+NC_004102.1 11103
+AY184220.1 12083
+EU255973.1 31646
+KJ207374.1 12253
+KM609478.1 42788
\ No newline at end of file
diff --git a/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json b/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json
index 791ce4273..105cb090d 100644
--- a/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json
+++ b/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json
@@ -11,6 +11,6 @@
{ "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-BzBpkK80x0z8GBQPfZ4kgPKp" } }
],
- "stage-3.kraken_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F86qZg809y3Qvp7j7vgFKj4Z" } },
+ "stage-3.krakenuniq_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-FVYQqP006zFF064QBGf022X1" } },
"stage-3.krona_taxonomy_db_tgz": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F4z0fgj07FZ8jg8yP7yz0Qzb" } }
}
diff --git a/test/integration/fixtures.py b/test/integration/fixtures.py
new file mode 100644
index 000000000..47b5a5a45
--- /dev/null
+++ b/test/integration/fixtures.py
@@ -0,0 +1,50 @@
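+'''Shared fixture bodies for the metagenomics integration tests.
+
+These functions are intentionally left undecorated; the importing test modules
+wrap them with pytest.fixture(scope='module') so each module controls scoping.
+'''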
+import gzip
+import os
+from os.path import join
+import shutil
+import tools
+import tools.kaiju
+import tools.picard
+import tools.krona
+import util.file
+
+# @pytest.fixture(scope='module')
+def krona():
+ krona = tools.krona.Krona()
+ krona.install()
+ return krona
+
+
+# @pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix'])
+def db_type(request):
+ return request.param
+
+
+# @pytest.fixture(scope='module')
+def krona_db(request, tmpdir_module, krona, db_type):
+ data_dir = join(util.file.get_test_input_path(), db_type)
+ db_dir = join(data_dir, 'db')
+
+ db = join(tmpdir_module, 'krona_db_{}'.format(db_type))
+ os.mkdir(db)
+ for d in ('names.dmp', 'nodes.dmp'):
+ src = join(db_dir, 'taxonomy', d)
+ dest = join(db, d)
+ if os.path.isfile(src):
+ os.symlink(src, dest)
+ krona.create_db(db)
+ return db
+
+
+def taxonomy_db(request, tmpdir_module, db_type):
+ taxonomy = join(tmpdir_module, db_type, 'taxonomy')
+ shutil.copytree(join(util.file.get_test_input_path(), db_type, 'db', 'taxonomy'),
+ taxonomy)
+ prot = join(taxonomy, 'accession2taxid', 'prot.accession2taxid')
+ prot_gz = prot + '.gz'
+
+ with open(prot, 'rb') as f_in:
+ with gzip.open(prot_gz, 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+
+ return taxonomy
diff --git a/test/integration/test_diamond.py b/test/integration/test_diamond.py
deleted file mode 100644
index 93f010b9e..000000000
--- a/test/integration/test_diamond.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Integration tests for diamond
-
-from builtins import super
-import argparse
-import fnmatch
-import gzip
-import os
-from os.path import join
-import sys
-import shutil
-import tempfile
-import pytest
-from Bio import SeqIO
-
-import metagenomics
-import tools
-import tools.diamond
-import tools.picard
-import util.file
-
-def find_files(root_dir, filt):
- matches = []
- for root, dirnames, filenames in os.walk(root_dir):
- for filename in fnmatch.filter(filenames, filt):
- yield join(root, filename)
-
-
-@pytest.fixture(scope='module')
-def fastq_to_sam():
- return tools.picard.FastqToSamTool()
-
-
-@pytest.fixture(scope='module')
-def sam_to_fastq():
- return tools.picard.SamToFastqTool()
-
-
-@pytest.fixture(scope='module')
-def diamond():
- diamond = tools.diamond.Diamond()
- diamond.install()
- return diamond
-
-
-@pytest.fixture(scope='module')
-def krona():
- krona = tools.krona.Krona()
- krona.install()
- return krona
-
-
-@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix'])
-def db_type(request):
- return request.param
-
-
-def input_fastq_paths():
- data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple')
- return [os.path.join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
-
-
-def input_bam_paths():
- data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
- return join(data_dir, 'test-reads.bam')
-
-
-@pytest.fixture(scope='module')
-def input_bam(request, tmpdir_module, fastq_to_sam, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- if db_type == 'TestMetagenomicsSimple':
- fastqs = [os.path.join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
-
- bam_name = 'zaire_ebola.bam'
- bam = os.path.join(tmpdir_module, bam_name)
- fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam)
- return bam
-
- return input_bam_paths()
-
-
-@pytest.fixture(scope='module')
-def input_fastqs(request, tmpdir_module, sam_to_fastq, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- if db_type == 'TestMetagenomicsSimple':
- fastqs = [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
- return fastqs
-
- bam = join(data_dir, 'test-reads.bam')
- basename = os.path.basename(bam)
- fastq1 = join(tmpdir_module, '{}.1.fastq'.format(basename))
- fastq2 = join(tmpdir_module, '{}.2.fastq'.format(basename))
- sam_to_fastq.execute(bam, fastq1, fastq2)
- return fastq1, fastq2
-
-
-@pytest.fixture(scope='module')
-def taxonomy_db(request, tmpdir_module, db_type):
- taxonomy = os.path.join(tmpdir_module, db_type, 'taxonomy')
- shutil.copytree(join(util.file.get_test_input_path(), db_type, 'db', 'taxonomy'),
- taxonomy)
- prot = os.path.join(taxonomy, 'accession2taxid', 'prot.accession2taxid')
- prot_gz = prot + '.gz'
-
- with open(prot, 'rb') as f_in:
- with gzip.open(prot_gz, 'wb') as f_out:
- shutil.copyfileobj(f_in, f_out)
-
- return taxonomy
-
-
-@pytest.fixture(scope='module')
-def diamond_db(request, tmpdir_module, diamond, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- db_dir = join(data_dir, 'db')
-
- db = os.path.join(tmpdir_module, db_type + '.dmnd')
- translated = os.path.join(tmpdir_module, db_type + '.faa')
-
- lib_dir = join(db_dir, 'library')
- util.file.cat(translated, find_files(db_dir, '*.faa'))
-
- diamond.build(db, [translated])
- return os.path.splitext(db)[0]
-
-
-TAXONOMY_FILES = ('gi_taxid_nucl.dmp',
- 'gi_taxid_prot.dmp',
- 'names.dmp',
- 'nodes.dmp',
- 'merged.dmp')
-
-
-@pytest.fixture(scope='module')
-def krona_db(request, tmpdir_module, krona, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- db_dir = os.path.join(data_dir, 'db')
-
- db = os.path.join(tmpdir_module, 'krona_db_{}'.format(db_type))
- os.mkdir(db)
- for d in TAXONOMY_FILES:
- src = join(db_dir, 'taxonomy', d)
- dest = join(db, d)
- os.symlink(src, dest)
- krona.create_db(db)
- return db
-
-
-@pytest.mark.skipif(tools.is_osx(), reason="not currently tested under OSX")
-def test_diamond(diamond_db, taxonomy_db, input_bam):
- out_report = util.file.mkstempfname('.report')
- out_reads = util.file.mkstempfname('.lca.tsv')
- cmd = [input_bam, diamond_db, taxonomy_db, out_report, '--outReads', out_reads]
- parser = metagenomics.parser_diamond(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
-
- assert os.path.getsize(out_report) > 0
- assert os.path.getsize(out_reads) > 0
diff --git a/test/integration/test_kaiju.py b/test/integration/test_kaiju.py
new file mode 100644
index 000000000..928e08933
--- /dev/null
+++ b/test/integration/test_kaiju.py
@@ -0,0 +1,163 @@
+# Integration tests for kaiju
+from builtins import super
+import argparse
+import binascii
+import fnmatch
+import gzip
+import os
+from os.path import join
+import sys
+import shutil
+import tempfile
+
+import lxml.html.clean
+import pytest
+from Bio import SeqIO
+
+import metagenomics
+import tools
+import tools.kaiju
+import tools.picard
+import util.file
+import test.integration.fixtures
+
+
+def is_gz_file(filepath):
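+ # The first two bytes of any gzip stream are the magic number 0x1f 0x8b.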
+ with open(filepath, 'rb') as test_f:
+ return binascii.hexlify(test_f.read(2)) == b'1f8b'
+
+
+def find_files(root_dir, filt):
+ matches = []
+ for root, dirnames, filenames in os.walk(root_dir):
+ for filename in fnmatch.filter(filenames, filt):
+ yield join(root, filename)
+
+
+krona = pytest.fixture(scope='module')(test.integration.fixtures.krona)
+krona_db = pytest.fixture(scope='module')(test.integration.fixtures.krona_db)
+taxonomy_db = pytest.fixture(scope='module')(test.integration.fixtures.taxonomy_db)
+
+
+@pytest.fixture(scope='module')
+def fastq_to_sam():
+ return tools.picard.FastqToSamTool()
+
+
+@pytest.fixture(scope='module')
+def sam_to_fastq():
+ return tools.picard.SamToFastqTool()
+
+
+@pytest.fixture(scope='module')
+def kaiju():
+ kaiju = tools.kaiju.Kaiju()
+ kaiju.install()
+ return kaiju
+
+
+@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix'])
+def db_type(request):
+ return request.param
+
+
+def input_fastq_paths():
+ data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple')
+ return [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
+
+
+def input_bam_paths():
+ data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
+ return join(data_dir, 'test-reads.bam')
+
+
+@pytest.fixture(scope='module')
+def input_bam(request, tmpdir_module, fastq_to_sam, db_type):
+ data_dir = join(util.file.get_test_input_path(), db_type)
+ if db_type == 'TestMetagenomicsSimple':
+ fastqs = [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
+
+ bam_name = 'zaire_ebola.bam'
+ bam = join(tmpdir_module, bam_name)
+ fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam)
+ return bam
+
+ return input_bam_paths()
+
+
+@pytest.fixture(scope='module')
+def input_fastqs(request, tmpdir_module, sam_to_fastq, db_type):
+ data_dir = join(util.file.get_test_input_path(), db_type)
+ if db_type == 'TestMetagenomicsSimple':
+ fastqs = [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
+ return fastqs
+
+ bam = join(data_dir, 'test-reads.bam')
+ basename = os.path.basename(bam)
+ fastq1 = join(tmpdir_module, '{}.1.fastq'.format(basename))
+ fastq2 = join(tmpdir_module, '{}.2.fastq'.format(basename))
+ sam_to_fastq.execute(bam, fastq1, fastq2)
+ return fastq1, fastq2
+
+
+@pytest.fixture(scope='module')
+def kaiju_db(request, tmpdir_module, kaiju, taxonomy_db, db_type):
+ data_dir = join(util.file.get_test_input_path(), db_type)
+ db_dir = join(data_dir, 'db')
+
+ db_prefix = join(tmpdir_module, db_type)
+ translated = join(tmpdir_module, db_type + '.faa')
+
+ lib_dir = join(db_dir, 'library')
+ util.file.cat(translated, find_files(db_dir, '*.faa'))
+
+ kaiju.build(db_prefix, [translated], tax_db=taxonomy_db, translate_accessions=True)
+ return db_prefix + '.fmi'
+
+
+def test_kaiju(kaiju_db, krona_db, taxonomy_db, input_bam):
+ out_report = util.file.mkstempfname('.report')
+ out_reads = util.file.mkstempfname('.reads')
+ cmd = [input_bam, kaiju_db, taxonomy_db, out_report, '--outReads', out_reads]
+ parser = metagenomics.parser_kaiju(argparse.ArgumentParser())
+ args = parser.parse_args(cmd)
+ args.func_main(args)
+
+ assert os.path.getsize(out_report) > 0
+ assert os.path.getsize(out_reads) > 0
+
+ with util.file.open_or_gzopen(out_report) as inf:
+ report_lines = [x.strip().split('\t') for x in inf.readlines()]
+ report_lines = [x for x in report_lines if x]
+
+ assert is_gz_file(out_reads)
+ assert os.path.getsize(out_report) > 0
+
+ if 'TestMetagenomicsSimple' in kaiju_db:
+ zaire_found = False
+ tai_found = False
+ for line in report_lines:
+ if len(line) < 2:
+ continue
+ if 'Zaire ebolavirus' in line[-2] and float(line[0]) > 40:
+ zaire_found = True
+ elif 'Tai Forest' in line[-2]:
+ tai_found = True
+ assert zaire_found
+ assert not tai_found
+
+ out_html = util.file.mkstempfname('.krona.html')
+ parser = metagenomics.parser_krona(argparse.ArgumentParser())
+ args = parser.parse_args(['--inputType', 'kaiju', out_report, krona_db, out_html])
+ args.func_main(args)
+
+ if 'TestMetagenomicsSimple' in kaiju_db:
+ ebola_found = False
+ cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=False, page_structure=False)
+ tree = cleaner.clean_html(lxml.html.parse(out_html))
+ root_node = tree.xpath('//krona/node')[0]
+ for n in root_node.iterdescendants():
+ if n.get('name') == 'Zaire ebolavirus':
+ if int(n.xpath('magnitude/val')[0].text) > 0:
+ ebola_found = True
+ assert ebola_found
diff --git a/test/integration/test_kraken.py b/test/integration/test_kraken.py
deleted file mode 100644
index 72f5cbc87..000000000
--- a/test/integration/test_kraken.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Integration tests for kraken
-
-import argparse
-import os
-import os.path
-from os.path import join
-import sys
-import pytest
-import metagenomics
-import unittest
-import util.file
-import tools
-import tools.kraken
-import tools.krona
-import tools.picard
-
-@pytest.fixture()
-def input_bam(db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- return join(data_dir, 'test-reads.bam')
-
-
-@pytest.fixture(scope='module')
-def kraken():
- kraken = tools.kraken.Kraken()
- kraken.install()
- return kraken
-
-
-@pytest.fixture(scope='module')
-def krona():
- krona = tools.krona.Krona()
- krona.install()
- return krona
-
-
-@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix'])
-def db_type(request):
- return request.param
-
-
-@pytest.fixture(scope='module')
-def kraken_db(request, tmpdir_module, kraken, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- db_dir = join(data_dir, 'db')
-
- parser = metagenomics.parser_kraken(argparse.ArgumentParser())
-
- db = os.path.join(tmpdir_module, 'kraken_db_{}'.format(db_type))
- parser = metagenomics.parser_kraken_build(argparse.ArgumentParser())
- cmd = [db, '--library', join(db_dir, 'library'),
- '--taxonomy', join(db_dir, 'taxonomy'),
- '--subsetTaxonomy',
- '--minimizerLen', '10',
- '--clean']
-
- parser.parse_args(cmd)
- args = parser.parse_args(cmd)
- args.func_main(args)
- return db
-
-
-TAXONOMY_FILES = ('gi_taxid_nucl.dmp',
- 'gi_taxid_prot.dmp',
- 'names.dmp',
- 'nodes.dmp',
- 'merged.dmp')
-
-
-@pytest.fixture(scope='module')
-def krona_db(request, tmpdir_module, krona, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- db_dir = os.path.join(data_dir, 'db')
-
- db = os.path.join(tmpdir_module, 'krona_db_{}'.format(db_type))
- os.mkdir(db)
- for d in TAXONOMY_FILES:
- src = join(db_dir, 'taxonomy', d)
- dest = join(db, d)
- if os.path.isfile(src):
- os.symlink(src, dest)
- krona.create_db(db)
- return db
-
-def test_kraken(kraken_db, input_bam):
- out_report = util.file.mkstempfname('.report')
- out_reads = util.file.mkstempfname('.reads.gz')
- cmd = [kraken_db, input_bam, '--outReports', out_report, '--outReads', out_reads]
- parser = metagenomics.parser_kraken(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
-
- with util.file.open_or_gzopen(out_reads, 'r') as inf:
- assert len(inf.read()) > 0
- with util.file.open_or_gzopen(out_report) as inf:
- report_lines = [x.strip().split() for x in inf.readlines()]
-
- assert os.path.getsize(out_report) > 0
-
- '''
- # not sure what to do with this failing test for now that never seemed to work in the past
- if 'TestMetagenomicsSimple' in kraken_db:
- zaire_found = False
- tai_found = False
- for line in report_lines:
- if line[-1] == 'Zaire ebolavirus' and float(line[0]) > 90:
- zaire_found = True
- elif 'Tai Forest' in line[-1]:
- tai_found = True
- assert zaire_found
- assert not tai_found
- '''
-
-def test_kraken_multi(kraken_db):
- in_bams = list(os.path.join(util.file.get_test_input_path(), d, 'test-reads.bam') for d in ('TestMetagenomicsSimple', 'TestMetagenomicsViralMix'))
- out_reports = list(util.file.mkstempfname('.out_{}.report.txt'.format(i)) for i in (1,2))
- out_reads = list(util.file.mkstempfname('.out_{}.reads.txt.gz'.format(i)) for i in (1,2))
- cmd = [kraken_db] + in_bams \
- + ['--outReports'] + out_reports \
- + ['--outReads'] + out_reads
- parser = metagenomics.parser_kraken(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
-
- # just check for non-empty outputs
- for outfile in out_reads:
- with util.file.open_or_gzopen(outfile, 'r') as inf:
- assert len(inf.read()) > 0
- for outfile in out_reports:
- with util.file.open_or_gzopen(outfile) as inf:
- assert len(inf.read()) > 0
-
-@unittest.skip("this deadlocks currently...")
-def test_kraken_fifo(kraken_db):
- in_bams = list(os.path.join(util.file.get_test_input_path(), d, 'test-reads.bam') for d in ('TestMetagenomicsSimple', 'TestMetagenomicsViralMix'))
- out_reports = list(util.file.mkstempfname('.out_{}.report.txt'.format(i)) for i in (1,2))
- out_reads = list(util.file.mkstempfname('.out_{}.reads.txt.gz'.format(i)) for i in (1,2))
- with util.file.fifo(names=('inbam1.bam', 'inbam2.bam')) as (inbam1, inbam2):
- with open(inbam1, 'wb') as b1, open(inbam2, 'wb') as b2:
- p1 = subprocess.Popen(['cat', in_bams[0]], stdout=b1)
- p2 = subprocess.Popen(['cat', in_bams[1]], stdout=b2)
- cmd = [kraken_db, inbam1, inbam2] \
- + ['--outReports'] + out_reports \
- + ['--outReads'] + out_reads
- parser = metagenomics.parser_kraken(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
- print("waiting for kraken to drain fifo for first bam file")
- p1.wait()
- print("waiting for kraken to drain fifo for second bam file")
- p2.wait()
-
- # just check for non-empty outputs
- for outfile in out_reads:
- with util.file.open_or_gzopen(outfile, 'r') as inf:
- assert len(inf.read()) > 0
- for outfile in out_reports:
- with util.file.open_or_gzopen(outfile) as inf:
- assert len(inf.read()) > 0
-
-def test_kraken_krona(kraken_db, krona_db, input_bam):
- out_report = util.file.mkstempfname('.report')
- out_reads = util.file.mkstempfname('.reads.gz')
-
- cmd = [kraken_db, input_bam, '--outReport', out_report, '--outReads', out_reads]
- parser = metagenomics.parser_kraken(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
-
- out_html = util.file.mkstempfname('.krona.html')
- parser = metagenomics.parser_krona(argparse.ArgumentParser())
- args = parser.parse_args([out_reads, krona_db, out_html])
- args.func_main(args)
-
-def test_kraken_on_empty(kraken_db, input_bam):
- if 'TestMetagenomicsViralMix' not in kraken_db:
- return
- input_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
- out_report = util.file.mkstempfname('.report')
- out_reads = util.file.mkstempfname('.reads.gz')
- cmd = [kraken_db, input_bam, '--outReport', out_report, '--outReads', out_reads]
- parser = metagenomics.parser_kraken(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
-
- with util.file.open_or_gzopen(out_reads, 'r') as inf:
- assert len(inf.read()) == 0
- with open(out_report, 'rt') as inf:
- out_report_contents = inf.readlines()
- assert len(out_report_contents) == 1
- out_report_contents = out_report_contents[0].rstrip('\n').split('\t')
- assert out_report_contents == ['100.00', '0', '0', 'U', '0', 'unclassified']
diff --git a/test/integration/test_krakenuniq.py b/test/integration/test_krakenuniq.py
new file mode 100644
index 000000000..b4b57cb36
--- /dev/null
+++ b/test/integration/test_krakenuniq.py
@@ -0,0 +1,141 @@
+# Integration tests for krakenuniq
+import argparse
+import binascii
+import os
+import os.path
+from os.path import join
+
+import lxml.html.clean
+import pytest
+
+import metagenomics
+import unittest
+import util.file
+import tools
+import tools.kraken
+import tools.krona
+import tools.picard
+import test.integration.fixtures
+
+
+def is_gz_file(filepath):
+ with open(filepath, 'rb') as test_f:
+ return binascii.hexlify(test_f.read(2)) == b'1f8b'
+
+
+krona = pytest.fixture(scope='module')(test.integration.fixtures.krona)
+krona_db = pytest.fixture(scope='module')(test.integration.fixtures.krona_db)
+
+
+@pytest.fixture()
+def input_bam(db_type):
+ data_dir = join(util.file.get_test_input_path(), db_type)
+ return join(data_dir, 'test-reads.bam')
+
+
+@pytest.fixture(scope='module')
+def krakenuniq():
+ krakenuniq = tools.kraken.KrakenUniq()
+ krakenuniq.install()
+ return krakenuniq
+
+
+@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix'])
+def db_type(request):
+ return request.param
+
+
+@pytest.fixture(scope='module')
+def krakenuniq_db(request, tmpdir_module, krakenuniq, db_type):
+ data_dir = join(util.file.get_test_input_path(), db_type)
+ db_dir = join(data_dir, 'db')
+
+ db = join(tmpdir_module, 'krakenuniq_db_{}'.format(db_type))
+ parser = metagenomics.parser_krakenuniq_build(argparse.ArgumentParser())
+ cmd = [db, '--library', join(db_dir, 'library'),
+ '--taxonomy', join(db_dir, 'taxonomy'),
+ '--subsetTaxonomy',
+ '--minimizerLen', '10',
+ '--clean']
+
+ args = parser.parse_args(cmd)
+ args.func_main(args)
+ return db
+
+
+def test_krakenuniq(krakenuniq_db, input_bam):
+ out_report = util.file.mkstempfname('.report')
+ out_reads = util.file.mkstempfname('.reads.gz')
+ cmd = [krakenuniq_db, input_bam, '--outReports', out_report, '--outReads', out_reads]
+ parser = metagenomics.parser_krakenuniq(argparse.ArgumentParser())
+ args = parser.parse_args(cmd)
+ args.func_main(args)
+
+ with util.file.open_or_gzopen(out_reads, 'r') as inf:
+ assert len(inf.read()) > 0
+ with util.file.open_or_gzopen(out_report) as inf:
+ report_lines = [x.strip().split('\t') for x in inf.readlines()]
+ report_lines = [x for x in report_lines if x]
+
+ assert is_gz_file(out_reads)
+ assert os.path.getsize(out_report) > 0
+
+ if 'TestMetagenomicsSimple' in krakenuniq_db:
+ zaire_found = False
+ tai_found = False
+ for line in report_lines:
+ if 'Zaire ebolavirus' in line[-1] and float(line[0]) > 90:
+ zaire_found = True
+ elif 'Tai Forest' in line[-1]:
+ tai_found = True
+ assert zaire_found
+ assert not tai_found
+
+
+def test_krakenuniq_krona(krakenuniq_db, krona_db, input_bam):
+ out_report = util.file.mkstempfname('.report')
+ out_reads = util.file.mkstempfname('.reads.gz')
+
+ cmd = [krakenuniq_db, input_bam, '--outReport', out_report, '--outReads', out_reads]
+ parser = metagenomics.parser_krakenuniq(argparse.ArgumentParser())
+ args = parser.parse_args(cmd)
+ args.func_main(args)
+
+ out_html = util.file.mkstempfname('.krona.html')
+ parser = metagenomics.parser_krona(argparse.ArgumentParser())
+ args = parser.parse_args(['--inputType', 'krakenuniq', out_report, krona_db, out_html])
+ args.func_main(args)
+
+ if 'TestMetagenomicsSimple' in krakenuniq_db:
+ ebola_found = False
+ cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=False, page_structure=False)
+ tree = cleaner.clean_html(lxml.html.parse(out_html))
+ root_node = tree.xpath('//krona/node')[0]
+ for n in root_node.iterdescendants():
+ if n.get('name') == 'Zaire ebolavirus':
+ if int(n.xpath('magnitude/val')[0].text) > 0:
+ ebola_found = True
+ assert ebola_found
+
+
+def test_krakenuniq_on_empty(krakenuniq_db, input_bam):
+ if 'TestMetagenomicsViralMix' not in krakenuniq_db:
+ return
+ input_bam = join(util.file.get_test_input_path(), 'empty.bam')
+ out_report = util.file.mkstempfname('.report')
+ out_reads = util.file.mkstempfname('.reads.gz')
+ cmd = [krakenuniq_db, input_bam, '--outReport', out_report, '--outReads', out_reads]
+ parser = metagenomics.parser_krakenuniq(argparse.ArgumentParser())
+ args = parser.parse_args(cmd)
+ args.func_main(args)
+
+ with util.file.open_or_gzopen(out_reads, 'r') as inf:
+ assert len(inf.read()) == 0
+
+ assert is_gz_file(out_reads)
+ with open(out_report, 'rt') as inf:
+ lines = [line.strip() for line in inf.readlines() if not line.startswith('#') and not line.startswith('%')]
+ out_report_contents = [line for line in lines if line]
+ assert len(out_report_contents) == 1
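+ # KrakenUniq report columns: %, reads, taxReads, kmers, dup, cov, taxID, rank, taxName.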
+ assert out_report_contents[0].split('\t') == ['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified']
diff --git a/test/integration/test_metagenomics_align.py b/test/integration/test_metagenomics_align.py
deleted file mode 100644
index 264f5613e..000000000
--- a/test/integration/test_metagenomics_align.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Integration tests for metagenomics direct alignment
-
-from builtins import super
-import argparse
-import fnmatch
-from os import listdir
-import os.path
-from os.path import join
-import sys
-import tempfile
-
-import pytest
-from Bio import SeqIO
-
-import metagenomics
-import util.file
-import tools
-import tools.bwa
-import tools.krona
-import tools.picard
-
-def find_files(root_dir, filt):
- matches = []
- for root, dirnames, filenames in os.walk(root_dir):
- for filename in fnmatch.filter(filenames, filt):
- yield join(root, filename)
-
-
-@pytest.fixture(scope='module')
-def fastq_to_sam():
- return tools.picard.FastqToSamTool()
-
-
-@pytest.fixture(scope='module')
-def taxonomy_db(request, tmpdir_factory, db_type):
- return join(util.file.get_test_input_path(), db_type, 'db', 'taxonomy')
-
-
-@pytest.fixture(scope='module')
-def input_bam(request, tmpdir_module, fastq_to_sam, db_type):
- data_dir = join(util.file.get_test_input_path(), db_type)
- if db_type == 'TestMetagenomicsSimple':
- fastqs = [os.path.join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']]
-
- bam_name = 'zaire_ebola.bam'
- bam = os.path.join(tmpdir_module, bam_name)
- fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam)
- return bam
-
- data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix')
- return join(data_dir, 'test-reads.bam')
-
-
-@pytest.fixture(scope='module')
-def bwa():
- bwa = tools.bwa.Bwa()
- bwa.install()
- return bwa
-
-
-# @pytest.fixture(scope='session', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix'])
-@pytest.fixture(scope='module', params=['TestMetagenomicsSimple'])
-def db_type(request):
- return request.param
-
-
-FNA_TAXIDS = {
- 'GCF_000889155.1_ViralProj51245_genomic.fna': '565995', # Bundibugyo_ebolavirus
- 'GCF_000854085.1_ViralProj15006_genomic.fna': '186539', # Reston_ebolavirus
- 'GCF_000855585.1_ViralProj15012_genomic.fna': '186540', # Sudan_ebolavirus
- 'GCF_000888475.1_ViralProj51257_genomic.fna': '186541', # Tai_Forest_ebolavirus
- 'GCF_000848505.1_ViralProj14703_genomic.fna': '186538', # Zaire_ebolavirus
-}
-
-
-@pytest.fixture(scope='module')
-def bwa_db(request, tmpdir_module, bwa, db_type):
-
- data_dir = join(util.file.get_test_input_path(), db_type)
- db_dir = join(data_dir, 'db')
-
- index_fa = os.path.join(tmpdir_module, db_type + '.bwa_index.fa')
- db = os.path.join(tmpdir_module, db_type + '')
-
- with open(index_fa, "w") as f_out:
- for fname in find_files(join(db_dir, 'library'), '*.fna'):
- with open(fname) as f:
- for seq_record in SeqIO.parse(f, 'fasta'):
- seq_id = seq_record.id
- try:
- tax_id = FNA_TAXIDS[os.path.basename(fname)]
- except KeyError:
- continue
- seq_record.id = 'taxid|{}|{}'.format(tax_id, seq_id)
- SeqIO.write(seq_record, f_out, 'fasta')
-
- bwa.index(index_fa, output=db)
- return db
-
-
-def test_meta_bwa(bwa_db, taxonomy_db, input_bam):
- out_report = util.file.mkstempfname('.report')
- dupe_report = util.file.mkstempfname('.dupes.report')
- out_bam = util.file.mkstempfname('.output.bam')
- cmd = [input_bam, bwa_db, taxonomy_db, out_report, '--dupeReport', dupe_report, '--outBam', out_bam]
- parser = metagenomics.parser_align_rna_metagenomics(argparse.ArgumentParser())
- args = parser.parse_args(cmd)
- args.func_main(args)
-
- assert os.path.getsize(out_report) > 0
- assert os.path.getsize(dupe_report) > 0
- assert os.path.getsize(out_bam) > 0
diff --git a/test/pipelines/snakemake/test_diamond.py b/test/pipelines/snakemake/test_kaiju.py
similarity index 69%
rename from test/pipelines/snakemake/test_diamond.py
rename to test/pipelines/snakemake/test_kaiju.py
index 709abcc70..a689d4ca4 100755
--- a/test/pipelines/snakemake/test_diamond.py
+++ b/test/pipelines/snakemake/test_kaiju.py
@@ -7,14 +7,13 @@
import tools
from test.pipelines.snakemake import snake
-from test.integration.test_diamond import * # for pytest fixtures
+from test.integration.test_kaiju import * # for pytest fixtures
-@pytest.mark.skipif(tools.is_osx(), reason="not currently tested under OSX")
@pytest.mark.skipif(sys.version_info < (3, 5), reason="Python version is too old for snakemake.")
-def test_pipes(tmpdir_function, diamond_db, taxonomy_db, krona_db, input_bam):
+def test_pipes(tmpdir_function, kaiju_db, taxonomy_db, krona_db, input_bam):
runner = snake.SnakemakeRunner(workdir=tmpdir_function)
override_config = {
- 'diamond_db': diamond_db,
+ 'kaiju_db': kaiju_db,
'taxonomy_db': taxonomy_db,
'krona_db': krona_db,
}
@@ -24,12 +23,12 @@ def test_pipes(tmpdir_function, diamond_db, taxonomy_db, krona_db, input_bam):
runner.create_sample_files(sample_files=['samples_metagenomics'])
krona_out = join(runner.config['data_dir'], runner.config['subdirs']['metagenomics'],
- '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'diamond.krona.html']))
+ '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kaiju.krona.html']))
- diamond_out = join(
+ kaiju_out = join(
runner.config['data_dir'], runner.config['subdirs']['metagenomics'],
- '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'diamond.report'])
+ '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kaiju.report'])
)
runner.run([krona_out])
- assert os.path.getsize(os.path.join(runner.workdir, diamond_out)) > 0
- assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0
\ No newline at end of file
+ assert os.path.getsize(os.path.join(runner.workdir, kaiju_out)) > 0
+ assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0
diff --git a/test/pipelines/snakemake/test_kraken.py b/test/pipelines/snakemake/test_krakenuniq.py
similarity index 69%
rename from test/pipelines/snakemake/test_kraken.py
rename to test/pipelines/snakemake/test_krakenuniq.py
index ac2c446cd..063ddc9d0 100755
--- a/test/pipelines/snakemake/test_kraken.py
+++ b/test/pipelines/snakemake/test_krakenuniq.py
@@ -6,13 +6,13 @@
import pytest
from test.pipelines.snakemake import snake
-from test.integration.test_kraken import * # for pytest fixtures
+from test.integration.test_krakenuniq import * # for pytest fixtures
@pytest.mark.skipif(sys.version_info < (3, 5), reason="Python version is too old for snakemake.")
-def test_pipes(tmpdir_function, kraken_db, krona_db, input_bam):
+def test_pipes(tmpdir_function, krakenuniq_db, krona_db, input_bam):
runner = snake.SnakemakeRunner(workdir=tmpdir_function)
override_config = {
- 'kraken_db': kraken_db,
+ 'krakenuniq_db': krakenuniq_db,
'krona_db': krona_db,
}
runner.set_override_config(override_config)
@@ -20,17 +20,16 @@ def test_pipes(tmpdir_function, kraken_db, krona_db, input_bam):
runner.link_samples([input_bam], destination='per_sample', link_transform=snake.rename_raw_bam)
runner.create_sample_files(sample_files=['samples_metagenomics'])
- kraken_out = join(
+ krakenuniq_out = join(
runner.config['data_dir'], runner.config['subdirs']['metagenomics'],
- '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kraken.report'])
+ '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'krakenuniq.report'])
)
krona_out = join(
runner.config['data_dir'], runner.config['subdirs']['metagenomics'],
- '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kraken.krona.html'])
+ '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'krakenuniq.krona.html'])
)
- # runner.run(['all_metagenomics'])
- runner.run([kraken_out, krona_out])
- assert os.path.getsize(os.path.join(runner.workdir, kraken_out)) > 0
- assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0
\ No newline at end of file
+ runner.run([krakenuniq_out, krona_out])
+ assert os.path.getsize(os.path.join(runner.workdir, krakenuniq_out)) > 0
+ assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0
diff --git a/test/pipelines/snakemake/test_metagenomics_align.py b/test/pipelines/snakemake/test_metagenomics_align.py
deleted file mode 100755
index fc8fd0602..000000000
--- a/test/pipelines/snakemake/test_metagenomics_align.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-
-import pytest
-
-from test.pipelines.snakemake import snake
-from test.integration.test_metagenomics_align import * # for pytest fixtures
-
-@pytest.mark.skipif(sys.version_info < (3, 5), reason="Python version is too old for snakemake.")
-def test_pipes(tmpdir_function, bwa_db, taxonomy_db, input_bam):
- runner = snake.SnakemakeRunner(workdir=tmpdir_function)
- override_config = {
- 'align_rna_db': bwa_db,
- 'taxonomy_db': taxonomy_db,
- }
- runner.set_override_config(override_config)
- runner.setup()
- runner.link_samples([input_bam], destination='per_sample', link_transform=snake.rename_raw_bam)
- runner.create_sample_files(sample_files=['samples_metagenomics'])
-
- report_out = join(
- runner.config['data_dir'], runner.config['subdirs']['metagenomics'],
- '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw.rna_bwa.report'])
- )
-
- bam_out = join(
- runner.config['data_dir'], runner.config['subdirs']['metagenomics'],
- '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw.rna_bwa.bam'])
- )
-
- runner.run([report_out])
- assert os.path.getsize(os.path.join(runner.workdir, report_out)) > 0
- assert os.path.getsize(os.path.join(runner.workdir, bam_out)) > 0
\ No newline at end of file
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index 486043aac..66f981268 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -34,61 +34,6 @@ def test_help_parser_for_each_command(self):
helpstring = parser.format_help()
-@patch('metagenomics.kraken_dfs_report')
-class TestDiamondCalls(TestCaseWithTmp):
- def setUp(self):
- super().setUp()
- patcher = patch('subprocess.Popen')
- self.addCleanup(patcher.stop)
- self.mock_popen = patcher.start()
- self.mock_popen.return_value.returncode = 0
-
- patcher = patch('tools.picard.SamToFastqTool')
- self.addCleanup(patcher.stop)
- self.mock_s2f = patcher.start()
- self.mock_s2f.return_value.execute.return_value.returncode = 0
-
- patcher = patch('tools.diamond.Diamond', autospec=True)
- self.addCleanup(patcher.stop)
- self.mock_diamond = patcher.start()
-
- # Can't open unwritten named pipes
- if six.PY2:
- patcher = patch('__builtin__.open', mock.mock_open(read_data="id1\t1\n"))
- else:
- patcher = patch('builtins.open', mock.mock_open(read_data="id1\t1\n"))
- self.addCleanup(patcher.stop)
- patcher.start()
-
- # mock_open doesn't have __next__ for csv.reader
- patcher = patch('metagenomics.taxa_hits_from_tsv', autospec=True)
- self.addCleanup(patcher.stop)
- self.mock_taxa_hits = patcher.start()
- self.mock_taxa_hits.return_value = Counter({1: 100, 2: 40})
-
- self.inBam = util.file.mkstempfname('.bam')
- self.db = tempfile.mkdtemp('db')
- self.tax_db = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple', 'db', 'taxonomy')
-
- def test_output_reads(self, mock_dfs):
- out_report = util.file.mkstempfname('report.txt')
- out_reads = util.file.mkstempfname('lca.gz')
-
- metagenomics.diamond(self.inBam, self.db, self.tax_db, out_report, outReads=out_reads)
-
- cmd = self.mock_popen.call_args[0][0]
- self.assertIn(out_reads, cmd)
- assert isinstance(metagenomics.kraken_dfs_report.call_args[0][0], metagenomics.TaxonomyDb)
-
- def test_num_threads(self, mock_dfs):
- out_report = util.file.mkstempfname('report.txt')
- metagenomics.diamond(self.inBam, self.db, self.tax_db, out_report, threads=11)
- expected_threads = min(11, _CPUS)
- expected_threads = '--threads {}'.format(expected_threads)
- cmd = self.mock_popen.call_args[0][0]
- self.assertIn(expected_threads, cmd)
-
-
class TestKronaCalls(TestCaseWithTmp):
def setUp(self):
@@ -103,7 +48,7 @@ def setUp(self):
def test_krona_import_taxonomy(self):
out_html = util.file.mkstempfname('.html')
metagenomics.krona(self.inTsv, self.db, out_html, queryColumn=3, taxidColumn=5, scoreColumn=7,
- noHits=True, noRank=True)
+ noHits=True, noRank=True, inputType='tsv')
self.mock_krona().import_taxonomy.assert_called_once_with(
self.db, [self.inTsv], out_html, query_column=3, taxid_column=5, score_column=7,
no_hits=True, no_rank=True, magnitude_column=None, root_name=os.path.basename(self.inTsv))
@@ -310,12 +255,40 @@ def test_coverage_lca(taxa_db):
assert metagenomics.coverage_lca([10, 11, 12], taxa_db.parents, 50) == 7
assert metagenomics.coverage_lca([9], taxa_db.parents) is None
+
+def test_krakenuniq(mocker):
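+ # Smoke test: the krakenuniq command should pass its arguments straight through to KrakenUniq.pipeline.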
+ p = mocker.patch('tools.kraken.KrakenUniq.pipeline')
+ args = [
+ 'db',
+ 'input.bam',
+ '--outReports', 'output.report',
+ '--outReads', 'output.reads',
+ ]
+ args = metagenomics.parser_krakenuniq(argparse.ArgumentParser()).parse_args(args)
+ args.func_main(args)
+ p.assert_called_with('db', ['input.bam'], num_threads=mock.ANY, filter_threshold=mock.ANY, out_reports=['output.report'], out_reads=['output.reads'])
+
+
+def test_kaiju(mocker):
+ p = mocker.patch('tools.kaiju.Kaiju.classify')
+ args = [
+ 'input.bam',
+ 'db.fmi',
+ 'tax_db',
+ 'output.report',
+ '--outReads', 'output.reads',
+ ]
+ args = metagenomics.parser_kaiju(argparse.ArgumentParser()).parse_args(args)
+ args.func_main(args)
+ p.assert_called_with('db.fmi', 'tax_db', 'input.bam', output_report='output.report', num_threads=mock.ANY, output_reads='output.reads')
+
+
class TestBamFilter(TestCaseWithTmp):
def test_bam_filter_simple(self):
temp_dir = tempfile.gettempdir()
input_dir = util.file.get_test_input_path(self)
taxonomy_dir = os.path.join(util.file.get_test_input_path(),"TestMetagenomicsSimple","db","taxonomy")
-
+
filtered_bam = util.file.mkstempfname('.bam')
args = [
os.path.join(input_dir,"input.bam"),
@@ -336,7 +309,7 @@ def test_bam_filter_by_tax_id(self):
temp_dir = tempfile.gettempdir()
input_dir = util.file.get_test_input_path(self)
taxonomy_dir = os.path.join(util.file.get_test_input_path(),"TestMetagenomicsSimple","db","taxonomy")
-
+
filtered_bam = util.file.mkstempfname('.bam')
args = [
os.path.join(input_dir,"input.bam"),
@@ -352,4 +325,3 @@ def test_bam_filter_by_tax_id(self):
expected_bam = os.path.join(input_dir,"expected.bam")
assert_equal_bam_reads(self, filtered_bam, expected_bam)
-
diff --git a/test/unit/test_tools_kraken.py b/test/unit/test_tools_kraken.py
index 9f40692a8..19045c7d7 100644
--- a/test/unit/test_tools_kraken.py
+++ b/test/unit/test_tools_kraken.py
@@ -10,8 +10,8 @@
@pytest.fixture
-def kraken():
- return tools.kraken.Kraken()
+def krakenuniq():
+ return tools.kraken.KrakenUniq()
@pytest.fixture
@@ -26,68 +26,38 @@ def db(tmpdir_factory):
@pytest.fixture(autouse=True)
def mocks(mocker):
-
mock_run = mocker.patch('util.misc.run', autospec=True)
mock_check_call = mocker.patch('subprocess.check_call', autospec=True)
-
- mock_conda = mocker.patch('tools.CondaPackage', autospec=True)
- mock_conda.return_value.verify_install.return_value = "mock"
- mock_conda.return_value.is_attempted.return_value = True
- mock_conda.return_value.is_installed.return_value = True
- mock_conda.return_value.require_executability = False
- mock_conda.return_value.executable_path.return_value = "/dev/null"
return {
- 'conda': mock_conda,
'run': mock_run,
'check_call': mock_check_call,
}
-def test_kraken_classify(mocks, kraken, db, in_bam):
+def test_krakenuniq_classify(mocks, krakenuniq, db, in_bam):
out_reads = util.file.mkstempfname('.reads.txt')
- kraken.classify(in_bam, db, out_reads)
+ krakenuniq.classify(in_bam, db, out_reads)
args = mocks['check_call'].call_args[0][0]
- assert 'kraken' == os.path.basename(args[0])
+ assert 'krakenuniq' == os.path.basename(args[0])
assert util.misc.list_contains(['--db', db], args)
assert util.misc.list_contains(['--output', out_reads], args)
assert util.misc.list_contains(['--threads', str(_CPUS)], args)
-
-def test_kraken_filter(mocks, kraken, db):
- in_reads = util.file.mkstempfname('.kraken_reads.unfilt.txt')
- out_reads = util.file.mkstempfname('.kraken_reads.filt.txt')
- for thresh in (0.05, 0.3, 0.81):
- kraken.filter(in_reads, db, out_reads, thresh)
- args = mocks['run'].call_args[0][0]
- assert 'kraken-filter' == os.path.basename(args[0])
- assert in_reads in args
- assert util.misc.list_contains(['--db', db], args)
- assert util.misc.list_contains(['--threshold', str(thresh)], args)
-
-def test_kraken_report(mocks, kraken, db):
- in_reads = util.file.mkstempfname('.kraken_reads.txt')
- out_report = util.file.mkstempfname('.kraken_report.txt')
- kraken.report(in_reads, db, out_report)
- args = mocks['run'].call_args[0][0]
- assert 'kraken-report' == os.path.basename(args[0])
- assert in_reads in args
- assert util.misc.list_contains(['--db', db], args)
-
-def test_classify_num_threads(mocks, kraken, db, in_bam):
+def test_classify_num_threads(mocks, krakenuniq, db, in_bam):
out_reads = util.file.mkstempfname('.reads.txt')
- kraken.classify(in_bam, db, out_reads)
+ krakenuniq.classify(in_bam, db, out_reads)
args = mocks['check_call'].call_args[0][0]
- assert 'kraken' == os.path.basename(args[0])
+ assert 'krakenuniq' == os.path.basename(args[0])
assert '--threads' in args
actual = args[args.index('--threads')+1]
assert actual == str(_CPUS)
for requested in (1,2,3,8,11,20):
expected = min(_CPUS, requested)
- kraken.classify(in_bam, db, out_reads, numThreads=requested)
+ krakenuniq.classify(in_bam, db, out_reads, num_threads=requested)
args = mocks['check_call'].call_args[0][0]
- assert 'kraken' == os.path.basename(args[0])
+ assert 'krakenuniq' == os.path.basename(args[0])
assert '--threads' in args
actual = args[args.index('--threads')+1]
assert actual == str(expected), "failure for requested %s, expected %s, actual %s" % (requested, expected, actual)
diff --git a/test/unit/test_tools_krakenuniq.py b/test/unit/test_tools_krakenuniq.py
new file mode 100644
index 000000000..2c35eaf38
--- /dev/null
+++ b/test/unit/test_tools_krakenuniq.py
@@ -0,0 +1,93 @@
+# Unit tests for the classic kraken wrapper (kept alongside the new krakenuniq unit tests)
+import os.path
+
+import pytest
+
+import util.file
+import util.misc
+import tools.kraken
+from test import _CPUS
+
+
+@pytest.fixture
+def kraken():
+ return tools.kraken.Kraken()
+
+
+@pytest.fixture
+def in_bam():
+ return os.path.join(util.file.get_test_input_path(), 'almost-empty.bam')
+
+
+@pytest.fixture
+def db(tmpdir_factory):
+ return str(tmpdir_factory.mktemp('db'))
+
+
+@pytest.fixture(autouse=True)
+def mocks(mocker):
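+ # Stub out subprocess and conda-install machinery so no real kraken binary is needed.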
+
+ mock_run = mocker.patch('util.misc.run', autospec=True)
+ mock_check_call = mocker.patch('subprocess.check_call', autospec=True)
+
+ mock_conda = mocker.patch('tools.CondaPackage', autospec=True)
+ mock_conda.return_value.verify_install.return_value = "mock"
+ mock_conda.return_value.is_attempted.return_value = True
+ mock_conda.return_value.is_installed.return_value = True
+ mock_conda.return_value.require_executability = False
+ mock_conda.return_value.executable_path.return_value = "/dev/null"
+ return {
+ 'conda': mock_conda,
+ 'run': mock_run,
+ 'check_call': mock_check_call,
+ }
+
+
+def test_kraken_classify(mocks, kraken, db, in_bam):
+ out_reads = util.file.mkstempfname('.reads.txt')
+ kraken.classify(in_bam, db, out_reads)
+ args = mocks['check_call'].call_args[0][0]
+ assert 'kraken' == os.path.basename(args[0])
+ assert util.misc.list_contains(['--db', db], args)
+ assert util.misc.list_contains(['--output', out_reads], args)
+ assert util.misc.list_contains(['--threads', str(_CPUS)], args)
+
+
+def test_kraken_filter(mocks, kraken, db):
+ in_reads = util.file.mkstempfname('.kraken_reads.unfilt.txt')
+ out_reads = util.file.mkstempfname('.kraken_reads.filt.txt')
+ for thresh in (0.05, 0.3, 0.81):
+ kraken.filter(in_reads, db, out_reads, thresh)
+ args = mocks['run'].call_args[0][0]
+ assert 'kraken-filter' == os.path.basename(args[0])
+ assert in_reads in args
+ assert util.misc.list_contains(['--db', db], args)
+ assert util.misc.list_contains(['--threshold', str(thresh)], args)
+
+def test_kraken_report(mocks, kraken, db):
+ in_reads = util.file.mkstempfname('.kraken_reads.txt')
+ out_report = util.file.mkstempfname('.kraken_report.txt')
+ kraken.report(in_reads, db, out_report)
+ args = mocks['run'].call_args[0][0]
+ assert 'kraken-report' == os.path.basename(args[0])
+ assert in_reads in args
+ assert util.misc.list_contains(['--db', db], args)
+
+def test_classify_num_threads(mocks, kraken, db, in_bam):
+ out_reads = util.file.mkstempfname('.reads.txt')
+
+ kraken.classify(in_bam, db, out_reads)
+ args = mocks['check_call'].call_args[0][0]
+ assert 'kraken' == os.path.basename(args[0])
+ assert '--threads' in args
+ actual = args[args.index('--threads')+1]
+ assert actual == str(_CPUS)
+
+ for requested in (1,2,3,8,11,20):
+ expected = min(_CPUS, requested)
+ kraken.classify(in_bam, db, out_reads, numThreads=requested)
+ args = mocks['check_call'].call_args[0][0]
+ assert 'kraken' == os.path.basename(args[0])
+ assert '--threads' in args
+ actual = args[args.index('--threads')+1]
+ assert actual == str(expected), "failure for requested %s, expected %s, actual %s" % (requested, expected, actual)
diff --git a/tools/kaiju.py b/tools/kaiju.py
new file mode 100644
index 000000000..c44230aba
--- /dev/null
+++ b/tools/kaiju.py
@@ -0,0 +1,185 @@
+'''
+Kaiju - DNA-to-protein metagenomic classifier
+'''
+from builtins import super
+import collections
+import itertools
+import logging
+import os
+import os.path
+import shlex
+import shutil
+import subprocess
+import tools
+import tools.picard
+
+from Bio import SeqIO
+
+import util.file
+import util.misc
+
+TOOL_VERSION = '1.6.3_yesimon'
+
+log = logging.getLogger(__name__)
+
+
+def read_a2t(fn, base_accession=True):
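+ '''Load an NCBI accession2taxid mapping file into a dict of accession -> taxid.
+
+ Columns in these files are accession, accession.version, taxid, gi; base_accession
+ selects whether the unversioned (column 0) or versioned (column 1) accession is the key.
+ '''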
+ if base_accession:
+ accession_col = 0
+ else:
+ accession_col = 1
+ d = {}
+ with open(fn) as f:
+ f.readline() # Cannot use next(f) in python2
+ for line in f.readlines():
+ parts = line.split('\t')
+ taxid = int(parts[2])
+ accession = parts[accession_col]
+ d[accession] = taxid
+ return d
+
+
+class Kaiju(tools.Tool):
+
+ def __init__(self, install_methods=None):
+ if not install_methods:
+ install_methods = [
+ tools.CondaPackage("kaiju", version=TOOL_VERSION)
+ ]
+ super(Kaiju, self).__init__(install_methods=install_methods)
+
+ def version(self):
+ return TOOL_VERSION
+
+ def build(self, db_prefix, protein_fastas, threads=None, options=None, option_string=None,
+ tax_db=None, translate_accessions=False):
+ '''Create a kaiju database.
+
+ Args:
+ db_prefix: Kaiju database prefix (file path not containing extension) to create.
+ protein_fastas: List of input fasta files to process.
+ tax_db: Contains accession2taxid used if translate_accessions is True.
+ translate_accessions: If fasta IDs are bare accessions, translate them to
+ the <accession>_<taxid> header format kaiju expects.
+ '''
+ assert len(protein_fastas), ('Kaiju requires input files to create a database.')
+ options = options or {}
+ options['-a'] = 'protein'
+ options['-n'] = util.misc.sanitize_thread_count(threads)
+ options['-o'] = db_prefix
+
+ temp_file = util.file.temp_catted_files(protein_fastas, prefix='kaiju_', suffix='.faa')
+
+ db_fasta_fn = util.file.tempfname()
+ with db_fasta_fn as db_fasta_fn:
+ if translate_accessions:
+ with temp_file as input_fasta:
+ with open(db_fasta_fn, 'w') as db_fasta:
+ prot2taxid_fn = os.path.join(tax_db, 'accession2taxid', 'prot.accession2taxid')
+ prot2taxid = read_a2t(prot2taxid_fn)
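+ # Rewrite each record ID to the <id>_<taxid> form so kaiju can assign reads to taxa.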
+ for record in SeqIO.parse(input_fasta, 'fasta'):
+ accession = record.id.split('.', 1)[0]
+ taxid = prot2taxid.get(accession)
+ if taxid is not None:
+ new_id = '_'.join([record.id, str(taxid)])
+ record.id = new_id
+ record.description = new_id
+ SeqIO.write(record, db_fasta, 'fasta')
+ option_string = db_fasta_fn
+ else:
+ # Assumption: with no accession translation requested, the catted input fasta is indexed as-is.
+ with temp_file as input_fasta:
+ option_string = input_fasta
+
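+ # kaiju builds its database in two steps: mkbwt constructs the BWT from the
+ # protein fasta, then mkfmi converts it into the .fmi FM-index used by classify.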
+ self.execute('mkbwt', options=options, option_string=option_string)
+ self.execute('mkfmi', option_string=db_prefix)
+
+ def classify(self, db, tax_db, in_bam, output_reads=None, output_report=None, rank='species', verbose=None, num_threads=None):
+ assert output_reads or output_report
+ output_ctx = util.file.tempfname()
+ with util.file.tempfname() as temp_reads:
+ # Fake loop to break out early if report is not wanted
+ while True:
+ tmp_fastq1 = util.file.mkstempfname('_1.fastq')
+ tmp_fastq2 = util.file.mkstempfname('_2.fastq')
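+ # Convert the unaligned BAM to paired FASTQs, clipping marked Illumina adapters.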
+ picard = tools.picard.SamToFastqTool()
+ picard_opts = {
+ 'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
+ 'CLIPPING_ACTION': 'X'
+ }
+ picard.execute(in_bam, tmp_fastq1, tmp_fastq2,
+ picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
+ JVMmemory=picard.jvmMemDefault)
+
+ nodes_dmp = os.path.join(tax_db, 'nodes.dmp')
+ names_dmp = os.path.join(tax_db, 'names.dmp')
+
+ opts = {
+ '-t': nodes_dmp,
+ '-f': db,
+ '-o': temp_reads,
+ '-z': util.misc.sanitize_thread_count(num_threads),
+ '-i': tmp_fastq1
+ }
+ if verbose:
+ opts['-v'] = None
+
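+ # Only pass the second mate file if the BAM was actually paired (fastq2 is non-trivial).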
+ if os.path.getsize(tmp_fastq2) >= 50:
+ opts['-j'] = tmp_fastq2
+
+ self.execute('kaiju', options=opts)
+ if not output_report:
+ break
+
+ opts = {
+ '-r': rank,
+ '-p': True,
+ '-t': nodes_dmp,
+ '-n': names_dmp,
+ '-i': temp_reads,
+ '-o': output_report,
+ '-w': None,
+ '-x': None
+ }
+ if verbose:
+ opts['-v'] = None
+ self.execute('kaijuReport', options=opts)
+ break
+ if output_reads:
+ subprocess.check_call(['pigz', '-9', temp_reads])
+ shutil.move(temp_reads + '.gz', output_reads)
+
+ def execute(self, command, options=None, option_string=None, return_stdout=False):
+ '''Run a kaiju command
+
+ Args:
+ options: Dict of command line options to values. Set value to None
+ for an option with no value.
+ return_stdout: If True, also return stdout as (exitcode, stdout).
+ '''
+ cmd = [command]
+ if options:
+ # We need some way to allow empty options args like --log, hence
+ # we filter out on 'x is None'.
+ cmd.extend([str(x) for x in itertools.chain(*options.items()) if x is not None])
+ if option_string:
+ cmd.extend(shlex.split(option_string))
+ log.debug("Calling {}: {}".format(command, " ".join(cmd)))
+ subprocess.check_call(cmd)
+
+ def read_report(self, report_fn):
+ report = collections.Counter()
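+ # kaijuReport rows are parsed here as: percent, reads, taxon name, taxon id; the
+ # 'unclassified' and 'cannot be assigned' summary rows lack ids and map to 0 and 1.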
+ with open(report_fn) as f:
+ f.readline() # Cannot use next(f) in python2
+ for line in f:
+ if line.startswith('---'):
+ continue
+ parts = line.strip().split('\t')
+ percent = float(parts[0])
+ reads = int(parts[1])
+ fullname = parts[2]
+ if 'cannot be assigned to a species' in fullname:
+ tax_id = 1
+ elif 'unclassified' == fullname:
+ tax_id = 0
+ else:
+ tax_id = parts[3]
+ report[tax_id] = reads
+ return report
diff --git a/tools/kraken.py b/tools/kraken.py
index 19a93ef89..c3eef1acc 100644
--- a/tools/kraken.py
+++ b/tools/kraken.py
@@ -2,6 +2,7 @@
KRAKEN metagenomics classifier
'''
from __future__ import print_function
+import collections
import itertools
import logging
import os
@@ -11,6 +12,7 @@
import subprocess
import sys
import tempfile
+
import tools
import tools.picard
import tools.samtools
@@ -18,24 +20,30 @@
import util.misc
from builtins import super
-TOOL_NAME = 'kraken'
-TOOL_VERSION = '1.0.0_fork3'
+KRAKEN_VERSION = '1.0.0_fork3'
+KRAKENUNIQ_VERSION = '0.5.7_yesimon'
+
log = logging.getLogger(__name__)
+
class Kraken(tools.Tool):
- BINS = ['kraken', 'kraken-build', 'kraken-filter', 'kraken-mpa-report', 'kraken-report', 'kraken-translate']
+ BINS = {
+ 'classify': 'kraken',
+ 'build': 'kraken-build',
+ 'filter': 'kraken-filter',
+ 'report': 'kraken-report'}
def __init__(self, install_methods=None):
self.subtool_name = self.subtool_name if hasattr(self, "subtool_name") else "kraken"
if not install_methods:
install_methods = []
- install_methods.append(tools.CondaPackage(TOOL_NAME, executable=self.subtool_name, version=TOOL_VERSION, channel='broad-viral'))
+ install_methods.append(tools.CondaPackage('kraken', executable=self.subtool_name, version=KRAKEN_VERSION, channel='broad-viral'))
super(Kraken, self).__init__(install_methods=install_methods)
def version(self):
- return TOOL_VERSION
+ return KRAKEN_VERSION
@property
def libexec(self):
@@ -52,7 +60,7 @@ def build(self, db, options=None, option_string=None):
*args: List of input filenames to process.
'''
options['--threads'] = util.misc.sanitize_thread_count(options.get('--threads'))
- self.execute('kraken-build', db, db, options=options,
+ self.execute(self.BINS['build'], db, db, options=options,
option_string=option_string)
def _db_opts(self, db, threads):
@@ -115,7 +123,7 @@ def pipeline(self, db, inBams, outReports=None, outReads=None,
fastq_pipes = pipes[:n_bams * 2]
kraken_output_pipes = pipes[n_bams * 2:]
- kraken_bin = os.path.join(self.libexec, 'kraken')
+ kraken_bin = 'kraken'
opts = ''
if lockMemory:
opts += ' --lock-memory'
@@ -146,13 +154,13 @@ def pipeline(self, db, inBams, outReports=None, outReads=None,
if outReports:
if filterThreshold is not None:
- kraken_filter_bin = os.path.join(self.libexec, 'kraken-filter')
+ kraken_filter_bin = 'kraken-filter'
cmd += ' | {kraken_filter}{tax_opts} --threshold {filterThreshold}'.format(
kraken_filter=kraken_filter_bin,
tax_opts=tax_filter_opts,
filterThreshold=filterThreshold)
- kraken_report_bin = os.path.join(self.libexec, 'kraken-report')
+ kraken_report_bin = 'kraken-report'
cmd += ' | {kraken_report}{tax_opts} > {outReport}'.format(
kraken_report=kraken_report_bin,
tax_opts=tax_report_opts,
@@ -207,6 +215,7 @@ def classify(self, inBam, db, outReads, numThreads=None):
'--fastq-input': None,
'--gzip-compressed': None,
}
+ # Detect if input bam was paired by checking fastq 2
if os.path.getsize(tmp_fastq2) < 50:
res = self.execute('kraken', db, outReads, args=[tmp_fastq1], options=opts)
else:
@@ -218,13 +227,13 @@ def classify(self, inBam, db, outReads, numThreads=None):
def filter(self, inReads, db, outReads, filterThreshold):
"""Filter Kraken hits
"""
- self.execute('kraken-filter', db, outReads, args=[inReads],
+ self.execute(self.BINS['filter'], db, outReads, args=[inReads],
options={'--threshold': filterThreshold})
def report(self, inReads, db, outReport):
"""Convert Kraken read-based output to summary reports
"""
- self.execute('kraken-report', db, outReport, args=[inReads])
+ self.execute(self.BINS['report'], db, outReport, args=[inReads])
def execute(self, command, db, output, args=None, options=None,
option_string=None):
@@ -237,15 +246,17 @@ def execute(self, command, db, output, args=None, options=None,
options: List of keyword options.
option_string: Raw strip command line options.
'''
- assert command in Kraken.BINS, 'Kraken command is unknown'
options = options or {}
- if command == 'kraken':
- options['--output'] = output
+ if command == self.BINS['classify']:
+ if output:
+ options['--output'] = output
+ elif 'krakenuniq' in command:
+ options['--output'] = 'off'
option_string = option_string or ''
args = args or []
- cmd = [os.path.join(self.libexec, command), '--db', db]
+ cmd = [command, '--db', db]
# We need some way to allow empty options args like --build, hence
# we filter out on 'x is None'.
cmd.extend([str(x) for x in itertools.chain(*options.items())
@@ -254,18 +265,119 @@ def execute(self, command, db, output, args=None, options=None,
cmd.extend(args)
log.debug('Calling %s: %s', command, ' '.join(cmd))
- if command == 'kraken':
+ if command == self.BINS['classify']:
+ subprocess.check_call(cmd)
+ elif command == self.BINS['build']:
subprocess.check_call(cmd)
- elif command == 'kraken-build':
- jellyfish_path = Jellyfish().install_and_get_path()
- env = os.environ.copy()
- env['PATH'] = ':'.join([os.path.dirname(jellyfish_path), env['PATH']])
- subprocess.check_call(cmd, env=env)
else:
with util.file.open_or_gzopen(output, 'w') as of:
util.misc.run(cmd, stdout=of, stderr=subprocess.PIPE, check=True)
+@tools.skip_install_test()
class Jellyfish(Kraken):
""" Tool wrapper for Jellyfish (installed by kraken-all metapackage) """
subtool_name = 'jellyfish'
+
+
+class KrakenUniq(Kraken):
+
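+ # Inherits the Kraken wrapper's execute()/build() plumbing; overrides the binary names, classification, and report parsing.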
+ BINS = {
+ 'classify': 'krakenuniq',
+ 'build': 'krakenuniq-build',
+ 'filter': 'krakenuniq-filter',
+ 'report': 'krakenuniq-report'}
+
+ def __init__(self, install_methods=None):
+ self.subtool_name = self.subtool_name if hasattr(self, 'subtool_name') else 'krakenuniq'
+ if not install_methods:
+ install_methods = []
+ install_methods.append(tools.CondaPackage('krakenuniq', executable=self.subtool_name, version=KRAKENUNIQ_VERSION, channel='broad-viral'))
+ super(KrakenUniq, self).__init__(install_methods=install_methods)
+
+ def version(self):
+ return KRAKENUNIQ_VERSION
+
+ def pipeline(self, db, in_bams, out_reports=None, out_reads=None,
+ filter_threshold=None, num_threads=None):
+
+ try:
+ from itertools import zip_longest
+ except ImportError: # Python 2 compat
+ from itertools import izip_longest as zip_longest
+ assert out_reads is not None or out_reports is not None
+ out_reports = out_reports or []
+ out_reads = out_reads or []
+
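+ # zip_longest pads missing per-sample outputs with None so every input BAM is still classified.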
+ for in_bam, out_read, out_report in zip_longest(in_bams, out_reads, out_reports):
+ self.classify(in_bam, db, out_reads=out_read, out_report=out_report, num_threads=num_threads)
+
+ def classify(self, in_bam, db, out_reads=None, out_report=None, num_threads=None):
+ """Classify input reads (bam)
+
+ Args:
+ in_bam: Unaligned reads, BAM format.
+ db: KrakenUniq built database directory.
+ out_reads: Per-read classification output file (optional).
+ out_report: Summary report output file (optional).
+ num_threads: Number of classification threads.
+ """
+ tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
+ tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
+ # Do not convert this to samtools bam2fq unless we can figure out how to replicate
+ # the clipping functionality of Picard SamToFastq
+ picard = tools.picard.SamToFastqTool()
+ picard_opts = {
+ 'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
+ 'CLIPPING_ACTION': 'X'
+ }
+ picard.execute(in_bam, tmp_fastq1, tmp_fastq2,
+ picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
+ JVMmemory=picard.jvmMemDefault)
+
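+ # --preload loads the database into RAM before classification; --fastq-input and --gzip-compressed match the Picard output above.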
+ opts = {
+ '--threads': util.misc.sanitize_thread_count(num_threads),
+ '--fastq-input': None,
+ '--gzip-compressed': None,
+ '--preload': None
+ }
+ if out_report:
+ opts['--report-file'] = out_report
+ # Detect if input bam was paired by checking fastq 2
+ if os.path.getsize(tmp_fastq2) < 50:
+ res = self.execute(self.BINS['classify'], db, out_reads, args=[tmp_fastq1], options=opts)
+ else:
+ opts['--paired'] = None
+ res = self.execute(self.BINS['classify'], db, out_reads, args=[tmp_fastq1, tmp_fastq2], options=opts)
+ os.unlink(tmp_fastq1)
+ os.unlink(tmp_fastq2)
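+ # If the report has no data rows (nothing classified), append a header and an all-unclassified row so downstream parsers still work.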
+ if out_report:
+ with open(out_report, 'rt+') as f:
+ lines = [line.strip() for line in f.readlines() if not line.startswith('#')]
+ lines = [line for line in lines if line]
+ if not lines:
+ f.seek(f.tell() - 1, os.SEEK_SET)
+ print('\t'.join(['%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName']), file=f)
+ print('\t'.join(['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified']), file=f)
+
+ def read_report(self, report_fn):
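+ # Build {tax_id: (tax_reads, tax_kmers)} from a KrakenUniq report; columns: %, reads, taxReads, kmers, dup, cov, taxID, rank, taxName.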
+ report = collections.Counter()
+ with open(report_fn) as f:
+ for line in f:
+ if line.startswith('#') or line.startswith('%'):
+ continue
+ line = line.strip()
+ if not line:
+ continue
+ parts = line.split('\t')
+ percent = float(parts[0])
+ cum_reads = int(parts[1])
+ tax_reads = int(parts[2])
+ tax_kmers = int(parts[3])
+ if parts[5] == 'NA': # unclassified
+ cov = 0
+ else:
+ cov = float(parts[5])
+ tax_id = int(parts[6])
+ rank = parts[7]
+ name = parts[8]
+ report[tax_id] = (tax_reads, tax_kmers)
+ return report
diff --git a/travis/install-conda.sh b/travis/install-conda.sh
index d51779199..def3c6737 100755
--- a/travis/install-conda.sh
+++ b/travis/install-conda.sh
@@ -47,11 +47,7 @@ else # if it does not exist, we need to install miniconda
# Use recommendations from https://github.com/bioconda/bioconda-recipes/issues/13774
conda update --quiet -y conda
# conda config --set channel_priority strict
- travis_wait conda install --quiet -y pycryptosat
- conda config --set sat_solver pycryptosat
- conda install --quiet -y openjdk==8.0.112
fi
# update certs
-conda update --quiet -y openssl pyopenssl ca-certificates certifi
conda info -a # for debugging
diff --git a/util/file.py b/util/file.py
index aac233bf2..6db6b1296 100644
--- a/util/file.py
+++ b/util/file.py
@@ -321,11 +321,13 @@ def mkdir_p(dirpath):
else:
raise
+
def touch_p(path, times=None):
'''Touch file, making parent directories if they don't exist.'''
mkdir_p(os.path.dirname(path))
touch(path, times=times)
+
def open_or_gzopen(fname, *opts, **kwargs):
mode = 'r'
open_opts = list(opts)
@@ -822,34 +824,6 @@ def find_broken_symlinks(rootdir, followlinks=False):
return broken_links_to_remove
-def join_paired_fastq(input_fastqs, output_format='fastq', num_n=None):
- '''Join paired/interleaved fastq(s) into single reads connected by Ns'''
- assert output_format in ('fastq', 'fasta')
- num_n = num_n or 31
-
- if len(input_fastqs) > 1:
- f1 = SeqIO.parse(input_fastqs[0], 'fastq')
- f2 = SeqIO.parse(input_fastqs[1], 'fastq')
- records = zip(f1, f2)
- else:
- if input_fastqs[0] == '-':
- input_fastqs[0] = sys.stdin
- records = util.misc.batch_iterator(SeqIO.parse(input_fastqs[0], 'fastq'), 2)
- for r1, r2 in records:
- if r1.id.endswith('/1'):
- rid = r1.id[:-2]
- else:
- rid = r1.id
- jseq = Seq(str(r1.seq) + 'N' * num_n + str(r2.seq))
- labbrevs = None
- if output_format == 'fastq':
- # Illumina quality score of 2 indicates unreliable base
- labbrevs = {
- 'phred_quality': r1.letter_annotations['phred_quality'] + [2] * num_n + r2.letter_annotations['phred_quality']
- }
- rec = SeqRecord(jseq, id=rid, description='', letter_annotations=labbrevs)
- yield rec
-
def uncompressed_file_type(fname):
"""Return the original file extension of either a compressed or an uncompressed file."""
base, ext = os.path.splitext(fname)