diff --git a/metagenomics.py b/metagenomics.py
index 9d77fe3d7..2356d078c 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -7,7 +7,6 @@ __author__ = "yesimon@broadinstitute.org"
 import argparse
-import codecs
 import collections
 import csv
 import gzip
@@ -21,7 +20,6 @@ import re
 import shutil
 import sys
-import subprocess
 import tempfile
 import json
@@ -33,8 +31,7 @@ import util.cmd
 import util.file
 import util.misc
-import tools.bwa
-import tools.diamond
+import tools.kaiju
 import tools.kraken
 import tools.krona
 import tools.picard
@@ -246,7 +243,7 @@ def sam_lca(db, sam_file, output=None, top_percent=10, unique_only=True):
         if mapped_segs:
             tax_id = process_sam_hits(db, mapped_segs, top_percent)
             if tax_id is None:
-                log.warn('Query: {} has no valid taxonomy paths.'.format(query_name))
+                log.warning('Query: {} has no valid taxonomy paths.'.format(query_name))
                 if unique_only:
                     continue
                 else:
@@ -365,7 +362,7 @@ def coverage_lca(query_ids, parents, lca_percent=100):
         while query_id != 1:
             path.append(query_id)
             if parents.get(query_id, 0) == 0:
-                log.warn('Parent for query id: {} missing'.format(query_id))
+                log.warning('Parent for query id: {} missing'.format(query_id))
                 break
             query_id = parents[query_id]
         if query_id == 1:
@@ -595,7 +592,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade
     with open_or_gzopen(input_path, 'rt') as f, \
          open_or_gzopen(output_path, 'wt') as out_f:
         if header:
-            out_f.write(next(f))
+            out_f.write(f.readline()) # Cannot use next(f) for python2
         for line in f:
             parts = line.split(sep)
             taxid = int(parts[taxid_column])
@@ -729,33 +726,31 @@ def kraken_dfs(db, lines, taxa_hits, total_hits, taxid, level):
     return cum_hits
-def parser_kraken(parser=argparse.ArgumentParser()):
+def parser_krakenuniq(parser=argparse.ArgumentParser()):
     parser.add_argument('db', help='Kraken database directory.')
     parser.add_argument('inBams', nargs='+', help='Input unaligned reads, BAM format.')
     parser.add_argument('--outReports', nargs='+', help='Kraken summary report output file. Multiple filenames space separated.')
     parser.add_argument('--outReads', nargs='+', help='Kraken per read classification output file. Multiple filenames space separated.')
-    parser.add_argument('--lockMemory', action='store_true', default=False, help='Lock kraken database in RAM. Requires high ulimit -l.')
     parser.add_argument(
         '--filterThreshold', default=0.05, type=float, help='Kraken filter threshold (default %(default)s)'
     )
     util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
-    util.cmd.attach_main(parser, kraken, split_args=True)
+    util.cmd.attach_main(parser, krakenuniq, split_args=True)
     return parser
-def kraken(db, inBams, outReports=None, outReads=None, lockMemory=False, filterThreshold=None, threads=None):
+def krakenuniq(db, inBams, outReports=None, outReads=None, lockMemory=False, filterThreshold=None, threads=None):
     '''
-    Classify reads by taxon using Kraken
+    Classify reads by taxon using KrakenUniq
     '''
     assert outReads or outReports, ('Either --outReads or --outReport must be specified.')
-    kraken_tool = tools.kraken.Kraken()
-    kraken_tool.pipeline(db, inBams, outReports=outReports, outReads=outReads, lockMemory=lockMemory,
-                         filterThreshold=filterThreshold, numThreads=threads)
-__commands__.append(('kraken', parser_kraken))
-
+    kuniq_tool = tools.kraken.KrakenUniq()
+    kuniq_tool.pipeline(db, inBams, out_reports=outReports, out_reads=outReads,
+                        filter_threshold=filterThreshold, num_threads=threads)
+__commands__.append(('krakenuniq', parser_krakenuniq))
 def parser_krona(parser=argparse.ArgumentParser()):
-    parser.add_argument('inTsv', help='Input tab delimited file.')
+    parser.add_argument('inReport', help='Input report file (default: tsv)')
     parser.add_argument('db', help='Krona taxonomy database directory.')
     parser.add_argument('outHtml', help='Output html report.')
     parser.add_argument('--queryColumn', help='Column of query id. (default %(default)s)', type=int, default=2)
@@ -764,263 +759,106 @@ def parser_krona(parser=argparse.ArgumentParser()):
     parser.add_argument('--magnitudeColumn', help='Column of magnitude. (default %(default)s)', type=int, default=None)
     parser.add_argument('--noHits', help='Include wedge for no hits.', action='store_true')
     parser.add_argument('--noRank', help='Include no rank assignments.', action='store_true')
+    parser.add_argument('--inputType', help='Handling for specialized report types.', default='tsv', choices=['tsv', 'krakenuniq', 'kaiju'])
     util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
     util.cmd.attach_main(parser, krona, split_args=True)
     return parser
-def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=None, magnitudeColumn=None, noHits=None, noRank=None):
+def krona(inReport, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=None, magnitudeColumn=None, noHits=None, noRank=None,
+          inputType=None):
     '''
     Create an interactive HTML report from a tabular metagenomic report
     '''
     krona_tool = tools.krona.Krona()
-    if inTsv.endswith('.gz'):
-        tmp_tsv = util.file.mkstempfname('.tsv')
-        with gzip.open(inTsv, 'rb') as f_in:
-            with open(tmp_tsv, 'wb') as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        to_import = [tmp_tsv]
-    else:
-        to_import = [inTsv]
-    root_name = os.path.basename(inTsv)
-
-    krona_tool.import_taxonomy(
-        db,
-        to_import,
-        outHtml,
-        query_column=queryColumn,
-        taxid_column=taxidColumn,
-        score_column=scoreColumn,
-        magnitude_column=magnitudeColumn,
-        root_name=root_name,
-        no_hits=noHits,
-        no_rank=noRank
-    )
-
-    if inTsv.endswith('.gz'):
-        # Cleanup tmp .tsv files
-        for tmp_tsv in to_import:
-            os.unlink(tmp_tsv)
-__commands__.append(('krona', parser_krona))
-
-def parser_diamond(parser=argparse.ArgumentParser()):
-    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
-    parser.add_argument('db', help='Diamond database directory.')
-    parser.add_argument('taxDb', help='Taxonomy database directory.')
-    parser.add_argument('outReport', help='Output taxonomy report.')
-    parser.add_argument('--outReads', help='Output LCA assignments for each read.')
-    util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
-    util.cmd.attach_main(parser, diamond, split_args=True)
-    return parser
-def diamond(inBam, db, taxDb, outReport, outReads=None, threads=None):
-    '''
-    Classify reads by the taxon of the Lowest Common Ancestor (LCA)
-    '''
-    # do not convert this to samtools bam2fq unless we can figure out how to replicate
-    # the clipping functionality of Picard SamToFastq
-    picard = tools.picard.SamToFastqTool()
-    with util.file.fifo(2) as (fastq_pipe, diamond_pipe):
-        s2fq = picard.execute(
-            inBam,
-            fastq_pipe,
-            interleave=True,
-            illuminaClipping=True,
-            JVMmemory=picard.jvmMemDefault,
-            background=True,
+    if inputType == 'tsv':
+        root_name = os.path.basename(inReport)
+        if inReport.endswith('.gz'):
+            tmp_tsv = util.file.mkstempfname('.tsv')
+            with gzip.open(inReport, 'rb') as f_in:
+                with open(tmp_tsv, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            to_import = [tmp_tsv]
+        else:
+            to_import = [inReport]
+
+        krona_tool.import_taxonomy(
+            db,
+            to_import,
+            outHtml,
+            query_column=queryColumn,
+            taxid_column=taxidColumn,
+            score_column=scoreColumn,
+            magnitude_column=magnitudeColumn,
+            root_name=root_name,
+            no_hits=noHits,
+            no_rank=noRank
         )
-        diamond_tool = tools.diamond.Diamond()
-        taxonmap = join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz')
-        taxonnodes = join(taxDb, 'nodes.dmp')
-        rutils = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'read_utils.py')
-        cmd = '{read_utils} join_paired_fastq --outFormat fasta /dev/stdout 
{fastq}'.format( - read_utils=rutils, fastq=fastq_pipe) - - cmd += ' | {} blastx --out {output} --outfmt 102 --sallseqid'.format( - diamond_tool.install_and_get_path(), output=diamond_pipe) - cmd += ' --threads {threads}'.format(threads=util.misc.sanitize_thread_count(threads)) - cmd += ' --db {db} --taxonmap {taxonmap} --taxonnodes {taxonnodes}'.format( - db=db, - taxonmap=taxonmap, - taxonnodes=taxonnodes) - - if outReads is not None: - # Interstitial save of stdout to output file - cmd += ' | tee >(pigz --best > {out})'.format(out=outReads) - - diamond_ps = subprocess.Popen(cmd, shell=True, executable='/bin/bash') - - tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True) - - with open(diamond_pipe) as lca_p: - hits = taxa_hits_from_tsv(lca_p) - with open(outReport, 'w') as f: - for line in kraken_dfs_report(tax_db, hits): - print(line, file=f) - - s2fq.wait() - assert s2fq.returncode == 0 - diamond_ps.wait() - assert diamond_ps.returncode == 0 -__commands__.append(('diamond', parser_diamond)) - - -def parser_diamond_fasta(parser=argparse.ArgumentParser()): - parser.add_argument('inFasta', help='Input sequences, FASTA format, optionally gzip compressed.') - parser.add_argument('db', help='Diamond database file.') - parser.add_argument('taxDb', help='Taxonomy database directory.') - parser.add_argument('outFasta', help='Output sequences, same as inFasta, with taxid|###| prepended to each sequence identifier.') - parser.add_argument('--memLimitGb', type=float, default=None, help='approximate memory usage in GB') - util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, diamond_fasta, split_args=True) - return parser -def diamond_fasta(inFasta, db, taxDb, outFasta, threads=None, memLimitGb=None): - ''' - Classify fasta sequences by the taxon of the Lowest Common Ancestor (LCA) - ''' - - with util.file.tmp_dir() as tmp_dir: - # run diamond blastx on fasta sequences - cmd = [tools.diamond.Diamond().install_and_get_path(), 'blastx', - '-q', inFasta, - '--db', db, - '--outfmt', '102', # tsv: query name, taxid of LCA, e-value - '--salltitles',# to recover the entire fasta sequence name - '--sensitive', # this is necessary for longer reads or contigs - '--algo', '1', # for small query files - '--threads', str(util.misc.sanitize_thread_count(threads)), - '--taxonmap', os.path.join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz'), - '--taxonnodes', os.path.join(taxDb, 'nodes.dmp'), - '--tmpdir', tmp_dir, - ] - if memLimitGb: - cmd.extend(['--block-size', str(round(memLimitGb / 5.0, 1))]) - log.debug(' '.join(cmd)) - diamond_p = subprocess.Popen(cmd, stdout=subprocess.PIPE) - - # read the output report and load into an in-memory map - # of sequence ID -> tax ID (there shouldn't be that many sequences) - seq_to_tax = {} - for line in diamond_p.stdout: - row = line.decode('UTF-8').rstrip('\n\r').split('\t') - tax = row[1] if row[1] != '0' else '32644' # diamond returns 0 if unclassified, but the proper taxID for that is 32644 - seq_to_tax[row[0]] = tax - if diamond_p.poll(): - raise subprocess.CalledProcessError(diamond_p.returncode, cmd) - - # copy inFasta to outFasta while prepending taxid|###| to all sequence names - log.debug("transforming {} to {}".format(inFasta, outFasta)) - with util.file.open_or_gzopen(inFasta, 'rt') as inf: - with util.file.open_or_gzopen(outFasta, 'wt') as outf: - for seq in Bio.SeqIO.parse(inf, 'fasta'): - taxid = seq_to_tax.get(seq.id, '32644') # default to "unclassified" 
- for text_line in util.file.fastaMaker([( - '|'.join('taxid', taxid, seq.id), - str(seq.seq))]): - outf.write(text_line) - -__commands__.append(('diamond_fasta', parser_diamond_fasta)) - - -def parser_build_diamond_db(parser=argparse.ArgumentParser()): - parser.add_argument('protein_fastas', nargs='+', help='Input protein fasta files') - parser.add_argument('db', help='Output Diamond database file') - util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, build_diamond_db, split_args=True) - return parser -def build_diamond_db(protein_fastas, db, threads=None): - tool.diamond.Diamond().build(db, protein_fastas, options={'threads':str(util.misc.sanitize_thread_count(threads))}) -__commands__.append(('build_diamond_db', parser_build_diamond_db)) + if inReport.endswith('.gz'): + # Cleanup tmp .tsv files + for tmp_tsv in to_import: + os.unlink(tmp_tsv) + + elif inputType == 'krakenuniq': + krakenuniq = tools.kraken.KrakenUniq() + report = krakenuniq.read_report(inReport) + with util.file.tempfname() as fn: + with open(fn, 'w') as to_import: + for taxid, (tax_reads, cov) in report.items(): + print('{}\t{}\t{}'.format(taxid, tax_reads, cov), file=to_import) + krona_tool.import_taxonomy( + db, [fn], outHtml, + taxid_column=1, magnitude_column=2, + score_column=3, + no_hits=True, no_rank=True + ) + # Rename "Avg. score" to "Est. genome coverage" + html_lines = util.file.slurp_file(outHtml).split('\n') + with util.file.tempfname() as fn: + with open(fn, 'w') as new_report: + for line in html_lines: + if 'score' in line: + line = line.replace('Avg. score', 'Est. genome coverage') + print(line, file=new_report) + os.rename(fn, outHtml) + return + elif inputType == 'kaiju': + kaiju = tools.kaiju.Kaiju() + report = kaiju.read_report(inReport) + with util.file.tempfname() as fn: + print(fn) + with open(fn, 'w') as to_import: + for taxid, reads in report.items(): + print('{}\t{}'.format(taxid, reads), file=to_import) + krona_tool.import_taxonomy( + db, [fn], outHtml, + taxid_column=1, magnitude_column=2, + no_hits=True, no_rank=True + ) + return + else: + raise NotImplementedError +__commands__.append(('krona', parser_krona)) -def parser_align_rna_metagenomics(parser=argparse.ArgumentParser()): +def parser_kaiju(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input unaligned reads, BAM format.') - parser.add_argument('db', help='Bwa index prefix.') + parser.add_argument('db', help='Kaiju database .fmi file.') parser.add_argument('taxDb', help='Taxonomy database directory.') parser.add_argument('outReport', help='Output taxonomy report.') - parser.add_argument('--dupeReport', help='Generate report including duplicates.') - parser.add_argument( - '--sensitive', - dest='sensitive', - action="store_true", - help='Use sensitive instead of default BWA mem options.' - ) - parser.add_argument('--outBam', help='Output aligned, indexed BAM file. 
Default is to write to temp.') parser.add_argument('--outReads', help='Output LCA assignments for each read.') - parser.add_argument('--dupeReads', help='Output LCA assignments for each read including duplicates.') - parser.add_argument( - '--JVMmemory', - default=tools.picard.PicardTools.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)' - ) util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None))) - util.cmd.attach_main(parser, align_rna_metagenomics, split_args=True) + util.cmd.attach_main(parser, kaiju, split_args=True) return parser -def align_rna_metagenomics( - inBam, - db, - taxDb, - outReport, - dupeReport=None, - outBam=None, - dupeReads=None, - outReads=None, - sensitive=None, - JVMmemory=None, - threads=None, - picardOptions=None, -): +def kaiju(inBam, db, taxDb, outReport, outReads=None, threads=None): ''' - Align to metagenomics bwa index, mark duplicates, and generate LCA report + Classify reads by the taxon of the Lowest Common Ancestor (LCA) ''' - picardOptions = picardOptions if picardOptions else [] - - bwa = tools.bwa.Bwa() - samtools = tools.samtools.SamtoolsTool() - - bwa_opts = ['-a'] - if sensitive: - bwa_opts += '-k 12 -A 1 -B 1 -O 1 -E 1'.split() - - # TODO: Use bwa.mem's min_score_to_filter argument to decrease false - # positives in the output. Currently, it works by summing the alignment - # score across all alignments output by bwa for each query (reads in a - # pair, supplementary, and secondary alignments). This is not reasonable - # for reads with secondary alignments because it will be easier for those - # reads/queries to exceed the threshold given by the value of the argument. - # In this context, bwa is called using '-a' as an option and its output - # will likely include many secondary alignments. One option is to add - # another argument to bwa.mem, similar to min_score_to_filter, that sets a - # threshold on the alignment score of output alignments but only filters on - # a per-alignment level (i.e., not by summing alignment scores across all - # alignments for each query). 
- - aln_bam = util.file.mkstempfname('.bam') - bwa.mem(inBam, db, aln_bam, options=bwa_opts) - - tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True) - - if dupeReport: - aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam') - samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=threads) - sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outReads=dupeReads, unique_only=False) - os.unlink(aln_bam_sorted) - aln_bam_deduped = outBam if outBam else util.file.mkstempfname('.align_deduped.bam') - opts = list(picardOptions) - dupe_removal_out_metrics = util.file.mkstempfname('.metrics') - pic = tools.picard.MarkDuplicatesTool() - pic.execute([aln_bam], aln_bam_deduped, dupe_removal_out_metrics, picardOptions=opts, JVMmemory=JVMmemory) - - - os.unlink(aln_bam) - aln_bam_dd_sorted = util.file.mkstempfname('.bam') - samtools.sort(aln_bam_deduped, aln_bam_dd_sorted, args=['-n'], threads=threads) - sam_lca_report(tax_db, aln_bam_dd_sorted, outReport=outReport, outReads=outReads) - - if not outBam: - os.unlink(aln_bam_deduped) -__commands__.append(('align_rna', parser_align_rna_metagenomics)) + kaiju_tool = tools.kaiju.Kaiju() + kaiju_tool.classify(db, taxDb, inBam, output_report=outReport, output_reads=outReads, num_threads=threads) +__commands__.append(('kaiju', parser_kaiju)) def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=None): @@ -1039,8 +877,6 @@ def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=No print(line, file=f) - - def parser_metagenomic_report_merge(parser=argparse.ArgumentParser()): parser.add_argument( "metagenomic_reports", @@ -1115,10 +951,8 @@ def metagenomic_report_merge(metagenomic_reports, out_kraken_summary, kraken_db, -def kraken_library_ids(library): - '''Parse gi/accession from ids of fasta files in library directory. ''' - library_taxids = set() - library_gis = set() +def fasta_library_accessions(library): + '''Parse accession from ids of fasta files in library directory. ''' library_accessions = set() for dirpath, dirnames, filenames in os.walk(library, followlinks=True): for filename in filenames: @@ -1127,32 +961,20 @@ def kraken_library_ids(library): filepath = os.path.join(dirpath, filename) for seqr in SeqIO.parse(filepath, 'fasta'): name = seqr.name - # Search for encoded taxid - mo = re.search('kraken:taxid\|(\d+)\|', name) - if mo: - taxid = int(mo.group(1)) - library_taxids.add(taxid) - continue - # Search for gi - mo = re.search('gi\|(\d+)\|', name) - if mo: - gi = int(mo.group(1)) - library_gis.add(gi) - continue # Search for accession - mo = re.search('([A-Z]+\d+\.\d+)', name) + mo = re.search('([A-Z]+_?\d+\.\d+)', name) if mo: accession = mo.group(1) library_accessions.add(accession) - return library_taxids, library_gis, library_accessions + return library_accessions -class KrakenBuildError(Exception): - '''Error while building kraken database.''' +class KrakenUniqBuildError(Exception): + '''Error while building KrakenUniq database.''' def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()): parser.add_argument('in_bam', help='Input bam file.') - parser.add_argument('read_IDs_to_tax_IDs', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. Assumes bijective mapping of read ID to tax ID.') + parser.add_argument('read_IDs_to_tax_IDs', help='TSV file mapping read IDs to taxIDs, Kraken-format by default. 
Assumes bijective mapping of read ID to tax ID.') parser.add_argument('out_bam', help='Output bam file, filtered to the taxa specified') parser.add_argument('nodes_dmp', help='nodes.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/') parser.add_argument('names_dmp', help='names.dmp file from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/') @@ -1168,27 +990,22 @@ def parser_filter_bam_to_taxa(parser=argparse.ArgumentParser()): ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, filter_bam_to_taxa, split_args=True) - return parser - -def filter_bam_to_taxa( in_bam, - read_IDs_to_tax_IDs, - out_bam, - nodes_dmp, - names_dmp, - tax_names=None, - tax_ids=None, - omit_children=False, - read_id_col=1, - tax_id_col=2, - JVMmemory=None ): + return parser + +def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam, + nodes_dmp, names_dmp, + tax_names=None, tax_ids=None, + omit_children=False, + read_id_col=1, tax_id_col=2, + JVMmemory=None): """ - Filter an (already classified) input bam file to only include reads that have been mapped to specified - taxonomic IDs or scientific names. This requires a classification file, as produced + Filter an (already classified) input bam file to only include reads that have been mapped to specified + taxonomic IDs or scientific names. This requires a classification file, as produced by tools such as Kraken, as well as the NCBI taxonomy database. """ tax_ids = set(tax_ids) if tax_ids else set() tax_names = tax_names or [] - # use TaxonomyDb() class above and tree traversal/collection functions above + # use TaxonomyDb() class above and tree traversal/collection functions above db = TaxonomyDb(nodes_path=nodes_dmp, names_path=names_dmp, load_nodes=True, load_names=True) db.children = parents_to_children(db.parents) @@ -1249,10 +1066,10 @@ def filter_bam_to_taxa( in_bam, # if we found reads matching the taxNames requested, if read_ids_written > 0: # filter the input bam to include only these - tools.picard.FilterSamReadsTool().execute(in_bam, - False, - temp_read_list, - out_bam, + tools.picard.FilterSamReadsTool().execute(in_bam, + False, + temp_read_list, + out_bam, JVMmemory=JVMmemory) else: # otherwise, "touch" the output bam to contain the @@ -1267,7 +1084,7 @@ def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()): parser.add_argument('--jsonOut', dest="json_out", type=argparse.FileType('w'), help='The path to a json file containing the relevant parsed summary data in json format.') parser.add_argument('--csvOut', dest="csv_out", type=argparse.FileType('w'), help='The path to a csv file containing sample-specific counts.') parser.add_argument('--taxHeading', nargs="+", dest="tax_headings", help='The taxonomic heading to analyze (default: %(default)s). More than one can be specified.', default="Viruses") - parser.add_argument('--taxlevelFocus', dest="taxlevel_focus", help='The taxonomic heading to summarize (totals by Genus, etc.) (default: %(default)s).', default="species", + parser.add_argument('--taxlevelFocus', dest="taxlevel_focus", help='The taxonomic heading to summarize (totals by Genus, etc.) 
(default: %(default)s).', default="species", choices=["species", "genus", "family", "order", "class", "phylum", "kingdom", "superkingdom"]) parser.add_argument('--topN', type=int, dest="top_n_entries", help='Only include the top N most abundant taxa by read count (default: %(default)s)', default=100) parser.add_argument('--countThreshold', type=int, dest="count_threshold", help='Minimum number of reads to be included (default: %(default)s)', default=1) @@ -1281,13 +1098,13 @@ def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()): def taxlevel_summary(summary_files_in, json_out, csv_out, tax_headings, taxlevel_focus, top_n_entries, count_threshold, no_hist, zero_fill, include_root): """ Aggregates taxonomic abundance data from multiple Kraken-format summary files. - It is intended to report information on a particular taxonomic level (--taxlevelFocus; ex. 'species'), - within a higher-level grouping (--taxHeading; ex. 'Viruses'). By default, when --taxHeading - is at the same level as --taxlevelFocus a summary with lines for each sample is emitted. + It is intended to report information on a particular taxonomic level (--taxlevelFocus; ex. 'species'), + within a higher-level grouping (--taxHeading; ex. 'Viruses'). By default, when --taxHeading + is at the same level as --taxlevelFocus a summary with lines for each sample is emitted. Otherwise, a histogram is returned. If per-sample information is desired, --noHist can be specified. In per-sample data, the suffix "-pt" indicates percentage, so a value of 0.02 is 0.0002 of the total number of reads for the sample. - If --topN is specified, only the top N most abundant taxa are included in the histogram count or per-sample output. - If a number is specified for --countThreshold, only taxa with that number of reads (or greater) are included. + If --topN is specified, only the top N most abundant taxa are included in the histogram count or per-sample output. + If a number is specified for --countThreshold, only taxa with that number of reads (or greater) are included. Full data returned via --jsonOut (filtered by --topN and --countThreshold), whereas -csvOut returns a summary. 
""" @@ -1295,7 +1112,7 @@ def taxlevel_summary(summary_files_in, json_out, csv_out, tax_headings, taxlevel same_level = False Abundance = collections.namedtuple("Abundance", "percent,count") - + def indent_len(in_string): return len(in_string)-len(in_string.lstrip()) @@ -1315,11 +1132,11 @@ def indent_len(in_string): csv.register_dialect('kraken_report', quoting=csv.QUOTE_MINIMAL, delimiter="\t") fieldnames = ["pct_of_reads","num_reads","reads_exc_children","rank","NCBI_tax_ID","sci_name"] row = next(csv.DictReader([line.strip().rstrip('\n')], fieldnames=fieldnames, dialect="kraken_report")) - + indent_of_line = indent_len(row["sci_name"]) # remove leading/trailing whitespace from each item row = { k:v.strip() for k, v in row.items()} - + # rows are formatted like so: # 0.00 16 0 D 10239 Viruses # @@ -1330,7 +1147,7 @@ def indent_len(in_string): # row["NCBI_tax_ID"] NCBI taxonomy ID # row["sci_name"] indented scientific name - # if the root-level bins (root, unclassified) should be included, do so, but bypass normal + # if the root-level bins (root, unclassified) should be included, do so, but bypass normal # stateful parsing logic since root does not have a distinct rank level if row["sci_name"].lower() in ["root","unclassified"] and include_root: sample_root_summary[row["sci_name"]] = collections.OrderedDict() @@ -1344,7 +1161,7 @@ def indent_len(in_string): if indent_of_selection == -1: if row["sci_name"].lower() in tax_headings_copy: tax_headings_copy.remove(row["sci_name"].lower()) - + should_process = True indent_of_selection = indent_of_line currently_being_processed = row["sci_name"] @@ -1353,15 +1170,15 @@ def indent_len(in_string): same_level = True if row["rank"] == "-": log.warning("Non-taxonomic parent level selected") - + if should_process: # skip "-" rank levels since they do not occur at the sample level # otherwise include the taxon row if the rank matches the desired level of focus - if (row["rank"] != "-" and rank_code(taxlevel_focus) == row["rank"]): + if (row["rank"] != "-" and rank_code(taxlevel_focus) == row["rank"]): if int(row["num_reads"])>=count_threshold: sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"])) - + for k,taxa in sample_summary.items(): sample_summary[k] = collections.OrderedDict(sorted(taxa.items(), key=lambda item: (item[1][1]) , reverse=True)[:top_n_entries]) @@ -1370,8 +1187,8 @@ def indent_len(in_string): "\"{name}\" with {reads} reads ({percent:.2%} of total); " "included since >{threshold} read{plural}".format( f=f, - heading=k, - level=taxlevel_focus, + heading=k, + level=taxlevel_focus, name=list(sample_summary[k].items())[0][0], reads=list(sample_summary[k].items())[0][1].count, percent=list(sample_summary[k].items())[0][1].percent/100.0, @@ -1417,12 +1234,12 @@ def indent_len(in_string): for sample, taxa in samples.items(): sample_dict = {} sample_dict["sample"] = sample - for heading,taxon in taxa.items(): + for heading,taxon in taxa.items(): for entry in taxon.keys(): sample_dict[entry+"-pt"] = taxon[entry].percent sample_dict[entry+"-ct"] = taxon[entry].count writer.writerow(sample_dict) - + csv_out.close() @@ -1455,29 +1272,28 @@ def indent_len(in_string): __commands__.append(('taxlevel_summary', parser_kraken_taxlevel_summary)) -def parser_kraken_build(parser=argparse.ArgumentParser()): - parser.add_argument('db', help='Kraken database output directory.') +def parser_krakenuniq_build(parser=argparse.ArgumentParser()): + parser.add_argument('db', help='Krakenuniq 
database output directory.')
     parser.add_argument('--library', help='Input library directory of fasta files. If not specified, it will be read from the "library" subdirectory of "db".')
     parser.add_argument('--taxonomy', help='Taxonomy db directory. If not specified, it will be read from the "taxonomy" subdirectory of "db".')
     parser.add_argument('--subsetTaxonomy', action='store_true', help='Subset taxonomy based on library fastas.')
-    parser.add_argument('--minimizerLen', type=int, help='Minimizer length (kraken default: 15)')
-    parser.add_argument('--kmerLen', type=int, help='k-mer length (kraken default: 31)')
+    parser.add_argument('--minimizerLen', type=int, help='Minimizer length (krakenuniq default: 15)')
+    parser.add_argument('--kmerLen', type=int, help='k-mer length (krakenuniq default: 31)')
     parser.add_argument('--maxDbSize', type=int, help='Maximum db size in GB (will shrink if too big)')
     parser.add_argument('--clean', action='store_true', help='Clean by deleting other database files after build')
     parser.add_argument('--workOnDisk', action='store_true', help='Work on disk instead of RAM. This is generally much slower unless the "db" directory lives on a RAM disk.')
     util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
-    util.cmd.attach_main(parser, kraken_build, split_args=True)
+    util.cmd.attach_main(parser, krakenuniq_build, split_args=True)
     return parser
-def kraken_build(db, library, taxonomy=None, subsetTaxonomy=None,
-                 threads=None, workOnDisk=False,
-                 minimizerLen=None, kmerLen=None, maxDbSize=None, clean=False):
+def krakenuniq_build(db, library, taxonomy=None, subsetTaxonomy=None,
+                     threads=None, workOnDisk=False,
+                     minimizerLen=None, kmerLen=None, maxDbSize=None, clean=False):
     '''
-    Builds a kraken database from library directory of fastas and taxonomy db
-    directory. The --subsetTaxonomy option allows shrinking the taxonomy to only
-    include taxids associated with the library folders. For this to work, the
-    library fastas must have the standard id names such as `>NC1234.1`
-    accessions, `>gi|123456789|ref|XXXX||`, or custom kraken name
-    `>kraken:taxid|1234|`.
+    Builds a krakenuniq database from library directory of fastas and taxonomy
+    db directory. The --subsetTaxonomy option allows shrinking the taxonomy to
+    only include taxids associated with the library folders. For this to work,
+    the library fastas must have the standard accession id names such as
+    `>NC1234.1` or `>NC_01234.1`.
     Setting the --minimizerLen (default: 16) small, such as 10, will drastically
     shrink the db size for small inputs, which is useful for testing.
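For reference, a minimal sketch (not part of the patch itself) of how the accession pattern used by fasta_library_accessions() above matches the header styles this docstring requires. The example FASTA headers are hypothetical; the accessions mirror entries in the test krakenuniq.map files added later in this diff.

    import re

    # Same pattern as in fasta_library_accessions(): a versioned accession with an
    # optional underscore, e.g. RefSeq-style NC_002549.1 or GenBank-style KJ207374.1.
    accession_re = re.compile(r'([A-Z]+_?\d+\.\d+)')

    for header in ('NC_002549.1 Zaire ebolavirus, complete genome',
                   'KJ207374.1 example GenBank record'):
        match = accession_re.search(header)
        print(match.group(1) if match else 'no accession found')
    # prints NC_002549.1, then KJ207374.1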
@@ -1503,45 +1319,28 @@ def kraken_build(db, library, taxonomy=None, subsetTaxonomy=None, taxonomy_exists = os.path.exists(taxonomy_dir) if taxonomy: if taxonomy_exists: - raise KrakenBuildError('Output db directory already contains taxonomy directory {}'.format(taxonomy_dir)) + raise KrakenUniqBuildError('Output db directory already contains taxonomy directory {}'.format(taxonomy_dir)) if subsetTaxonomy: - kraken_taxids, kraken_gis, kraken_accessions = kraken_library_ids(library) - - whitelist_taxid_f = util.file.mkstempfname() - with open(whitelist_taxid_f, 'wt') as f: - for taxid in kraken_taxids: - print(taxid, file=f) - - whitelist_gi_f = util.file.mkstempfname() - with open(whitelist_gi_f, 'wt') as f: - for gi in kraken_gis: - print(gi, file=f) + accessions = fasta_library_accessions(library) whitelist_accession_f = util.file.mkstempfname() with open(whitelist_accession_f, 'wt') as f: - for accession in kraken_accessions: + for accession in accessions: print(accession, file=f) # Context-managerize eventually taxonomy_tmp = tempfile.mkdtemp() - subset_taxonomy(taxonomy, taxonomy_tmp, whitelistTaxidFile=whitelist_taxid_f, - whitelistGiFile=whitelist_gi_f, whitelistAccessionFile=whitelist_accession_f) + subset_taxonomy(taxonomy, taxonomy_tmp, whitelistAccessionFile=whitelist_accession_f) shutil.move(taxonomy_tmp, taxonomy_dir) else: os.symlink(os.path.abspath(taxonomy), taxonomy_dir) - for fn in os.listdir(os.path.join(taxonomy_dir, 'accession2taxid')): - if not fn.endswith('accession2taxid'): - continue - - os.symlink(os.path.join(taxonomy_dir, 'accession2taxid', fn), - os.path.join(taxonomy_dir, fn)) else: if not taxonomy_exists: raise FileNotFoundError('Taxonomy directory {} not found'.format(taxonomy_dir)) if subsetTaxonomy: - raise KrakenBuildError('Cannot subset taxonomy if already in db folder') + raise KrakenUniqBuildError('Cannot subset taxonomy if already in db folder') - kraken_tool = tools.kraken.Kraken() + krakenuniq_tool = tools.kraken.KrakenUniq() options = {'--build': None} if threads: options['--threads'] = threads @@ -1553,11 +1352,11 @@ def kraken_build(db, library, taxonomy=None, subsetTaxonomy=None, options['--max-db-size'] = maxDbSize if workOnDisk: options['--work-on-disk'] = None - kraken_tool.build(db, options=options) + krakenuniq_tool.build(db, options=options) if clean: - kraken_tool.execute('kraken-build', db, '', options={'--clean': None}) -__commands__.append(('kraken_build', parser_kraken_build)) + krakenuniq_tool.execute('krakenuniq-build', db, '', options={'--clean': None}) +__commands__.append(('krakenuniq_build', parser_krakenuniq_build)) diff --git a/pipes/WDL/dx-defaults-classify_kaiju.json b/pipes/WDL/dx-defaults-classify_kaiju.json new file mode 100644 index 000000000..1616c0376 --- /dev/null +++ b/pipes/WDL/dx-defaults-classify_kaiju.json @@ -0,0 +1,8 @@ +{ + "classify_kaiju.kaiju.kaiju_db_lz4": + "dx://file-FVYQqP006zFF064QBGf022X1", + "classify_kaiju.kaiju.krona_taxonomy_db_tgz": + "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb", + "classify_kaiju.kaiju.ncbi_taxonomy_db_tgz": + "dx://file-F8KgJK009y3Qgy3FF1791Vgq" +} diff --git a/pipes/WDL/dx-defaults-classify_kraken.json b/pipes/WDL/dx-defaults-classify_kraken.json deleted file mode 100644 index be31e4ca5..000000000 --- a/pipes/WDL/dx-defaults-classify_kraken.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "classify_kraken.kraken.kraken_db_tar_lz4": - "dx://file-F8QF3bj0xf5gZv76BG4QgpF4", - "classify_kraken.kraken.krona_taxonomy_db_tgz": - "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb" -} diff --git 
a/pipes/WDL/dx-defaults-classify_krakenuniq.json b/pipes/WDL/dx-defaults-classify_krakenuniq.json new file mode 100644 index 000000000..eabf57fc7 --- /dev/null +++ b/pipes/WDL/dx-defaults-classify_krakenuniq.json @@ -0,0 +1,6 @@ +{ + "classify_krakenuniq.krakenuniq.krakenuniq_db_tar_lz4": + "dx://file-FVYQqP006zFF064QBGf022X1", + "classify_krakenuniq.krakenuniq.krona_taxonomy_db_tgz": + "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb" +} diff --git a/pipes/WDL/dx-defaults-demux_plus.json b/pipes/WDL/dx-defaults-demux_plus.json index a50a8c985..1b25a153d 100644 --- a/pipes/WDL/dx-defaults-demux_plus.json +++ b/pipes/WDL/dx-defaults-demux_plus.json @@ -15,8 +15,8 @@ "demux_plus.trim_clip_db": "dx://file-BXF0vYQ0QyBF509G9J12g927", - "demux_plus.kraken.kraken_db_tar_lz4": - "dx://file-F8QF3bj0xf5gZv76BG4QgpF4", - "demux_plus.kraken.krona_taxonomy_db_tgz": + "demux_plus.krakenuniq.krakenuniq_db_tar_lz4": + "dx://file-FVYQqP006zFF064QBGf022X1", + "demux_plus.krakenuniq.krona_taxonomy_db_tgz": "dx://file-F4z0fgj07FZ8jg8yP7yz0Qzb" } diff --git a/pipes/WDL/workflows/classify_kaiju.wdl b/pipes/WDL/workflows/classify_kaiju.wdl new file mode 100644 index 000000000..5fe139e34 --- /dev/null +++ b/pipes/WDL/workflows/classify_kaiju.wdl @@ -0,0 +1,5 @@ +import "tasks_metagenomics.wdl" as metagenomics + +workflow classify_kaiju { + call metagenomics.kaiju +} diff --git a/pipes/WDL/workflows/classify_kraken.wdl b/pipes/WDL/workflows/classify_kraken.wdl deleted file mode 100644 index 50a72d6f0..000000000 --- a/pipes/WDL/workflows/classify_kraken.wdl +++ /dev/null @@ -1,5 +0,0 @@ -import "tasks_metagenomics.wdl" as metagenomics - -workflow classify_kraken { - call metagenomics.kraken -} diff --git a/pipes/WDL/workflows/classify_krakenuniq.wdl b/pipes/WDL/workflows/classify_krakenuniq.wdl new file mode 100644 index 000000000..8a664f345 --- /dev/null +++ b/pipes/WDL/workflows/classify_krakenuniq.wdl @@ -0,0 +1,5 @@ +import "tasks_metagenomics.wdl" as metagenomics + +workflow classify_krakenuniq { + call metagenomics.krakenuniq +} diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl index 02170aa0a..70128d7bf 100644 --- a/pipes/WDL/workflows/demux_metag.wdl +++ b/pipes/WDL/workflows/demux_metag.wdl @@ -25,18 +25,14 @@ workflow demux_metag { assembler = "spades", reads_unmapped_bam = deplete.cleaned_bam } - call metagenomics.diamond_contigs as diamond { - input: - contigs_fasta = spades.contigs_fasta, - reads_unmapped_bam = deplete.cleaned_bam, - krona_taxonomy_db_tar_lz4 = krona_taxonomy_db_tgz - } } - call metagenomics.kraken as kraken { + call metagenomics.krakenuniq as kraken { + input: + reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams, + } + call metagenomics.kaiju as kaiju { input: reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams, - krona_taxonomy_db_tgz = krona_taxonomy_db_tgz } - } diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl index b1f388c39..d2a6e0c2c 100644 --- a/pipes/WDL/workflows/demux_plus.wdl +++ b/pipes/WDL/workflows/demux_plus.wdl @@ -35,7 +35,7 @@ workflow demux_plus { } } - call metagenomics.kraken as kraken { + call metagenomics.krakenuniq as krakenuniq { input: reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams } diff --git a/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl b/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl index 194255e24..6d43044ca 100644 --- a/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/workflows/tasks/tasks_metagenomics.wdl @@ -1,17 +1,12 @@ - -# TO DO: -# 
kraken_build (input & output tarballs) -# diamond, bwa, etc - -task kraken { +task krakenuniq { Array[File] reads_unmapped_bam - File kraken_db_tar_lz4 - File krona_taxonomy_db_tgz + File krakenuniq_db_tar_lz4 # krakenuniq/{database.kdb,taxonomy} + File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab parameter_meta { - kraken_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type + krakenuniq_db_tar_lz4: "stream" # for DNAnexus, until WDL implements the File| type krona_taxonomy_db_tgz : "stream" # for DNAnexus, until WDL implements the File| type - #reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type + reads_unmapped_bam: "stream" # for DNAnexus, until WDL implements the File| type } command { @@ -24,7 +19,7 @@ task kraken { # decompress DB to $DB_DIR read_utils.py extract_tarball \ - ${kraken_db_tar_lz4} $DB_DIR \ + ${krakenuniq_db_tar_lz4} $DB_DIR \ --loglevel=DEBUG read_utils.py extract_tarball \ ${krona_taxonomy_db_tgz} . \ @@ -33,17 +28,17 @@ task kraken { # prep input and output file names OUT_READS=fnames_outreads.txt OUT_REPORTS=fnames_outreports.txt - OUT_BASENAME=basenames_reads.txt + OUT_BASENAME=basenames_reports.txt for bam in ${sep=' ' reads_unmapped_bam}; do - echo "$(basename $bam .bam).kraken-reads" >> $OUT_BASENAME - echo "$(basename $bam .bam).kraken-reads.txt.gz" >> $OUT_READS - echo "$(basename $bam .bam).kraken-summary_report.txt" >> $OUT_REPORTS + echo "$(basename $bam .bam).krakenuniq-reads.txt.gz" >> $OUT_READS + echo "$(basename $bam .bam).krakenuniq" >> $OUT_BASENAME + echo "$(basename $bam .bam).krakenuniq-summary_report.txt" >> $OUT_REPORTS done # execute on all inputs and outputs serially, but with a single # database load into ram - metagenomics.py kraken \ - $DB_DIR \ + metagenomics.py krakenuniq \ + $DB_DIR/krakenuniq \ ${sep=' ' reads_unmapped_bam} \ --outReads `cat $OUT_READS` \ --outReport `cat $OUT_REPORTS` \ @@ -54,21 +49,18 @@ task kraken { # run single-threaded krona on up to nproc samples at once parallel -I ,, \ "metagenomics.py krona \ - ,,.txt.gz \ + ,,-summary_report.txt \ taxonomy \ - ,,.html \ - --noRank --noHits \ + ,,.krona.html \ + --noRank --noHits --inputType krakenuniq \ --loglevel=DEBUG" \ ::: `cat $OUT_BASENAME` - # run single-threaded gzip on up to nproc samples at once - parallel -I ,, "tar czf ,,.krona.tar.gz ,,.html*" ::: `cat $OUT_BASENAME` } output { - Array[File] kraken_classified_reads = glob("*.kraken-reads.txt.gz") - Array[File] kraken_summary_report = glob("*.kraken-summary_report.txt") - Array[File] krona_report_html = glob("*.kraken-reads.html") - Array[File] krona_report_tgz = glob("*.kraken-reads.krona.tar.gz") + Array[File] krakenuniq_classified_reads = glob("*.krakenuniq-reads.txt.gz") + Array[File] krakenuniq_summary_report = glob("*.krakenuniq-summary_report.txt") + Array[File] krona_report_html = glob("*.krakenuniq.krona.html") String viralngs_version = "viral-ngs_version_unknown" } @@ -105,13 +97,10 @@ task krona { ${input_basename}.html \ --noRank --noHits \ --loglevel=DEBUG - - tar czf ${input_basename}.krona.tar.gz ${input_basename}.html* } output { - File krona_report_html = "${input_basename}.html" - File krona_report_tgz = "${input_basename}.krona.tar.gz" + File krona_report_html = "${input_basename}.html" String viralngs_version = "viral-ngs_version_unknown" } @@ -182,19 +171,18 @@ task filter_bam_to_taxa { } -task diamond_contigs { - File contigs_fasta +task kaiju { File reads_unmapped_bam - File diamond_db_lz4 - File diamond_taxonomy_db_tar_lz4 - File 
krona_taxonomy_db_tar_lz4 + File kaiju_db_lz4 # .fmi + File ncbi_taxonomy_db_tgz # taxonomy/{nodes.dmp, names.dmp} + File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab - String contigs_basename = basename(contigs_fasta, ".fasta") + String input_basename = basename(reads_unmapped_bam, ".bam") parameter_meta { - diamond_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type - diamond_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type - krona_taxonomy_db_tar_lz4 : "stream" # for DNAnexus, until WDL implements the File| type + kaiju_db_lz4 : "stream" # for DNAnexus, until WDL implements the File| type + ncbi_taxonomy_db_tgz : "stream" + krona_taxonomy_db_tgz : "stream" } command { @@ -203,61 +191,41 @@ task diamond_contigs { if [ -d /mnt/tmp ]; then TMPDIR=/mnt/tmp fi - DIAMOND_TAXDB_DIR=$(mktemp -d) + DB_DIR=$(mktemp -d) - # find 90% memory - mem_in_gb=`/opt/viral-ngs/source/docker/calc_mem.py gb 90` + lz4 -d ${kaiju_db_lz4} $DB_DIR/kaiju.fmi - # decompress DBs to /mnt/db - cat ${diamond_db_lz4} | lz4 -d > $TMPDIR/diamond_db.dmnd & read_utils.py extract_tarball \ - ${diamond_taxonomy_db_tar_lz4} $DIAMOND_TAXDB_DIR \ - --loglevel=DEBUG & - wait - read_utils.py extract_tarball \ - ${krona_taxonomy_db_tar_lz4} . \ - --loglevel=DEBUG & # we don't need this until later + ${ncbi_taxonomy_db_tgz} $DB_DIR \ + --loglevel=DEBUG - # classify contigs - metagenomics.py diamond_fasta \ - ${contigs_fasta} \ - $TMPDIR/diamond_db.dmnd \ - $DIAMOND_TAXDB_DIR/taxonomy/ \ - ${contigs_basename}.diamond.fasta \ - --memLimitGb $mem_in_gb \ + read_utils.py extract_tarball \ + ${krona_taxonomy_db_tgz} . \ --loglevel=DEBUG - # map reads to contigs & create kraken-like read report - bwa index ${contigs_basename}.diamond.fasta - metagenomics.py align_rna \ + # classify contigs + metagenomics.py kaiju \ ${reads_unmapped_bam} \ - ${contigs_basename}.diamond.fasta \ - $DIAMOND_TAXDB_DIR/taxonomy/ \ - ${contigs_basename}.diamond.summary_report.txt \ - --outReads ${contigs_basename}.diamond.reads.txt.gz \ - --dupeReads ${contigs_basename}.diamond.reads_w_dupes.txt.gz \ - --outBam ${contigs_basename}.diamond.bam \ + $DB_DIR/kaiju.fmi \ + $DB_DIR/taxonomy \ + ${input_basename}.kaiju.report.txt \ + --outReads ${input_basename}.kaiju.reads.gz \ --loglevel=DEBUG # run krona - wait # for krona_taxonomy_db_tgz to download and extract metagenomics.py krona \ - ${contigs_basename}.diamond.reads.txt.gz \ + ${input_basename}.kaiju.report.txt \ taxonomy \ - ${contigs_basename}.diamond.html \ + ${input_basename}.kaiju.html \ + --inputType kaiju \ --noRank --noHits \ --loglevel=DEBUG - tar czf ${contigs_basename}.diamond.krona.tar.gz ${contigs_basename}.diamond.html* } output { - File diamond_contigs = "${contigs_basename}.diamond.fasta" - File reads_mapped_to_contigs = "${contigs_basename}.diamond.bam" - File diamond_summary_report = "${contigs_basename}.diamond.summary_report.txt" - File diamond_classified_reads = "${contigs_basename}.diamond.reads.txt.gz" - File diamond_classified_reads_w_dupes = "${contigs_basename}.diamond.reads_w_dupes.txt.gz" - File krona_report_html = "${contigs_basename}.diamond.html" - File krona_report_tgz = "${contigs_basename}.diamond.krona.tar.gz" + File kaiju_report = "${input_basename}.kaiju.report.txt" + File kaiju_reads = "${input_basename}.kaiju.reads.gz" + File krona_report_html = "${input_basename}.kaiju.html" String viralngs_version = "viral-ngs_version_unknown" } @@ -268,5 +236,3 @@ task diamond_contigs { dx_instance_type: "mem3_ssd1_x16" } } - - diff 
--git a/pipes/config.yaml b/pipes/config.yaml index 3178c6df8..82483df71 100644 --- a/pipes/config.yaml +++ b/pipes/config.yaml @@ -50,17 +50,17 @@ trim_clip_db: "s3://sabeti-public-dbs/trim_clip/contaminants.fasta" # Can be a remote path to a fasta file (s3://, gs://, etc.). spikeins_db: "s3://sabeti-public-dbs/spikeins/ercc_spike-ins.fasta" -# Directory of kraken database for metagenomics identification +# Directory of krakenuniq database for metagenomics identification # For pre-built hosted databases, you may download: # - https://storage.googleapis.com/sabeti-public/meta_dbs/kraken_viral_diversity.tar.gz -# Can be a remote path dirname (s3://, gs://, etc.). -kraken_db: "s3://sabeti-public-dbs/kraken" +# Can be a remote path dirname (s3://, gs://, etc.). +krakenuniq_db: "s3://sabeti-public-dbs/krakenuniq" -# Path of DIAMOND database file for metagenomics identification +# Path of Kaiju database file for metagenomics identification # For pre-built hosted databases, you may download: # - https://storage.googleapis.com/sabeti-public/meta_dbs/nr.dmnd.gz # Can be a remote path prefix (s3://, gs://, etc.). -diamond_db: "s3://sabeti-public-dbs/diamond/nr-20170529" +kaiju_db: "s3://sabeti-public-dbs/kaiju/nr/nr.fmi" # Directory of the krona database for generating html reports. # For pre-built hosted databases, you may download: @@ -68,14 +68,6 @@ diamond_db: "s3://sabeti-public-dbs/diamond/nr-20170529" # Can be a remote path dirname (s3://, gs://, etc.). krona_db: "s3://sabeti-public-dbs/krona" -# BWA index prefix for metagenomic alignment. -# This contains the full human reference genome, multiple diverse sequences covering human -# pathogenic viruses, as well as LSU and SSU rRNA sequences from SILVA. -# For pre-built hosted databases, you may download: -# - https://storage.googleapis.com/sabeti-public/meta_dbs/human_viral_rrna.tar.lz4 -# Can be a remote path prefix (s3://, gs://, etc.). -align_rna_db: "s3://sabeti-public-dbs/rna_bwa/human_viral_rrna" - # Directory of the NCBI taxonomy dmp files # For pre-packaged hosted databases, you may download: # - https://storage.googleapis.com/sabeti-public/meta_dbs/taxonomy.tar.lz4 @@ -226,11 +218,6 @@ vphaser_max_bins: 10 # allele freq > 0.005. If false, no filtering is performed. vcf_merge_naive_filter: false -# Strategy for kraken classification for multiple inputs. 'single' results in 1 kraken process for -# each input bam, while 'multiple' coalesces multiple input classifications into a single kraken -# run. -kraken_execution: single - # Random seed, for non-deterministic steps. 0 means use current time. 
random_seed: 0 diff --git a/pipes/rules/metagenomics.rules b/pipes/rules/metagenomics.rules index 3baf5a3e1..8d82bf8a2 100644 --- a/pipes/rules/metagenomics.rules +++ b/pipes/rules/metagenomics.rules @@ -6,204 +6,88 @@ samples = list(read_samples_file(config.get("samples_metagenomics"))) rule all_metagenomics: input: expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.raw.{method}.report"), sample=samples, method=['kraken', 'rna_bwa', 'rna_bwa.nodupes']), + "{sample}.raw.{method}.report"), sample=samples, method=['krakenuniq']), expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.raw.{method}.krona.html"), sample=samples, method=['kraken', 'rna_bwa', 'rna_bwa_nodupes']) + "{sample}.raw.{method}.krona.html"), sample=samples, method=['krakenuniq']) rule all_metagenomics_host_depleted: input: expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.cleaned.{method}.report"), sample=samples, method=['kraken', 'rna_bwa']), + "{sample}.cleaned.{method}.report"), sample=samples, method=['krakenuniq']), expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.cleaned.{method}.krona.html"), sample=samples, method=['kraken', 'rna_bwa']) - - -method_props = { - 'diamond': { - 'reads_ext': 'diamond.lca.gz', - }, - 'kraken': { - 'reads_ext': 'kraken.reads.gz', - }, - 'rna_bwa': { - 'reads_ext': 'rna_bwa.lca.gz', - }, - 'rna_bwa_nodupes': { - 'reads_ext': 'rna_bwa.lca_nodupes.gz', - } -} + "{sample}.cleaned.{method}.krona.html"), sample=samples, method=['krakenuniq']) + taxfiles = [ - 'names.dmp', - 'nodes.dmp', - 'merged.dmp' - ] - -taxfiles_kraken = [ - 'names.dmp', - 'nodes.dmp' + 'names.dmp', + 'nodes.dmp', ] taxfiles_krona = [ - 'taxonomy.tab', + 'taxonomy.tab', ] -rna_bwa_ext = ['sa', 'bwt', 'amb', 'sa', 'ann', 'pac'] -rule diamond: +rule kaiju: input: - bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"), - diamond_db = objectify_remote(expand("{path_prefix}.{ext}", path_prefix=config["diamond_db"], ext=["dmnd"])), - taxonomy_db = objectify_remote(expand("{path_prefix}/{taxfile}", path_prefix=config["taxonomy_db"], taxfile=taxfiles)) + bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"), + kaiju_db = objectify_remote(config['kaiju_db']), + taxonomy_db = objectify_remote(expand("{path_prefix}/{taxfile}", path_prefix=config['taxonomy_db'], taxfile=taxfiles)) output: - report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.diamond.report"), - reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.diamond.lca.gz") + report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kaiju.report"), + reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kaiju.reads.gz") resources: threads = int(config.get("number_of_threads", 1)), - mem_mb = 120*1000 + mem_mb = 50*1000 params: - UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00') + UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=12:00:00') run: - diamond_db_prefix = os.path.splitext(strip_protocol(config["diamond_db"], relative=True))[0] taxonomy_db_prefix = strip_protocol(config["taxonomy_db"], relative=True) - shell("{config[bin_dir]}/metagenomics.py diamond {input.bam} "+diamond_db_prefix+" 
"+taxonomy_db_prefix+" {output.report} --outReads {output.reads} --threads {resources.threads}") + shell("{config[bin_dir]}/metagenomics.py kaiju {input.bam} {input.kaiju_db} "+taxonomy_db_prefix+" {output.report} --outReads {output.reads} --threads {resources.threads}") -rule align_rna: + +rule krakenuniq: input: - bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"), - rna_bwa_db = objectify_remote(expand("{path_prefix}.{ext}", path_prefix=config["align_rna_db"], ext=rna_bwa_ext)), - taxonomy_db = objectify_remote(expand("{path_prefix}/{taxfile}", path_prefix=config["taxonomy_db"], taxfile=taxfiles)) + bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam") output: - report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.report"), - nodupes_report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.nodupes.report"), - bam = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.bam"), - lca = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca.gz"), - nodupes_lca = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.rna_bwa.lca_nodupes.gz") + report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.krakenuniq.report"), + reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.krakenuniq.reads.gz") resources: threads = int(config.get("number_of_threads", 1)), - mem_mb = 7*1000 + mem_mb = 120*1000 params: UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00') - run: - rna_bwa_path_prefix = strip_protocol(config["align_rna_db"], relative=True) - taxonomy_db_prefix = strip_protocol(config["taxonomy_db"], relative=True) - shell("{config[bin_dir]}/metagenomics.py align_rna {input.bam} "+rna_bwa_path_prefix+" "+taxonomy_db_prefix+" {output.nodupes_report} --dupeReport {output.report} --outBam {output.bam} --outReads {output.nodupes_lca} --dupeReads {output.lca} --JVMmemory {resources.mem_mb}m --threads {resources.threads}") - -if config['kraken_execution'] == 'multiple': - kraken_samples = [] - all_kraken_inputs = {} - all_kraken_reports = {} - all_kraken_reads = {} - for adjective in ('raw', 'cleaned'): - for sample in samples: - kraken_report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.kraken.report".format( - sample=sample, adjective=adjective)) - kraken_input = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam".format( - sample=sample, adjective=adjective)) - try: - itime = os.path.getmtime(kraken_input) - otime = os.path.getmtime(kraken_report) - except OSError: - kraken_samples.append(sample) - continue - if itime > otime: - kraken_samples.append(sample) - if not kraken_samples: - kraken_samples = samples.copy() - - all_kraken_inputs[adjective] = expand( - os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam"), - sample=kraken_samples, adjective=adjective) - all_kraken_reports[adjective] = expand( - os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.kraken.report"), - sample=kraken_samples, adjective=adjective) - 
all_kraken_reads[adjective] = expand( - os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.kraken.reads.gz"), - sample=kraken_samples, adjective=adjective) - - kraken_shell = textwrap.dedent("""\ - {config[bin_dir]}/metagenomics.py kraken {params.kraken_db} {input.bams} --outReads {output.reads} --outReports {output.reports} \ - {params.lock_memory} --threads {resources.threads} - """) - - rule kraken_multiple_raw: - input: - bams = all_kraken_inputs['raw'] - output: - reports = all_kraken_reports['raw'], - reads = all_kraken_reads['raw'] - resources: - threads = int(config.get("number_of_threads", 1)), - mem_mb = 120*1000 - params: - UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00'), - kraken_db = config['kraken_db'], - lock_memory = ' --lockMemory' if config.get('kraken_lock_memory') else '' - shell: - kraken_shell - - rule kraken_multiple_cleaned: - input: - bams = all_kraken_inputs['cleaned'] - output: - reports = all_kraken_reports['cleaned'], - reads = all_kraken_reads['cleaned'] - resources: - threads = int(config.get("number_of_threads", 1)), - mem_mb = 120*1000 - params: - UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00'), - kraken_db = config['kraken_db'], - lock_memory = ' --lockMemory' if config.get('kraken_lock_memory') else '' - shell: - kraken_shell - -else: - rule kraken: - input: - bam = os.path.join(config["data_dir"], config["subdirs"]["per_sample"], "{sample}.{adjective}.bam") - output: - report = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kraken.report"), - reads = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.kraken.reads.gz") - resources: - threads = int(config.get("number_of_threads", 1)), - mem_mb = 120*1000 - params: - UGER = config.get('UGER_queues', {}).get('long', '-l h_rt=36:00:00'), - kraken_db = config['kraken_db'], - lock_memory = ' --lockMemory' if config.get('kraken_lock_memory') else '' - shell: - """ - {config[bin_dir]}/metagenomics.py kraken {params.kraken_db} {input.bam} --outReads {output.reads} --outReports {output.report} \ - {params.lock_memory} --threads {resources.threads} - """ - - -rule all_kraken: + shell: + """ + {config[bin_dir]}/metagenomics.py krakenuniq {config[krakenuniq_db]} {input.bam} --outReads {output.reads} --outReports {output.report} \ + --threads {resources.threads} + """ + + +rule all_krakenuniq: input: expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.raw.kraken.report"), sample=samples), + "{sample}.raw.krakenuniq.report"), sample=samples), expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.raw.kraken.krona.html"), sample=samples) + "{sample}.raw.krakenuniq.krona.html"), sample=samples) -rule all_kraken_host_depleted: +rule all_krakenuniq_host_depleted: input: expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.cleaned.kraken.report"), sample=samples), + "{sample}.cleaned.krakenuniq.report"), sample=samples), expand(os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], - "{sample}.cleaned.kraken.krona.html"), sample=samples) + "{sample}.cleaned.krakenuniq.krona.html"), sample=samples) rule krona_import_taxonomy: input: - tsv = lambda wildcards: os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], \ - ".".join([wildcards.sample, wildcards.adjective, method_props[wildcards.method]['reads_ext']])), - 
krona_db = objectify_remote(expand("{path_prefix}/{kronafile}", path_prefix=config["krona_db"], kronafile=taxfiles_krona))#["taxonomy.tab", "gi_taxid.dat"])) + tsv = os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective}.{method}.report"), + krona_db = objectify_remote(expand("{path_prefix}/{kronafile}", path_prefix=config["krona_db"], kronafile=taxfiles_krona)) output: - os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.{method,kraken|diamond|rna_bwa|rna_bwa_nodupes}.krona.html") + os.path.join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{adjective,raw|cleaned}.{method,krakenuniq|kaiju}.krona.html") resources: mem_mb=32*1000 run: krona_db_prefix = strip_protocol(config["krona_db"], relative=True) - shell("{config[bin_dir]}/metagenomics.py krona {input.tsv} "+krona_db_prefix+" {output} --noRank") + shell("{config[bin_dir]}/metagenomics.py krona {input.tsv} "+krona_db_prefix+" {output} --noRank --inputType {wildcards.method}") diff --git a/requirements-conda.txt b/requirements-conda.txt index 10a0b8204..2415d8459 100644 --- a/requirements-conda.txt +++ b/requirements-conda.txt @@ -6,7 +6,8 @@ cd-hit-auxtools=4.6.8 diamond=0.9.10 kmc=3.1.1rc1 gatk=3.8 -kraken=1.0.0_fork3 +kaiju=1.6.3_yesimon +krakenuniq=0.5.7_yesimon krona=2.7 last=876 lbzip2=2.5 diff --git a/test/input/TestMetagenomicsSimple/db/library/krakenuniq.map b/test/input/TestMetagenomicsSimple/db/library/krakenuniq.map new file mode 100644 index 000000000..5724240e4 --- /dev/null +++ b/test/input/TestMetagenomicsSimple/db/library/krakenuniq.map @@ -0,0 +1,5 @@ +NC_002549.1 186538 +NC_004161.1 186539 +NC_006432.1 186540 +NC_014373.1 565995 +NC_014372.1 186541 \ No newline at end of file diff --git a/test/input/TestMetagenomicsViralMix/db/library/krakenuniq.map b/test/input/TestMetagenomicsViralMix/db/library/krakenuniq.map new file mode 100644 index 000000000..070cd373d --- /dev/null +++ b/test/input/TestMetagenomicsViralMix/db/library/krakenuniq.map @@ -0,0 +1,7 @@ +NC_002058.3 138950 +NC_002692.1 12253 +NC_004102.1 11103 +AY184220.1 12083 +EU255973.1 31646 +KJ207374.1 12253 +KM609478.1 42788 \ No newline at end of file diff --git a/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json b/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json index 791ce4273..105cb090d 100644 --- a/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json +++ b/test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json @@ -11,6 +11,6 @@ { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-BzBpkK80x0z8GBQPfZ4kgPKp" } } ], - "stage-3.kraken_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F86qZg809y3Qvp7j7vgFKj4Z" } }, + "stage-3.krakenuniq_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-FVYQqP006zFF064QBGf022X1" } }, "stage-3.krona_taxonomy_db_tgz": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F4z0fgj07FZ8jg8yP7yz0Qzb" } } } diff --git a/test/integration/fixtures.py b/test/integration/fixtures.py new file mode 100644 index 000000000..47b5a5a45 --- /dev/null +++ b/test/integration/fixtures.py @@ -0,0 +1,50 @@ +import gzip +import os +from os.path import join +import shutil +import tools +import tools.kaiju +import tools.picard +import tools.krona +import util.file + +# @pytest.fixture(scope='module') +def krona(): + krona = tools.krona.Krona() + krona.install() + return krona + + +# 
@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix']) +def db_type(request): + return request.param + + +# @pytest.fixture(scope='module') +def krona_db(request, tmpdir_module, krona, db_type): + data_dir = join(util.file.get_test_input_path(), db_type) + db_dir = join(data_dir, 'db') + + db = join(tmpdir_module, 'krona_db_{}'.format(db_type)) + os.mkdir(db) + for d in ('names.dmp', 'nodes.dmp'): + src = join(db_dir, 'taxonomy', d) + dest = join(db, d) + if os.path.isfile(src): + os.symlink(src, dest) + krona.create_db(db) + return db + + +def taxonomy_db(request, tmpdir_module, db_type): + taxonomy = join(tmpdir_module, db_type, 'taxonomy') + shutil.copytree(join(util.file.get_test_input_path(), db_type, 'db', 'taxonomy'), + taxonomy) + prot = join(taxonomy, 'accession2taxid', 'prot.accession2taxid') + prot_gz = prot + '.gz' + + with open(prot, 'rb') as f_in: + with gzip.open(prot_gz, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + return taxonomy diff --git a/test/integration/test_diamond.py b/test/integration/test_diamond.py deleted file mode 100644 index 93f010b9e..000000000 --- a/test/integration/test_diamond.py +++ /dev/null @@ -1,158 +0,0 @@ -# Integration tests for diamond - -from builtins import super -import argparse -import fnmatch -import gzip -import os -from os.path import join -import sys -import shutil -import tempfile -import pytest -from Bio import SeqIO - -import metagenomics -import tools -import tools.diamond -import tools.picard -import util.file - -def find_files(root_dir, filt): - matches = [] - for root, dirnames, filenames in os.walk(root_dir): - for filename in fnmatch.filter(filenames, filt): - yield join(root, filename) - - -@pytest.fixture(scope='module') -def fastq_to_sam(): - return tools.picard.FastqToSamTool() - - -@pytest.fixture(scope='module') -def sam_to_fastq(): - return tools.picard.SamToFastqTool() - - -@pytest.fixture(scope='module') -def diamond(): - diamond = tools.diamond.Diamond() - diamond.install() - return diamond - - -@pytest.fixture(scope='module') -def krona(): - krona = tools.krona.Krona() - krona.install() - return krona - - -@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix']) -def db_type(request): - return request.param - - -def input_fastq_paths(): - data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple') - return [os.path.join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] - - -def input_bam_paths(): - data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix') - return join(data_dir, 'test-reads.bam') - - -@pytest.fixture(scope='module') -def input_bam(request, tmpdir_module, fastq_to_sam, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - if db_type == 'TestMetagenomicsSimple': - fastqs = [os.path.join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] - - bam_name = 'zaire_ebola.bam' - bam = os.path.join(tmpdir_module, bam_name) - fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam) - return bam - - return input_bam_paths() - - -@pytest.fixture(scope='module') -def input_fastqs(request, tmpdir_module, sam_to_fastq, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - if db_type == 'TestMetagenomicsSimple': - fastqs = [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] - return fastqs - - bam = join(data_dir, 'test-reads.bam') - basename = os.path.basename(bam) - fastq1 = join(tmpdir_module, 
'{}.1.fastq'.format(basename)) - fastq2 = join(tmpdir_module, '{}.2.fastq'.format(basename)) - sam_to_fastq.execute(bam, fastq1, fastq2) - return fastq1, fastq2 - - -@pytest.fixture(scope='module') -def taxonomy_db(request, tmpdir_module, db_type): - taxonomy = os.path.join(tmpdir_module, db_type, 'taxonomy') - shutil.copytree(join(util.file.get_test_input_path(), db_type, 'db', 'taxonomy'), - taxonomy) - prot = os.path.join(taxonomy, 'accession2taxid', 'prot.accession2taxid') - prot_gz = prot + '.gz' - - with open(prot, 'rb') as f_in: - with gzip.open(prot_gz, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - - return taxonomy - - -@pytest.fixture(scope='module') -def diamond_db(request, tmpdir_module, diamond, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - db_dir = join(data_dir, 'db') - - db = os.path.join(tmpdir_module, db_type + '.dmnd') - translated = os.path.join(tmpdir_module, db_type + '.faa') - - lib_dir = join(db_dir, 'library') - util.file.cat(translated, find_files(db_dir, '*.faa')) - - diamond.build(db, [translated]) - return os.path.splitext(db)[0] - - -TAXONOMY_FILES = ('gi_taxid_nucl.dmp', - 'gi_taxid_prot.dmp', - 'names.dmp', - 'nodes.dmp', - 'merged.dmp') - - -@pytest.fixture(scope='module') -def krona_db(request, tmpdir_module, krona, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - db_dir = os.path.join(data_dir, 'db') - - db = os.path.join(tmpdir_module, 'krona_db_{}'.format(db_type)) - os.mkdir(db) - for d in TAXONOMY_FILES: - src = join(db_dir, 'taxonomy', d) - dest = join(db, d) - os.symlink(src, dest) - krona.create_db(db) - return db - - -@pytest.mark.skipif(tools.is_osx(), reason="not currently tested under OSX") -def test_diamond(diamond_db, taxonomy_db, input_bam): - out_report = util.file.mkstempfname('.report') - out_reads = util.file.mkstempfname('.lca.tsv') - cmd = [input_bam, diamond_db, taxonomy_db, out_report, '--outReads', out_reads] - parser = metagenomics.parser_diamond(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - - assert os.path.getsize(out_report) > 0 - assert os.path.getsize(out_reads) > 0 diff --git a/test/integration/test_kaiju.py b/test/integration/test_kaiju.py new file mode 100644 index 000000000..928e08933 --- /dev/null +++ b/test/integration/test_kaiju.py @@ -0,0 +1,163 @@ +# Integration tests for kaiju +from builtins import super +import argparse +import binascii +import fnmatch +import gzip +import os +from os.path import join +import sys +import shutil +import tempfile + +import lxml.html.clean +import pytest +from Bio import SeqIO + +import metagenomics +import tools +import tools.kaiju +import tools.picard +import util.file +import test.integration.fixtures + + +def is_gz_file(filepath): + with open(filepath, 'rb') as test_f: + return binascii.hexlify(test_f.read(2)) == b'1f8b' + + +def find_files(root_dir, filt): + matches = [] + for root, dirnames, filenames in os.walk(root_dir): + for filename in fnmatch.filter(filenames, filt): + yield join(root, filename) + + +krona = pytest.fixture(scope='module')(test.integration.fixtures.krona) +krona_db = pytest.fixture(scope='module')(test.integration.fixtures.krona_db) +taxonomy_db = pytest.fixture(scope='module')(test.integration.fixtures.taxonomy_db) + + +@pytest.fixture(scope='module') +def fastq_to_sam(): + return tools.picard.FastqToSamTool() + + +@pytest.fixture(scope='module') +def sam_to_fastq(): + return tools.picard.SamToFastqTool() + + +@pytest.fixture(scope='module') +def kaiju(): + kaiju = 
tools.kaiju.Kaiju() + kaiju.install() + return kaiju + + +@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix']) +def db_type(request): + return request.param + + +def input_fastq_paths(): + data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple') + return [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] + + +def input_bam_paths(): + data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix') + return join(data_dir, 'test-reads.bam') + + +@pytest.fixture(scope='module') +def input_bam(request, tmpdir_module, fastq_to_sam, db_type): + data_dir = join(util.file.get_test_input_path(), db_type) + if db_type == 'TestMetagenomicsSimple': + fastqs = [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] + + bam_name = 'zaire_ebola.bam' + bam = join(tmpdir_module, bam_name) + fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam) + return bam + + return input_bam_paths() + + +@pytest.fixture(scope='module') +def input_fastqs(request, tmpdir_module, sam_to_fastq, db_type): + data_dir = join(util.file.get_test_input_path(), db_type) + if db_type == 'TestMetagenomicsSimple': + fastqs = [join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] + return fastqs + + bam = join(data_dir, 'test-reads.bam') + basename = os.path.basename(bam) + fastq1 = join(tmpdir_module, '{}.1.fastq'.format(basename)) + fastq2 = join(tmpdir_module, '{}.2.fastq'.format(basename)) + sam_to_fastq.execute(bam, fastq1, fastq2) + return fastq1, fastq2 + + +@pytest.fixture(scope='module') +def kaiju_db(request, tmpdir_module, kaiju, taxonomy_db, db_type): + data_dir = join(util.file.get_test_input_path(), db_type) + db_dir = join(data_dir, 'db') + + db_prefix = join(tmpdir_module, db_type) + translated = join(tmpdir_module, db_type + '.faa') + + lib_dir = join(db_dir, 'library') + util.file.cat(translated, find_files(db_dir, '*.faa')) + + kaiju.build(db_prefix, [translated], tax_db=taxonomy_db, translate_accessions=True) + return db_prefix + '.fmi' + + +def test_kaiju(kaiju_db, krona_db, taxonomy_db, input_bam): + out_report = util.file.mkstempfname('.report') + out_reads = util.file.mkstempfname('.reads') + cmd = [input_bam, kaiju_db, taxonomy_db, out_report, '--outReads', out_reads] + parser = metagenomics.parser_kaiju(argparse.ArgumentParser()) + args = parser.parse_args(cmd) + args.func_main(args) + + assert os.path.getsize(out_report) > 0 + assert os.path.getsize(out_reads) > 0 + + with util.file.open_or_gzopen(out_report) as inf: + report_lines = [x.strip().split('\t') for x in inf.readlines()] + report_lines = [x for x in report_lines if x] + + assert is_gz_file(out_reads) + assert os.path.getsize(out_report) > 0 + + if 'TestMetagenomicsSimple' in kaiju_db: + zaire_found = False + tai_found = False + for line in report_lines: + if len(line) < 2: + continue + if 'Zaire ebolavirus' in line[-2] and float(line[0]) > 40: + zaire_found = True + elif 'Tai Forest' in line[-2]: + tai_found = True + assert zaire_found + assert not tai_found + + out_html = util.file.mkstempfname('.krona.html') + parser = metagenomics.parser_krona(argparse.ArgumentParser()) + args = parser.parse_args(['--inputType', 'kaiju', out_report, krona_db, out_html]) + args.func_main(args) + + if 'TestMetagenomicsSimple' in kaiju_db: + ebola_found = False + cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=False, page_structure=False) + tree = cleaner.clean_html(lxml.html.parse(out_html)) + root_node = 
tree.xpath('//krona/node')[0] + for n in root_node.iterdescendants(): + if n.get('name') == 'Zaire ebolavirus': + if int(n.xpath('magnitude/val')[0].text) > 0: + ebola_found = True + assert ebola_found diff --git a/test/integration/test_kraken.py b/test/integration/test_kraken.py deleted file mode 100644 index 72f5cbc87..000000000 --- a/test/integration/test_kraken.py +++ /dev/null @@ -1,192 +0,0 @@ -# Integration tests for kraken - -import argparse -import os -import os.path -from os.path import join -import sys -import pytest -import metagenomics -import unittest -import util.file -import tools -import tools.kraken -import tools.krona -import tools.picard - -@pytest.fixture() -def input_bam(db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - return join(data_dir, 'test-reads.bam') - - -@pytest.fixture(scope='module') -def kraken(): - kraken = tools.kraken.Kraken() - kraken.install() - return kraken - - -@pytest.fixture(scope='module') -def krona(): - krona = tools.krona.Krona() - krona.install() - return krona - - -@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix']) -def db_type(request): - return request.param - - -@pytest.fixture(scope='module') -def kraken_db(request, tmpdir_module, kraken, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - db_dir = join(data_dir, 'db') - - parser = metagenomics.parser_kraken(argparse.ArgumentParser()) - - db = os.path.join(tmpdir_module, 'kraken_db_{}'.format(db_type)) - parser = metagenomics.parser_kraken_build(argparse.ArgumentParser()) - cmd = [db, '--library', join(db_dir, 'library'), - '--taxonomy', join(db_dir, 'taxonomy'), - '--subsetTaxonomy', - '--minimizerLen', '10', - '--clean'] - - parser.parse_args(cmd) - args = parser.parse_args(cmd) - args.func_main(args) - return db - - -TAXONOMY_FILES = ('gi_taxid_nucl.dmp', - 'gi_taxid_prot.dmp', - 'names.dmp', - 'nodes.dmp', - 'merged.dmp') - - -@pytest.fixture(scope='module') -def krona_db(request, tmpdir_module, krona, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - db_dir = os.path.join(data_dir, 'db') - - db = os.path.join(tmpdir_module, 'krona_db_{}'.format(db_type)) - os.mkdir(db) - for d in TAXONOMY_FILES: - src = join(db_dir, 'taxonomy', d) - dest = join(db, d) - if os.path.isfile(src): - os.symlink(src, dest) - krona.create_db(db) - return db - -def test_kraken(kraken_db, input_bam): - out_report = util.file.mkstempfname('.report') - out_reads = util.file.mkstempfname('.reads.gz') - cmd = [kraken_db, input_bam, '--outReports', out_report, '--outReads', out_reads] - parser = metagenomics.parser_kraken(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - - with util.file.open_or_gzopen(out_reads, 'r') as inf: - assert len(inf.read()) > 0 - with util.file.open_or_gzopen(out_report) as inf: - report_lines = [x.strip().split() for x in inf.readlines()] - - assert os.path.getsize(out_report) > 0 - - ''' - # not sure what to do with this failing test for now that never seemed to work in the past - if 'TestMetagenomicsSimple' in kraken_db: - zaire_found = False - tai_found = False - for line in report_lines: - if line[-1] == 'Zaire ebolavirus' and float(line[0]) > 90: - zaire_found = True - elif 'Tai Forest' in line[-1]: - tai_found = True - assert zaire_found - assert not tai_found - ''' - -def test_kraken_multi(kraken_db): - in_bams = list(os.path.join(util.file.get_test_input_path(), d, 'test-reads.bam') for d in ('TestMetagenomicsSimple', 
'TestMetagenomicsViralMix')) - out_reports = list(util.file.mkstempfname('.out_{}.report.txt'.format(i)) for i in (1,2)) - out_reads = list(util.file.mkstempfname('.out_{}.reads.txt.gz'.format(i)) for i in (1,2)) - cmd = [kraken_db] + in_bams \ - + ['--outReports'] + out_reports \ - + ['--outReads'] + out_reads - parser = metagenomics.parser_kraken(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - - # just check for non-empty outputs - for outfile in out_reads: - with util.file.open_or_gzopen(outfile, 'r') as inf: - assert len(inf.read()) > 0 - for outfile in out_reports: - with util.file.open_or_gzopen(outfile) as inf: - assert len(inf.read()) > 0 - -@unittest.skip("this deadlocks currently...") -def test_kraken_fifo(kraken_db): - in_bams = list(os.path.join(util.file.get_test_input_path(), d, 'test-reads.bam') for d in ('TestMetagenomicsSimple', 'TestMetagenomicsViralMix')) - out_reports = list(util.file.mkstempfname('.out_{}.report.txt'.format(i)) for i in (1,2)) - out_reads = list(util.file.mkstempfname('.out_{}.reads.txt.gz'.format(i)) for i in (1,2)) - with util.file.fifo(names=('inbam1.bam', 'inbam2.bam')) as (inbam1, inbam2): - with open(inbam1, 'wb') as b1, open(inbam2, 'wb') as b2: - p1 = subprocess.Popen(['cat', in_bams[0]], stdout=b1) - p2 = subprocess.Popen(['cat', in_bams[1]], stdout=b2) - cmd = [kraken_db, inbam1, inbam2] \ - + ['--outReports'] + out_reports \ - + ['--outReads'] + out_reads - parser = metagenomics.parser_kraken(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - print("waiting for kraken to drain fifo for first bam file") - p1.wait() - print("waiting for kraken to drain fifo for second bam file") - p2.wait() - - # just check for non-empty outputs - for outfile in out_reads: - with util.file.open_or_gzopen(outfile, 'r') as inf: - assert len(inf.read()) > 0 - for outfile in out_reports: - with util.file.open_or_gzopen(outfile) as inf: - assert len(inf.read()) > 0 - -def test_kraken_krona(kraken_db, krona_db, input_bam): - out_report = util.file.mkstempfname('.report') - out_reads = util.file.mkstempfname('.reads.gz') - - cmd = [kraken_db, input_bam, '--outReport', out_report, '--outReads', out_reads] - parser = metagenomics.parser_kraken(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - - out_html = util.file.mkstempfname('.krona.html') - parser = metagenomics.parser_krona(argparse.ArgumentParser()) - args = parser.parse_args([out_reads, krona_db, out_html]) - args.func_main(args) - -def test_kraken_on_empty(kraken_db, input_bam): - if 'TestMetagenomicsViralMix' not in kraken_db: - return - input_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam') - out_report = util.file.mkstempfname('.report') - out_reads = util.file.mkstempfname('.reads.gz') - cmd = [kraken_db, input_bam, '--outReport', out_report, '--outReads', out_reads] - parser = metagenomics.parser_kraken(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - - with util.file.open_or_gzopen(out_reads, 'r') as inf: - assert len(inf.read()) == 0 - with open(out_report, 'rt') as inf: - out_report_contents = inf.readlines() - assert len(out_report_contents) == 1 - out_report_contents = out_report_contents[0].rstrip('\n').split('\t') - assert out_report_contents == ['100.00', '0', '0', 'U', '0', 'unclassified'] diff --git a/test/integration/test_krakenuniq.py b/test/integration/test_krakenuniq.py new file mode 100644 index 000000000..b4b57cb36 --- /dev/null +++ 
b/test/integration/test_krakenuniq.py @@ -0,0 +1,141 @@ +# Integration tests for krakenuniq +import argparse +import binascii +import os +import os.path +from os.path import join + +import lxml.html.clean +import pytest + +import metagenomics +import unittest +import util.file +import tools +import tools.kraken +import tools.krona +import tools.picard +import test.integration.fixtures + + +def is_gz_file(filepath): + with open(filepath, 'rb') as test_f: + return binascii.hexlify(test_f.read(2)) == b'1f8b' + + +krona = pytest.fixture(scope='module')(test.integration.fixtures.krona) +krona_db = pytest.fixture(scope='module')(test.integration.fixtures.krona_db) + + +@pytest.fixture() +def input_bam(db_type): + data_dir = join(util.file.get_test_input_path(), db_type) + return join(data_dir, 'test-reads.bam') + + +@pytest.fixture(scope='module') +def krakenuniq(): + krakenuniq = tools.kraken.KrakenUniq() + krakenuniq.install() + return krakenuniq + + +@pytest.fixture(scope='module', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix']) +def db_type(request): + return request.param + + +@pytest.fixture(scope='module') +def krakenuniq_db(request, tmpdir_module, krakenuniq, db_type): + data_dir = join(util.file.get_test_input_path(), db_type) + db_dir = join(data_dir, 'db') + + db = join(tmpdir_module, 'krakenuniq_db_{}'.format(db_type)) + parser = metagenomics.parser_krakenuniq_build(argparse.ArgumentParser()) + cmd = [db, '--library', join(db_dir, 'library'), + '--taxonomy', join(db_dir, 'taxonomy'), + '--subsetTaxonomy', + '--minimizerLen', '10', + '--clean'] + + parser.parse_args(cmd) + args = parser.parse_args(cmd) + args.func_main(args) + return db + + +def test_krakenuniq(krakenuniq_db, input_bam): + out_report = util.file.mkstempfname('.report') + out_reads = util.file.mkstempfname('.reads.gz') + cmd = [krakenuniq_db, input_bam, '--outReports', out_report, '--outReads', out_reads] + parser = metagenomics.parser_krakenuniq(argparse.ArgumentParser()) + args = parser.parse_args(cmd) + args.func_main(args) + + with util.file.open_or_gzopen(out_reads, 'r') as inf: + assert len(inf.read()) > 0 + with util.file.open_or_gzopen(out_report) as inf: + report_lines = [x.strip().split('\t') for x in inf.readlines()] + report_lines = [x for x in report_lines if x] + + assert is_gz_file(out_reads) + assert os.path.getsize(out_report) > 0 + + if 'TestMetagenomicsSimple' in krakenuniq_db: + zaire_found = False + tai_found = False + for line in report_lines: + if 'Zaire ebolavirus' in line[-1] and float(line[0]) > 90: + zaire_found = True + elif 'Tai Forest' in line[-1]: + tai_found = True + assert zaire_found + assert not tai_found + + +def test_krakenuniq_krona(krakenuniq_db, krona_db, input_bam): + out_report = util.file.mkstempfname('.report') + out_reads = util.file.mkstempfname('.reads.gz') + + cmd = [krakenuniq_db, input_bam, '--outReport', out_report, '--outReads', out_reads] + parser = metagenomics.parser_krakenuniq(argparse.ArgumentParser()) + args = parser.parse_args(cmd) + args.func_main(args) + + out_html = util.file.mkstempfname('.krona.html') + parser = metagenomics.parser_krona(argparse.ArgumentParser()) + args = parser.parse_args(['--inputType', 'krakenuniq', out_report, krona_db, out_html]) + args.func_main(args) + + if 'TestMetagenomicsSimple' in krakenuniq_db: + ebola_found = False + cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=False, page_structure=False) + tree = cleaner.clean_html(lxml.html.parse(out_html)) + root_node = tree.xpath('//krona/node')[0] + for n in 
root_node.iterdescendants(): + if n.get('name') == 'Zaire ebolavirus': + if int(n.xpath('magnitude/val')[0].text) > 0: + ebola_found = True + assert ebola_found + + +def test_krakenuniq_on_empty(krakenuniq_db, input_bam): + if 'TestMetagenomicsViralMix' not in krakenuniq_db: + return + input_bam = join(util.file.get_test_input_path(), 'empty.bam') + out_report = util.file.mkstempfname('.report') + out_reads = util.file.mkstempfname('.reads.gz') + cmd = [krakenuniq_db, input_bam, '--outReport', out_report, '--outReads', out_reads] + parser = metagenomics.parser_krakenuniq(argparse.ArgumentParser()) + args = parser.parse_args(cmd) + args.func_main(args) + + with util.file.open_or_gzopen(out_reads, 'r') as inf: + assert len(inf.read()) == 0 + + assert is_gz_file(out_reads) + with open(out_report, 'rt') as inf: + lines = [line.strip() for line in inf.readlines() if not line.startswith('#') and not line.startswith('%')] + out_report_contents = [line for line in lines if line] + assert len(out_report_contents) == 1 + assert out_report_contents[0].split('\t') == ['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified'] diff --git a/test/integration/test_metagenomics_align.py b/test/integration/test_metagenomics_align.py deleted file mode 100644 index 264f5613e..000000000 --- a/test/integration/test_metagenomics_align.py +++ /dev/null @@ -1,112 +0,0 @@ -# Integration tests for metagenomics direct alignment - -from builtins import super -import argparse -import fnmatch -from os import listdir -import os.path -from os.path import join -import sys -import tempfile - -import pytest -from Bio import SeqIO - -import metagenomics -import util.file -import tools -import tools.bwa -import tools.krona -import tools.picard - -def find_files(root_dir, filt): - matches = [] - for root, dirnames, filenames in os.walk(root_dir): - for filename in fnmatch.filter(filenames, filt): - yield join(root, filename) - - -@pytest.fixture(scope='module') -def fastq_to_sam(): - return tools.picard.FastqToSamTool() - - -@pytest.fixture(scope='module') -def taxonomy_db(request, tmpdir_factory, db_type): - return join(util.file.get_test_input_path(), db_type, 'db', 'taxonomy') - - -@pytest.fixture(scope='module') -def input_bam(request, tmpdir_module, fastq_to_sam, db_type): - data_dir = join(util.file.get_test_input_path(), db_type) - if db_type == 'TestMetagenomicsSimple': - fastqs = [os.path.join(data_dir, f) for f in ['zaire_ebola.1.fastq', 'zaire_ebola.2.fastq']] - - bam_name = 'zaire_ebola.bam' - bam = os.path.join(tmpdir_module, bam_name) - fastq_to_sam.execute(fastqs[0], fastqs[1], '', bam) - return bam - - data_dir = join(util.file.get_test_input_path(), 'TestMetagenomicsViralMix') - return join(data_dir, 'test-reads.bam') - - -@pytest.fixture(scope='module') -def bwa(): - bwa = tools.bwa.Bwa() - bwa.install() - return bwa - - -# @pytest.fixture(scope='session', params=['TestMetagenomicsSimple', 'TestMetagenomicsViralMix']) -@pytest.fixture(scope='module', params=['TestMetagenomicsSimple']) -def db_type(request): - return request.param - - -FNA_TAXIDS = { - 'GCF_000889155.1_ViralProj51245_genomic.fna': '565995', # Bundibugyo_ebolavirus - 'GCF_000854085.1_ViralProj15006_genomic.fna': '186539', # Reston_ebolavirus - 'GCF_000855585.1_ViralProj15012_genomic.fna': '186540', # Sudan_ebolavirus - 'GCF_000888475.1_ViralProj51257_genomic.fna': '186541', # Tai_Forest_ebolavirus - 'GCF_000848505.1_ViralProj14703_genomic.fna': '186538', # Zaire_ebolavirus -} - - -@pytest.fixture(scope='module') -def bwa_db(request, 
tmpdir_module, bwa, db_type): - - data_dir = join(util.file.get_test_input_path(), db_type) - db_dir = join(data_dir, 'db') - - index_fa = os.path.join(tmpdir_module, db_type + '.bwa_index.fa') - db = os.path.join(tmpdir_module, db_type + '') - - with open(index_fa, "w") as f_out: - for fname in find_files(join(db_dir, 'library'), '*.fna'): - with open(fname) as f: - for seq_record in SeqIO.parse(f, 'fasta'): - seq_id = seq_record.id - try: - tax_id = FNA_TAXIDS[os.path.basename(fname)] - except KeyError: - continue - seq_record.id = 'taxid|{}|{}'.format(tax_id, seq_id) - SeqIO.write(seq_record, f_out, 'fasta') - - bwa.index(index_fa, output=db) - return db - - -def test_meta_bwa(bwa_db, taxonomy_db, input_bam): - out_report = util.file.mkstempfname('.report') - dupe_report = util.file.mkstempfname('.dupes.report') - out_bam = util.file.mkstempfname('.output.bam') - cmd = [input_bam, bwa_db, taxonomy_db, out_report, '--dupeReport', dupe_report, '--outBam', out_bam] - parser = metagenomics.parser_align_rna_metagenomics(argparse.ArgumentParser()) - args = parser.parse_args(cmd) - args.func_main(args) - - assert os.path.getsize(out_report) > 0 - assert os.path.getsize(dupe_report) > 0 - assert os.path.getsize(out_bam) > 0 diff --git a/test/pipelines/snakemake/test_diamond.py b/test/pipelines/snakemake/test_kaiju.py similarity index 69% rename from test/pipelines/snakemake/test_diamond.py rename to test/pipelines/snakemake/test_kaiju.py index 709abcc70..a689d4ca4 100755 --- a/test/pipelines/snakemake/test_diamond.py +++ b/test/pipelines/snakemake/test_kaiju.py @@ -7,14 +7,13 @@ import tools from test.pipelines.snakemake import snake -from test.integration.test_diamond import * # for pytest fixtures +from test.integration.test_kaiju import * # for pytest fixtures -@pytest.mark.skipif(tools.is_osx(), reason="not currently tested under OSX") @pytest.mark.skipif(sys.version_info < (3, 5), reason="Python version is too old for snakemake.") -def test_pipes(tmpdir_function, diamond_db, taxonomy_db, krona_db, input_bam): +def test_pipes(tmpdir_function, kaiju_db, taxonomy_db, krona_db, input_bam): runner = snake.SnakemakeRunner(workdir=tmpdir_function) override_config = { - 'diamond_db': diamond_db, + 'kaiju_db': kaiju_db, 'taxonomy_db': taxonomy_db, 'krona_db': krona_db, } @@ -24,12 +23,12 @@ def test_pipes(tmpdir_function, diamond_db, taxonomy_db, krona_db, input_bam): runner.create_sample_files(sample_files=['samples_metagenomics']) krona_out = join(runner.config['data_dir'], runner.config['subdirs']['metagenomics'], - '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'diamond.krona.html'])) + '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kaiju.krona.html'])) - diamond_out = join( + kaiju_out = join( runner.config['data_dir'], runner.config['subdirs']['metagenomics'], - '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'diamond.report']) + '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kaiju.report']) ) runner.run([krona_out]) - assert os.path.getsize(os.path.join(runner.workdir, diamond_out)) > 0 - assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0 \ No newline at end of file + assert os.path.getsize(os.path.join(runner.workdir, kaiju_out)) > 0 + assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0 diff --git a/test/pipelines/snakemake/test_kraken.py b/test/pipelines/snakemake/test_krakenuniq.py similarity index 69% rename from test/pipelines/snakemake/test_kraken.py rename to 
test/pipelines/snakemake/test_krakenuniq.py index ac2c446cd..063ddc9d0 100755 --- a/test/pipelines/snakemake/test_kraken.py +++ b/test/pipelines/snakemake/test_krakenuniq.py @@ -6,13 +6,13 @@ import pytest from test.pipelines.snakemake import snake -from test.integration.test_kraken import * # for pytest fixtures +from test.integration.test_krakenuniq import * # for pytest fixtures @pytest.mark.skipif(sys.version_info < (3, 5), reason="Python version is too old for snakemake.") -def test_pipes(tmpdir_function, kraken_db, krona_db, input_bam): +def test_pipes(tmpdir_function, krakenuniq_db, krona_db, input_bam): runner = snake.SnakemakeRunner(workdir=tmpdir_function) override_config = { - 'kraken_db': kraken_db, + 'krakenuniq_db': krakenuniq_db, 'krona_db': krona_db, } runner.set_override_config(override_config) @@ -20,17 +20,16 @@ def test_pipes(tmpdir_function, kraken_db, krona_db, input_bam): runner.link_samples([input_bam], destination='per_sample', link_transform=snake.rename_raw_bam) runner.create_sample_files(sample_files=['samples_metagenomics']) - kraken_out = join( + krakenuniq_out = join( runner.config['data_dir'], runner.config['subdirs']['metagenomics'], - '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kraken.report']) + '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'krakenuniq.report']) ) krona_out = join( runner.config['data_dir'], runner.config['subdirs']['metagenomics'], - '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'kraken.krona.html']) + '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw', 'krakenuniq.krona.html']) ) - # runner.run(['all_metagenomics']) - runner.run([kraken_out, krona_out]) - assert os.path.getsize(os.path.join(runner.workdir, kraken_out)) > 0 - assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0 \ No newline at end of file + runner.run([krakenuniq_out, krona_out]) + assert os.path.getsize(os.path.join(runner.workdir, krakenuniq_out)) > 0 + assert os.path.getsize(os.path.join(runner.workdir, krona_out)) > 0 diff --git a/test/pipelines/snakemake/test_metagenomics_align.py b/test/pipelines/snakemake/test_metagenomics_align.py deleted file mode 100755 index fc8fd0602..000000000 --- a/test/pipelines/snakemake/test_metagenomics_align.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python - -import os -import sys - -import pytest - -from test.pipelines.snakemake import snake -from test.integration.test_metagenomics_align import * # for pytest fixtures - -@pytest.mark.skipif(sys.version_info < (3, 5), reason="Python version is too old for snakemake.") -def test_pipes(tmpdir_function, bwa_db, taxonomy_db, input_bam): - runner = snake.SnakemakeRunner(workdir=tmpdir_function) - override_config = { - 'align_rna_db': bwa_db, - 'taxonomy_db': taxonomy_db, - } - runner.set_override_config(override_config) - runner.setup() - runner.link_samples([input_bam], destination='per_sample', link_transform=snake.rename_raw_bam) - runner.create_sample_files(sample_files=['samples_metagenomics']) - - report_out = join( - runner.config['data_dir'], runner.config['subdirs']['metagenomics'], - '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw.rna_bwa.report']) - ) - - bam_out = join( - runner.config['data_dir'], runner.config['subdirs']['metagenomics'], - '.'.join([os.path.splitext(os.path.basename(input_bam))[0], 'raw.rna_bwa.bam']) - ) - - runner.run([report_out]) - assert os.path.getsize(os.path.join(runner.workdir, report_out)) > 0 - assert 
os.path.getsize(os.path.join(runner.workdir, bam_out)) > 0 \ No newline at end of file diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py index 486043aac..66f981268 100644 --- a/test/unit/test_metagenomics.py +++ b/test/unit/test_metagenomics.py @@ -34,61 +34,6 @@ def test_help_parser_for_each_command(self): helpstring = parser.format_help() -@patch('metagenomics.kraken_dfs_report') -class TestDiamondCalls(TestCaseWithTmp): - def setUp(self): - super().setUp() - patcher = patch('subprocess.Popen') - self.addCleanup(patcher.stop) - self.mock_popen = patcher.start() - self.mock_popen.return_value.returncode = 0 - - patcher = patch('tools.picard.SamToFastqTool') - self.addCleanup(patcher.stop) - self.mock_s2f = patcher.start() - self.mock_s2f.return_value.execute.return_value.returncode = 0 - - patcher = patch('tools.diamond.Diamond', autospec=True) - self.addCleanup(patcher.stop) - self.mock_diamond = patcher.start() - - # Can't open unwritten named pipes - if six.PY2: - patcher = patch('__builtin__.open', mock.mock_open(read_data="id1\t1\n")) - else: - patcher = patch('builtins.open', mock.mock_open(read_data="id1\t1\n")) - self.addCleanup(patcher.stop) - patcher.start() - - # mock_open doesn't have __next__ for csv.reader - patcher = patch('metagenomics.taxa_hits_from_tsv', autospec=True) - self.addCleanup(patcher.stop) - self.mock_taxa_hits = patcher.start() - self.mock_taxa_hits.return_value = Counter({1: 100, 2: 40}) - - self.inBam = util.file.mkstempfname('.bam') - self.db = tempfile.mkdtemp('db') - self.tax_db = join(util.file.get_test_input_path(), 'TestMetagenomicsSimple', 'db', 'taxonomy') - - def test_output_reads(self, mock_dfs): - out_report = util.file.mkstempfname('report.txt') - out_reads = util.file.mkstempfname('lca.gz') - - metagenomics.diamond(self.inBam, self.db, self.tax_db, out_report, outReads=out_reads) - - cmd = self.mock_popen.call_args[0][0] - self.assertIn(out_reads, cmd) - assert isinstance(metagenomics.kraken_dfs_report.call_args[0][0], metagenomics.TaxonomyDb) - - def test_num_threads(self, mock_dfs): - out_report = util.file.mkstempfname('report.txt') - metagenomics.diamond(self.inBam, self.db, self.tax_db, out_report, threads=11) - expected_threads = min(11, _CPUS) - expected_threads = '--threads {}'.format(expected_threads) - cmd = self.mock_popen.call_args[0][0] - self.assertIn(expected_threads, cmd) - - class TestKronaCalls(TestCaseWithTmp): def setUp(self): @@ -103,7 +48,7 @@ def setUp(self): def test_krona_import_taxonomy(self): out_html = util.file.mkstempfname('.html') metagenomics.krona(self.inTsv, self.db, out_html, queryColumn=3, taxidColumn=5, scoreColumn=7, - noHits=True, noRank=True) + noHits=True, noRank=True, inputType='tsv') self.mock_krona().import_taxonomy.assert_called_once_with( self.db, [self.inTsv], out_html, query_column=3, taxid_column=5, score_column=7, no_hits=True, no_rank=True, magnitude_column=None, root_name=os.path.basename(self.inTsv)) @@ -310,12 +255,40 @@ def test_coverage_lca(taxa_db): assert metagenomics.coverage_lca([10, 11, 12], taxa_db.parents, 50) == 7 assert metagenomics.coverage_lca([9], taxa_db.parents) is None + +def test_krakenuniq(mocker): + p = mocker.patch('tools.kraken.KrakenUniq.pipeline') + args = [ + 'db', + 'input.bam', + '--outReports', 'output.report', + '--outReads', 'output.reads', + ] + args = metagenomics.parser_krakenuniq(argparse.ArgumentParser()).parse_args(args) + args.func_main(args) + p.assert_called_with('db', ['input.bam'], num_threads=mock.ANY, 
filter_threshold=mock.ANY, out_reports=['output.report'], out_reads=['output.reads']) + + +def test_kaiju(mocker): + p = mocker.patch('tools.kaiju.Kaiju.classify') + args = [ + 'input.bam', + 'db.fmi', + 'tax_db', + 'output.report', + '--outReads', 'output.reads', + ] + args = metagenomics.parser_kaiju(argparse.ArgumentParser()).parse_args(args) + args.func_main(args) + p.assert_called_with('db.fmi', 'tax_db', 'input.bam', output_report='output.report', num_threads=mock.ANY, output_reads='output.reads') + + class TestBamFilter(TestCaseWithTmp): def test_bam_filter_simple(self): temp_dir = tempfile.gettempdir() input_dir = util.file.get_test_input_path(self) taxonomy_dir = os.path.join(util.file.get_test_input_path(),"TestMetagenomicsSimple","db","taxonomy") - + filtered_bam = util.file.mkstempfname('.bam') args = [ os.path.join(input_dir,"input.bam"), @@ -336,7 +309,7 @@ def test_bam_filter_by_tax_id(self): temp_dir = tempfile.gettempdir() input_dir = util.file.get_test_input_path(self) taxonomy_dir = os.path.join(util.file.get_test_input_path(),"TestMetagenomicsSimple","db","taxonomy") - + filtered_bam = util.file.mkstempfname('.bam') args = [ os.path.join(input_dir,"input.bam"), @@ -352,4 +325,3 @@ def test_bam_filter_by_tax_id(self): expected_bam = os.path.join(input_dir,"expected.bam") assert_equal_bam_reads(self, filtered_bam, expected_bam) - diff --git a/test/unit/test_tools_kraken.py b/test/unit/test_tools_kraken.py index 9f40692a8..19045c7d7 100644 --- a/test/unit/test_tools_kraken.py +++ b/test/unit/test_tools_kraken.py @@ -10,8 +10,8 @@ @pytest.fixture -def kraken(): - return tools.kraken.Kraken() +def krakenuniq(): + return tools.kraken.KrakenUniq() @pytest.fixture @@ -26,68 +26,38 @@ def db(tmpdir_factory): @pytest.fixture(autouse=True) def mocks(mocker): - mock_run = mocker.patch('util.misc.run', autospec=True) mock_check_call = mocker.patch('subprocess.check_call', autospec=True) - - mock_conda = mocker.patch('tools.CondaPackage', autospec=True) - mock_conda.return_value.verify_install.return_value = "mock" - mock_conda.return_value.is_attempted.return_value = True - mock_conda.return_value.is_installed.return_value = True - mock_conda.return_value.require_executability = False - mock_conda.return_value.executable_path.return_value = "/dev/null" return { - 'conda': mock_conda, 'run': mock_run, 'check_call': mock_check_call, } -def test_kraken_classify(mocks, kraken, db, in_bam): +def test_krakenuniq_classify(mocks, krakenuniq, db, in_bam): out_reads = util.file.mkstempfname('.reads.txt') - kraken.classify(in_bam, db, out_reads) + krakenuniq.classify(in_bam, db, out_reads) args = mocks['check_call'].call_args[0][0] - assert 'kraken' == os.path.basename(args[0]) + assert 'krakenuniq' == os.path.basename(args[0]) assert util.misc.list_contains(['--db', db], args) assert util.misc.list_contains(['--output', out_reads], args) assert util.misc.list_contains(['--threads', str(_CPUS)], args) - -def test_kraken_filter(mocks, kraken, db): - in_reads = util.file.mkstempfname('.kraken_reads.unfilt.txt') - out_reads = util.file.mkstempfname('.kraken_reads.filt.txt') - for thresh in (0.05, 0.3, 0.81): - kraken.filter(in_reads, db, out_reads, thresh) - args = mocks['run'].call_args[0][0] - assert 'kraken-filter' == os.path.basename(args[0]) - assert in_reads in args - assert util.misc.list_contains(['--db', db], args) - assert util.misc.list_contains(['--threshold', str(thresh)], args) - -def test_kraken_report(mocks, kraken, db): - in_reads = util.file.mkstempfname('.kraken_reads.txt') - 
out_report = util.file.mkstempfname('.kraken_report.txt') - kraken.report(in_reads, db, out_report) - args = mocks['run'].call_args[0][0] - assert 'kraken-report' == os.path.basename(args[0]) - assert in_reads in args - assert util.misc.list_contains(['--db', db], args) - -def test_classify_num_threads(mocks, kraken, db, in_bam): +def test_classify_num_threads(mocks, krakenuniq, db, in_bam): out_reads = util.file.mkstempfname('.reads.txt') - kraken.classify(in_bam, db, out_reads) + krakenuniq.classify(in_bam, db, out_reads) args = mocks['check_call'].call_args[0][0] - assert 'kraken' == os.path.basename(args[0]) + assert 'krakenuniq' == os.path.basename(args[0]) assert '--threads' in args actual = args[args.index('--threads')+1] assert actual == str(_CPUS) for requested in (1,2,3,8,11,20): expected = min(_CPUS, requested) - kraken.classify(in_bam, db, out_reads, numThreads=requested) + krakenuniq.classify(in_bam, db, out_reads, num_threads=requested) args = mocks['check_call'].call_args[0][0] - assert 'kraken' == os.path.basename(args[0]) + assert 'krakenuniq' == os.path.basename(args[0]) assert '--threads' in args actual = args[args.index('--threads')+1] assert actual == str(expected), "failure for requested %s, expected %s, actual %s" % (requested, expected, actual) diff --git a/test/unit/test_tools_krakenuniq.py b/test/unit/test_tools_krakenuniq.py new file mode 100644 index 000000000..2c35eaf38 --- /dev/null +++ b/test/unit/test_tools_krakenuniq.py @@ -0,0 +1,93 @@ +# Unit tests for krakenuniq +import os.path + +import pytest + +import util.file +import util.misc +import tools.kraken +from test import _CPUS + + +@pytest.fixture +def kraken(): + return tools.kraken.Kraken() + + +@pytest.fixture +def in_bam(): + return os.path.join(util.file.get_test_input_path(), 'almost-empty.bam') + + +@pytest.fixture +def db(tmpdir_factory): + return str(tmpdir_factory.mktemp('db')) + + +@pytest.fixture(autouse=True) +def mocks(mocker): + + mock_run = mocker.patch('util.misc.run', autospec=True) + mock_check_call = mocker.patch('subprocess.check_call', autospec=True) + + mock_conda = mocker.patch('tools.CondaPackage', autospec=True) + mock_conda.return_value.verify_install.return_value = "mock" + mock_conda.return_value.is_attempted.return_value = True + mock_conda.return_value.is_installed.return_value = True + mock_conda.return_value.require_executability = False + mock_conda.return_value.executable_path.return_value = "/dev/null" + return { + 'conda': mock_conda, + 'run': mock_run, + 'check_call': mock_check_call, + } + + +def test_kraken_classify(mocks, kraken, db, in_bam): + out_reads = util.file.mkstempfname('.reads.txt') + kraken.classify(in_bam, db, out_reads) + args = mocks['check_call'].call_args[0][0] + assert 'kraken' == os.path.basename(args[0]) + assert util.misc.list_contains(['--db', db], args) + assert util.misc.list_contains(['--output', out_reads], args) + assert util.misc.list_contains(['--threads', str(_CPUS)], args) + + +def test_kraken_filter(mocks, kraken, db): + in_reads = util.file.mkstempfname('.kraken_reads.unfilt.txt') + out_reads = util.file.mkstempfname('.kraken_reads.filt.txt') + for thresh in (0.05, 0.3, 0.81): + kraken.filter(in_reads, db, out_reads, thresh) + args = mocks['run'].call_args[0][0] + assert 'kraken-filter' == os.path.basename(args[0]) + assert in_reads in args + assert util.misc.list_contains(['--db', db], args) + assert util.misc.list_contains(['--threshold', str(thresh)], args) + +def test_kraken_report(mocks, kraken, db): + in_reads = 
util.file.mkstempfname('.kraken_reads.txt') + out_report = util.file.mkstempfname('.kraken_report.txt') + kraken.report(in_reads, db, out_report) + args = mocks['run'].call_args[0][0] + assert 'kraken-report' == os.path.basename(args[0]) + assert in_reads in args + assert util.misc.list_contains(['--db', db], args) + +def test_classify_num_threads(mocks, kraken, db, in_bam): + out_reads = util.file.mkstempfname('.reads.txt') + + kraken.classify(in_bam, db, out_reads) + args = mocks['check_call'].call_args[0][0] + assert 'kraken' == os.path.basename(args[0]) + assert '--threads' in args + actual = args[args.index('--threads')+1] + assert actual == str(_CPUS) + + for requested in (1,2,3,8,11,20): + expected = min(_CPUS, requested) + kraken.classify(in_bam, db, out_reads, numThreads=requested) + args = mocks['check_call'].call_args[0][0] + assert 'kraken' == os.path.basename(args[0]) + assert '--threads' in args + actual = args[args.index('--threads')+1] + assert actual == str(expected), "failure for requested %s, expected %s, actual %s" % (requested, expected, actual) diff --git a/tools/kaiju.py b/tools/kaiju.py new file mode 100644 index 000000000..c44230aba --- /dev/null +++ b/tools/kaiju.py @@ -0,0 +1,185 @@ +''' +Kaiju - DNA-to-protein metagenomic classifier +''' +from builtins import super +import collections +import itertools +import logging +import os +import os.path +import shlex +import shutil +import subprocess +import tools + +from Bio import SeqIO + +import util.file + +TOOL_VERSION = '1.6.3_yesimon' + +log = logging.getLogger(__name__) + + +def read_a2t(fn, base_accession=True): + if base_accession: + accession_col = 0 + else: + accession_col = 1 + d = {} + with open(fn) as f: + f.readline() # Cannot use next(f) in python2 + for line in f.readlines(): + parts = line.split('\t') + taxid = int(parts[2]) + accession = parts[accession_col] + d[accession] = taxid + return d + + +class Kaiju(tools.Tool): + + def __init__(self, install_methods=None): + if not install_methods: + install_methods = [ + tools.CondaPackage("kaiju", version=TOOL_VERSION) + ] + super(Kaiju, self).__init__(install_methods=install_methods) + + def version(self): + return TOOL_VERSION + + def build(self, db_prefix, protein_fastas, threads=None, options=None, option_string=None, + tax_db=None, translate_accessions=False): + '''Create a kaiju database. + + Args: + db_prefix: Kaiju database prefix (file path not containing extension) to create. + protein_fastas: List of input fasta files to process. + tax_db: Contains accession2taxid used if translate_accessions is True. + translate_accessions: If fasta IDs are accessions, translate to + _ format kaiju expects. 
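+                (Sequence IDs are rewritten to '<original_id>_<taxid>' by this method.)
+            threads: Worker threads for the mkbwt step (passed as '-n', clamped to available CPUs).
+            options: Additional mkbwt options; '-a', '-n' and '-o' are set by this method.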
+ ''' + assert len(protein_fastas), ('Kaiju requires input files to create a database.') + options = options or {} + options['-a'] = 'protein' + options['-n'] = util.misc.sanitize_thread_count(threads) + options['-o'] = db_prefix + + temp_file = util.file.temp_catted_files(protein_fastas, prefix='kaiju_', suffix='.faa') + + db_fasta_fn = util.file.tempfname() + with db_fasta_fn as db_fasta_fn: + if translate_accessions: + with temp_file as input_fasta: + with open(db_fasta_fn, 'w') as db_fasta: + prot2taxid_fn = os.path.join(tax_db, 'accession2taxid', 'prot.accession2taxid') + prot2taxid = read_a2t(prot2taxid_fn) + for record in SeqIO.parse(input_fasta, 'fasta'): + accession = record.id.split('.', 1)[0] + taxid = prot2taxid.get(accession) + if taxid is not None: + new_id = '_'.join([record.id, str(taxid)]) + record.id = new_id + record.description = new_id + SeqIO.write(record, db_fasta, 'fasta') + option_string = db_fasta_fn + else: + option_string = temp_ + + self.execute('mkbwt', options=options, option_string=option_string) + self.execute('mkfmi', option_string=db_prefix) + + def classify(self, db, tax_db, in_bam, output_reads=None, output_report=None, rank='species', verbose=None, num_threads=None): + assert output_reads or output_report + output_ctx = util.file.tempfname() + with util.file.tempfname() as temp_reads: + # Fake loop to break out early if report is not wanted + while True: + tmp_fastq1 = util.file.mkstempfname('_1.fastq') + tmp_fastq2 = util.file.mkstempfname('_2.fastq') + picard = tools.picard.SamToFastqTool() + picard_opts = { + 'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute, + 'CLIPPING_ACTION': 'X' + } + picard.execute(in_bam, tmp_fastq1, tmp_fastq2, + picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts), + JVMmemory=picard.jvmMemDefault) + + nodes_dmp = os.path.join(tax_db, 'nodes.dmp') + names_dmp = os.path.join(tax_db, 'names.dmp') + + opts = { + '-t': nodes_dmp, + '-f': db, + '-o': temp_reads, + '-z': util.misc.sanitize_thread_count(num_threads), + '-i': tmp_fastq1 + } + if verbose: + opts['-v'] = None + + if os.path.getsize(tmp_fastq2) >= 50: + opts['-j'] = tmp_fastq2 + + self.execute('kaiju', options=opts) + if not output_report: + break + + opts = { + '-r': rank, + '-p': True, + '-t': nodes_dmp, + '-n': names_dmp, + '-i': temp_reads, + '-o': output_report, + '-w': None, + '-x': None + } + if verbose: + opts['-v'] = None + self.execute('kaijuReport', options=opts) + break + if output_reads: + subprocess.check_call(['pigz', '-9', temp_reads]) + shutil.move(temp_reads + '.gz', output_reads) + + def execute(self, command, options=None, option_string=None, return_stdout=False): + '''Run a kaiju command + + Args: + options: Dict of command line options to values. Set value to None + for an option with no value. + return_stdout: Whether to return stdout as well as in + (exitcode, stdout). + ''' + cmd = [command] + if options: + # We need some way to allow empty options args like --log, hence + # we filter out on 'x is None'. 
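+        # itertools.chain(*options.items()) flattens the (flag, value) pairs;
+        # a None value is dropped, so the bare flag (e.g. '-v') is emitted alone.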
+ cmd.extend([str(x) for x in itertools.chain(*options.items()) if x is not None]) + if option_string: + cmd.extend(shlex.split(option_string)) + log.debug("Calling {}: {}".format(command, " ".join(cmd))) + subprocess.check_call(cmd) + + def read_report(self, report_fn): + report = collections.Counter() + with open(report_fn) as f: + f.readline() # Cannot use next(f) in python2 + for line in f: + if line.startswith('---'): + continue + parts = line.strip().split('\t') + percent = float(parts[0]) + reads = int(parts[1]) + fullname = parts[2] + if 'cannot be assigned to a species' in fullname: + tax_id = 1 + elif 'unclassified' == fullname: + tax_id = 0 + else: + tax_id = parts[3] + report[tax_id] = reads + return report diff --git a/tools/kraken.py b/tools/kraken.py index 19a93ef89..c3eef1acc 100644 --- a/tools/kraken.py +++ b/tools/kraken.py @@ -2,6 +2,7 @@ KRAKEN metagenomics classifier ''' from __future__ import print_function +import collections import itertools import logging import os @@ -11,6 +12,7 @@ import subprocess import sys import tempfile + import tools import tools.picard import tools.samtools @@ -18,24 +20,30 @@ import util.misc from builtins import super -TOOL_NAME = 'kraken' -TOOL_VERSION = '1.0.0_fork3' +KRAKEN_VERSION = '1.0.0_fork3' +KRAKENUNIQ_VERSION = '0.5.7_yesimon' + log = logging.getLogger(__name__) + class Kraken(tools.Tool): - BINS = ['kraken', 'kraken-build', 'kraken-filter', 'kraken-mpa-report', 'kraken-report', 'kraken-translate'] + BINS = { + 'classify': 'kraken', + 'build': 'kraken-build', + 'filter': 'kraken-filter', + 'report': 'kraken-report'} def __init__(self, install_methods=None): self.subtool_name = self.subtool_name if hasattr(self, "subtool_name") else "kraken" if not install_methods: install_methods = [] - install_methods.append(tools.CondaPackage(TOOL_NAME, executable=self.subtool_name, version=TOOL_VERSION, channel='broad-viral')) + install_methods.append(tools.CondaPackage('kraken', executable=self.subtool_name, version=KRAKEN_VERSION, channel='broad-viral')) super(Kraken, self).__init__(install_methods=install_methods) def version(self): - return TOOL_VERSION + return KRAKEN_VERSION @property def libexec(self): @@ -52,7 +60,7 @@ def build(self, db, options=None, option_string=None): *args: List of input filenames to process. 
''' options['--threads'] = util.misc.sanitize_thread_count(options.get('--threads')) - self.execute('kraken-build', db, db, options=options, + self.execute(self.BINS['build'], db, db, options=options, option_string=option_string) def _db_opts(self, db, threads): @@ -115,7 +123,7 @@ def pipeline(self, db, inBams, outReports=None, outReads=None, fastq_pipes = pipes[:n_bams * 2] kraken_output_pipes = pipes[n_bams * 2:] - kraken_bin = os.path.join(self.libexec, 'kraken') + kraken_bin = 'kraken' opts = '' if lockMemory: opts += ' --lock-memory' @@ -146,13 +154,13 @@ def pipeline(self, db, inBams, outReports=None, outReads=None, if outReports: if filterThreshold is not None: - kraken_filter_bin = os.path.join(self.libexec, 'kraken-filter') + kraken_filter_bin = 'kraken-filter' cmd += ' | {kraken_filter}{tax_opts} --threshold {filterThreshold}'.format( kraken_filter=kraken_filter_bin, tax_opts=tax_filter_opts, filterThreshold=filterThreshold) - kraken_report_bin = os.path.join(self.libexec, 'kraken-report') + kraken_report_bin = 'kraken-report' cmd += ' | {kraken_report}{tax_opts} > {outReport}'.format( kraken_report=kraken_report_bin, tax_opts=tax_report_opts, @@ -207,6 +215,7 @@ def classify(self, inBam, db, outReads, numThreads=None): '--fastq-input': None, '--gzip-compressed': None, } + # Detect if input bam was paired by checking fastq 2 if os.path.getsize(tmp_fastq2) < 50: res = self.execute('kraken', db, outReads, args=[tmp_fastq1], options=opts) else: @@ -218,13 +227,13 @@ def classify(self, inBam, db, outReads, numThreads=None): def filter(self, inReads, db, outReads, filterThreshold): """Filter Kraken hits """ - self.execute('kraken-filter', db, outReads, args=[inReads], + self.execute(self.BINS['filter'], db, outReads, args=[inReads], options={'--threshold': filterThreshold}) def report(self, inReads, db, outReport): """Convert Kraken read-based output to summary reports """ - self.execute('kraken-report', db, outReport, args=[inReads]) + self.execute(self.BINS['report'], db, outReport, args=[inReads]) def execute(self, command, db, output, args=None, options=None, option_string=None): @@ -237,15 +246,17 @@ def execute(self, command, db, output, args=None, options=None, options: List of keyword options. option_string: Raw strip command line options. ''' - assert command in Kraken.BINS, 'Kraken command is unknown' options = options or {} - if command == 'kraken': - options['--output'] = output + if command == self.BINS['classify']: + if output: + options['--output'] = output + elif 'krakenuniq' in command: + options['--output'] = 'off' option_string = option_string or '' args = args or [] - cmd = [os.path.join(self.libexec, command), '--db', db] + cmd = [command, '--db', db] # We need some way to allow empty options args like --build, hence # we filter out on 'x is None'. 
         cmd.extend([str(x) for x in itertools.chain(*options.items())
@@ -254,18 +265,119 @@
         cmd.extend(args)
 
         log.debug('Calling %s: %s', command, ' '.join(cmd))
-        if command == 'kraken':
+        if command == self.BINS['classify']:
+            subprocess.check_call(cmd)
+        elif command == self.BINS['build']:
             subprocess.check_call(cmd)
-        elif command == 'kraken-build':
-            jellyfish_path = Jellyfish().install_and_get_path()
-            env = os.environ.copy()
-            env['PATH'] = ':'.join([os.path.dirname(jellyfish_path), env['PATH']])
-            subprocess.check_call(cmd, env=env)
         else:
             with util.file.open_or_gzopen(output, 'w') as of:
                 util.misc.run(cmd, stdout=of, stderr=subprocess.PIPE, check=True)
 
 
+@tools.skip_install_test()
 class Jellyfish(Kraken):
     """ Tool wrapper for Jellyfish (installed by kraken-all metapackage) """
     subtool_name = 'jellyfish'
+
+
+class KrakenUniq(Kraken):
+
+    BINS = {
+        'classify': 'krakenuniq',
+        'build': 'krakenuniq-build',
+        'filter': 'krakenuniq-filter',
+        'report': 'krakenuniq-report'}
+
+    def __init__(self, install_methods=None):
+        self.subtool_name = self.subtool_name if hasattr(self, 'subtool_name') else 'krakenuniq'
+        if not install_methods:
+            install_methods = []
+            install_methods.append(tools.CondaPackage('krakenuniq', executable=self.subtool_name, version=KRAKENUNIQ_VERSION, channel='broad-viral'))
+        super(KrakenUniq, self).__init__(install_methods=install_methods)
+
+    def version(self):
+        return KRAKENUNIQ_VERSION
+
+    def pipeline(self, db, in_bams, out_reports=None, out_reads=None,
+                 filter_threshold=None, num_threads=None):
+
+        try:
+            from itertools import zip_longest
+        except ImportError:  # Python 2 compat
+            from itertools import izip_longest as zip_longest
+        assert out_reads is not None or out_reports is not None
+        out_reports = out_reports or []
+        out_reads = out_reads or []
+
+        for in_bam, out_read, out_report in zip_longest(in_bams, out_reads, out_reports):
+            self.classify(in_bam, db, out_reads=out_read, out_report=out_report, num_threads=num_threads)
+
+    def classify(self, in_bam, db, out_reads=None, out_report=None, num_threads=None):
+        """Classify input reads (bam)
+
+        Args:
+            in_bam: unaligned reads
+            db: Kraken built database directory.
+            out_reads: Output file of command.
+        """
+        tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
+        tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
+        # Do not convert this to samtools bam2fq unless we can figure out how to replicate
+        # the clipping functionality of Picard SamToFastq
+        picard = tools.picard.SamToFastqTool()
+        picard_opts = {
+            'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
+            'CLIPPING_ACTION': 'X'
+        }
+        picard.execute(in_bam, tmp_fastq1, tmp_fastq2,
+                       picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
+                       JVMmemory=picard.jvmMemDefault)
+
+        opts = {
+            '--threads': util.misc.sanitize_thread_count(num_threads),
+            '--fastq-input': None,
+            '--gzip-compressed': None,
+            '--preload': None
+        }
+        if out_report:
+            opts['--report-file'] = out_report
+        # Detect if input bam was paired by checking fastq 2
+        if os.path.getsize(tmp_fastq2) < 50:
+            res = self.execute(self.BINS['classify'], db, out_reads, args=[tmp_fastq1], options=opts)
+        else:
+            opts['--paired'] = None
+            res = self.execute(self.BINS['classify'], db, out_reads, args=[tmp_fastq1, tmp_fastq2], options=opts)
+        os.unlink(tmp_fastq1)
+        os.unlink(tmp_fastq2)
+        if out_report:
+            with open(out_report, 'rt+') as f:
+                lines = [line.strip() for line in f.readlines() if not line.startswith('#')]
+                lines = [line for line in lines if line]
+                if not lines:
+                    f.seek(f.tell() - 1, os.SEEK_SET)
+                    print('\t'.join(['%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName']), file=f)
+                    print('\t'.join(['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified']), file=f)
+
+    def read_report(self, report_fn):
+        report = collections.Counter()
+        with open(report_fn) as f:
+            for line in f:
+                if line.startswith('#') or line.startswith('%'):
+                    continue
+                line = line.strip()
+                if not line:
+                    continue
+                parts = line.split('\t')
+                percent = float(parts[0])
+                cum_reads = int(parts[1])
+                tax_reads = int(parts[2])
+                tax_kmers = int(parts[3])
+                if parts[5] == 'NA':  # unclassified
+                    cov = 0
+                else:
+                    cov = float(parts[5])
+                tax_id = int(parts[6])
+                rank = parts[7]
+                name = parts[8]
+                report[tax_id] = (tax_reads, tax_kmers)
+        return report
diff --git a/travis/install-conda.sh b/travis/install-conda.sh
index d51779199..def3c6737 100755
--- a/travis/install-conda.sh
+++ b/travis/install-conda.sh
@@ -47,11 +47,7 @@ else # if it does not exist, we need to install miniconda
     # Use recommendations from https://github.com/bioconda/bioconda-recipes/issues/13774
     conda update --quiet -y conda
     # conda config --set channel_priority strict
-    travis_wait conda install --quiet -y pycryptosat
-    conda config --set sat_solver pycryptosat
-    conda install --quiet -y openjdk==8.0.112
 fi
 
 # update certs
-conda update --quiet -y openssl pyopenssl ca-certificates certifi
 conda info -a # for debugging
diff --git a/util/file.py b/util/file.py
index aac233bf2..6db6b1296 100644
--- a/util/file.py
+++ b/util/file.py
@@ -321,11 +321,13 @@ def mkdir_p(dirpath):
     else:
         raise
 
+
 def touch_p(path, times=None):
     '''Touch file, making parent directories if they don't exist.'''
     mkdir_p(os.path.dirname(path))
     touch(path, times=times)
 
+
 def open_or_gzopen(fname, *opts, **kwargs):
     mode = 'r'
     open_opts = list(opts)
@@ -822,34 +824,6 @@ def find_broken_symlinks(rootdir, followlinks=False):
     return broken_links_to_remove
 
 
-def join_paired_fastq(input_fastqs, output_format='fastq', num_n=None):
-    '''Join paired/interleaved fastq(s) into single reads connected by Ns'''
-    assert output_format in ('fastq', 'fasta')
-    num_n = num_n or 31
-
-    if len(input_fastqs) > 1:
-        f1 = SeqIO.parse(input_fastqs[0], 'fastq')
-        f2 = SeqIO.parse(input_fastqs[1], 'fastq')
-        records = zip(f1, f2)
-    else:
-        if input_fastqs[0] == '-':
-            input_fastqs[0] = sys.stdin
-        records = util.misc.batch_iterator(SeqIO.parse(input_fastqs[0], 'fastq'), 2)
-    for r1, r2 in records:
-        if r1.id.endswith('/1'):
-            rid = r1.id[:-2]
-        else:
-            rid = r1.id
-        jseq = Seq(str(r1.seq) + 'N' * num_n + str(r2.seq))
-        labbrevs = None
-        if output_format == 'fastq':
-            # Illumina quality score of 2 indicates unreliable base
-            labbrevs = {
-                'phred_quality': r1.letter_annotations['phred_quality'] + [2] * num_n + r2.letter_annotations['phred_quality']
-            }
-        rec = SeqRecord(jseq, id=rid, description='', letter_annotations=labbrevs)
-        yield rec
-
 def uncompressed_file_type(fname):
     """Return the original file extension of either a compressed or an uncompressed file."""
     base, ext = os.path.splitext(fname)
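
Usage note (illustrative only, not part of the patch): a minimal sketch of driving the KrakenUniq wrapper added in tools/kraken.py above, mirroring what metagenomics.krakenuniq() does after this change. The database directory, BAM, and output filenames below are placeholder examples.

import tools.kraken

kuniq = tools.kraken.KrakenUniq()
kuniq.pipeline(
    'krakenuniq_db/',                          # prebuilt KrakenUniq database directory (placeholder)
    ['sample.bam'],                            # unaligned reads, BAM format (placeholder)
    out_reports=['sample.krakenuniq.report.txt'],
    out_reads=['sample.krakenuniq.reads.txt'],
    num_threads=4)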
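
For reference, KrakenUniq.read_report() above parses the tab-separated report whose header classify() writes when krakenuniq emits an empty report ('%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName'), returning a Counter keyed by taxID with (taxReads, kmers) tuples. A small sketch of that round trip, using only the placeholder rows the patch itself writes; the filename is hypothetical.

import tools.kraken

header = '\t'.join(['%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName'])
row = '\t'.join(['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified'])
with open('example.report.txt', 'w') as f:     # hypothetical path
    f.write(header + '\n' + row + '\n')

hits = tools.kraken.KrakenUniq().read_report('example.report.txt')
# The header line starts with '%' and is skipped; the data row is parsed into
# a Counter keyed by taxID with (taxReads, kmers) values: Counter({0: (0, 0)})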