diff --git a/.circleci/config.yml b/.circleci/config.yml
index fa4edc2d..ff30bc73 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,7 +10,7 @@ version: 2.1
 jobs:
   build_and_dryrun:
     <<: *defaults
-    resource_class: small
+    resource_class: medium
     environment:
       N_THREADS: 1
       MEM: 2
@@ -304,27 +304,27 @@ workflows:
   build_and_test:
     jobs:
       - build_and_dryrun
-      - genome_quantify:
-          requires:
-            - build_and_dryrun
-            - get_example_data
+      # - genome_quantify:
+      #     requires:
+      #       - build_and_dryrun
+      #- get_example_data
       # - run_test:
       #     requires:
       #       - build_and_dryrun
       #       - get_example_data
-      - qc_and_assembly:
-          requires:
-            - build_and_dryrun
-            - get_example_data
+      # - qc_and_assembly:
+      #     requires:
+      #       - build_and_dryrun
+      #       - get_example_data
 
-      - binning:
-          requires:
-            - qc_and_assembly
-      - genecatalog:
-          requires:
-            - qc_and_assembly
+      # - binning:
+      #     requires:
+      #       - qc_and_assembly
+      # - genecatalog:
+      #     requires:
+      #       - qc_and_assembly
       # - getenvs:
       #     requires:
       #       - build_and_dryrun
diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index c809b588..028c8b61 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -1,55 +1,351 @@
-name: Python Package using Conda
+name: Combined Workflow
 
-on: [push]
+on:
+  # Runs on pushes targeting the default branch
+  push:
+    branches: ["master"]
+  pull_request:
+    branches: ["master"]
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+
+defaults:
+  run:
+    shell: bash -el {0} # use default shell
+
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 
 jobs:
-  build-linux:
+  build-and-dryrun:
     runs-on: ubuntu-latest
     strategy:
-      max-parallel: 5
-    defaults:
-      run:
-        shell: bash -el {0} # use default shell
+      max-parallel: 1
 
-    steps:
+    steps:
     - name: Checkout
       uses: actions/checkout@v3.5.2
-
-    - name: Setup_conda_and_cache
-      uses: actions/cache@v2
-      env:
-        # Increase this value to reset cache if etc/example-environment.yml has not changed
-        CACHE_NUMBER: 0
-      with:
-        path: ~/conda_pkgs_dir
-        key:
-          ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
-          hashFiles('atlasenv.yml') }}
-    - uses: conda-incubator/setup-miniconda@v2
-      with:
-        activate-environment: atlasenv
-        use-mamba: true
-        python-version: 3.11
-        mamba-version: "*"
-        channels: conda-forge,bioconda,defaults
+
+    # - name: Get current month
+    #   id: date
+    #   run: echo "date=$(date +%Y-%m)" >> "${GITHUB_OUTPUT}"
+
+    - uses: mamba-org/setup-micromamba@v1
+      with:
         environment-file: atlasenv.yml
-        use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+        environment-name: atlasenv
+        # persist on the same month.
+        cache-environment-key: atlasenv- #${{ steps.date.outputs.date }}
 
     - name: Install atlas
       run: |
-        echo "You are using env '$CONDA_PREFIX'"
         python -m pip install . --no-deps -vv
+
     - name: Test atlas
       run: |
         atlas --help
         atlas --version
-    - name: Import atlas
+
+    - name: test import modules atlas
       run: |
         python -c "from atlas import utils"
     - name: Dryrun
+      run: test/dryrun.sh
+
+
+
+  run-qc-and-assembly:
+    runs-on: ubuntu-latest
+    needs: build-and-dryrun
+    env:
+      N_THREADS: 2
+      MEM: 0.8
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3.5.2
+
+
+    - uses: mamba-org/setup-micromamba@v1
+      with:
+        environment-file: atlasenv.yml
+        environment-name: atlasenv
+        cache-environment-key: atlasenv-
+
+
+    - name: Install atlas
+      run: |
+        python -m pip install . --no-deps -vv
+        atlas --help
+
+    - name: get example data
+      id: get-data
+      uses: actions/cache/restore@v3
+      with:
+        path: test_reads.tar.gz
+        key: example-data
+
+    - name: Download test data
+      if: steps.get-data.outputs.cache-hit != 'true'
+      run: wget --quiet https://zenodo.org/record/3992790/files/test_reads.tar.gz
+
+    - name: cache example data
+      if: steps.get-data.outputs.cache-hit != 'true'
+      uses: actions/cache/save@v3
+      with:
+        path: test_reads.tar.gz
+        key: ${{ steps.get-data.outputs.cache-primary-key }}
+
+    - name: extract data
+      run: |
+        tar -xzf test_reads.tar.gz
+        ls -l test_reads
+
+    - name: get conda envs
+      id: get-envs
+      uses: actions/cache/restore@v3
+      with:
+        path: databases
+        key: conda-envs-assembly
+
+    # - name: upack conda envs
+    #   if: steps.get-envs.outputs.cache-hit != 'true'
+    #   run: tar -xzf assembly_conda_envs.tar.gz
+
+
+    - name: Init
+      run: |
+        atlas init "test_reads" --threads "$N_THREADS" --working-dir wd
+
+
+    - name: Install dependencies for qc and assembly
+      if: steps.get-envs.outputs.cache-hit != 'true'
+      run: atlas run assembly --conda-create-envs-only -w wd
+
+    - name: List envs
+      run: ls -l databases/conda_envs
+
+    - name: Store conda envs
+      if: steps.get-envs.outputs.cache-hit != 'true'
+      uses: actions/cache/save@v3
+      with:
+        path: databases
+        key: ${{ steps.get-envs.outputs.cache-primary-key }}
+
+
+    - name: Run QC
+      run: |
+        atlas run qc --max-mem $MEM --jobs=$N_THREADS --restart-times=2 --working-dir wd
+
+    - name: Test Assembly
+      run: |
+        atlas run assembly --max-mem $MEM --jobs=$N_THREADS --restart-times=2 --working-dir wd
+
+    - name: Store Logs
+      uses: actions/upload-artifact@v2
+      with:
+        name: logs
+        path: wd/logs
+
+    - name: Store Sample Logs
+      uses: actions/upload-artifact@v2
+      with:
+        name: sample_logs
+        path: wd/sample1/logs
+
+    - name: Store reports
+      uses: actions/upload-artifact@v2
+      with:
+        name: reports
+        path: wd/reports
+
+
+    - name: Cache working dir
+      uses: actions/cache/save@v3
+      with:
+        path: wd
+        key: assembly-working-dir
+
+
+
+  run-genecatalog:
+    runs-on: ubuntu-latest
+    needs: [ build-and-dryrun, run-qc-and-assembly ]
+    env:
+      N_THREADS: 2
+      MEM: 3
+
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3.5.2
+
+
+    - uses: mamba-org/setup-micromamba@v1
+      with:
+        environment-file: atlasenv.yml
+        environment-name: atlasenv
+        cache-environment-key: atlasenv-
+
+
+    - name: install atlas
+      run: |
+        python -m pip install . --no-deps -vv
+        atlas --help
+
+
+    - name: get conda envs
+      id: get-envs
+      uses: actions/cache/restore@v3
+      with:
+        path: databases
+        key: conda-envs-genecatalog
+        restore-keys: |
+          conda-envs-assembly
+          conda-envs-
+
+    - name: Restore working dir
+      uses: actions/cache/restore@v3
+      with:
+        path: wd
+        key: assembly-working-dir
+
+    - name: dryrun assembly shold need nothing to be done
+      run: |
+        ls -l wd
+        ls -l databases/conda_envs
+        atlas run assembly -w wd -n
+
+
+    - name: test Genecatalog
       run: |
-        test/dryrun.sh
+        atlas run genecatalog --restart-times=2 --working-dir wd --omit-from combine_egg_nogg_annotations combine_dram_genecatalog_annotations
+
+    - name: Store Logs
+      uses: actions/upload-artifact@v2
+      with:
+        name: logs
+        path: wd/logs
+
+    - name: Store Sample Logs
+      uses: actions/upload-artifact@v2
+      with:
+        name: sample_logs
+        path: wd/sample1/logs
+
+    - name: Store conda envs
+      #if: steps.get-envs.outputs.cache-hit != 'true'
+      uses: actions/cache/save@v3
+      with:
+        path: databases
+        key: ${{ steps.get-envs.outputs.cache-primary-key }}
+
+
+  run-binning:
+    runs-on: ubuntu-latest
+    needs: [ build-and-dryrun, run-qc-and-assembly ]
+    env:
+      N_THREADS: 2
+      MEM: 3
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3.5.2
+
+
+    - uses: mamba-org/setup-micromamba@v1
+      with:
+        environment-file: atlasenv.yml
+        environment-name: atlasenv
+        cache-environment-key: atlasenv-
+
+
+    - name: install atlas
+      run: |
+        python -m pip install . --no-deps -vv
+        atlas --help
+
+
+    - name: get conda envs
+      id: get-envs
+      uses: actions/cache/restore@v3
+      with:
+        path: databases
+        key: conda-envs-binning
+        restore-keys: |
+          conda-envs-
+
+    - name: Restore working dir
+      uses: actions/cache/restore@v3
+      with:
+        path: wd
+        fail-on-cache-miss: true
+        key: assembly-working-dir
+
+    - name: dryrun assembly shold need nothing to be done
+      run: |
+        ls -l wd
+        ls -l databases
+        ls -l databases/conda_envs
+        atlas run assembly -w wd -n
+
+
+    - name: test binning
+      run: |
+        atlas run binning --restart-times=2 --working-dir wd --omit-from checkm2_download_db
+
+    - name: Store Logs
+      uses: actions/upload-artifact@v2
+      with:
+        name: logs
+        path: wd/logs
+
+    - name: Store Sample Logs
+      uses: actions/upload-artifact@v2
+      with:
+        name: sample_logs
+        path: wd/sample1/logs
+
+    - name: Store conda envs
+      # if: steps.get-envs.outputs.cache-hit != 'true'
+      uses: actions/cache/save@v3
+      with:
+        path: databases
+        key: ${{ steps.get-envs.outputs.cache-primary-key }}
+
+  # run-quantify-genomes:
+  #   runs-on: ubuntu-latest
+  #   needs: build-and-dryrun
+
+  #   steps:
+  #   - name: Checkout
+  #     uses: actions/checkout@v3.5.2
+
+
+  #   - uses: mamba-org/setup-micromamba@v1
+  #     with:
+  #       environment-file: atlasenv.yml
+  #       environment-name: atlasenv
+  #       cache-environment-key: atlasenv-
+
+
+  #   - name: Install atlas
+  #     run: |
+  #       python -m pip install . --no-deps -vv
+  #       atlas --help
+
+
+  #   - name: get conda envs
+  #     id: get-envs
+  #     uses: actions/cache/restore@v3
+  #     with:
+  #       path: databases
+  #       key: conda-envs-assembly
+  #       restore-keys: conda-envs-*
diff --git a/atlas/atlas.py b/atlas/atlas.py
index 6edcfee2..7d9fef92 100644
--- a/atlas/atlas.py
+++ b/atlas/atlas.py
@@ -9,7 +9,7 @@ from snakemake.io import load_configfile
 
 from .make_config import validate_config
 
-from .init.atlas_init import run_init#, run_init_sra
+from .init.atlas_init import run_init  # , run_init_sra
 
 from .__init__ import __version__
 
@@ -66,7 +66,7 @@ def cli(obj):
 
 
 cli.add_command(run_init)
-#cli.add_command(run_init_sra)
+# cli.add_command(run_init_sra)
 
 
 def get_snakefile(file="workflow/Snakefile"):
diff --git a/atlas/init/atlas_init.py b/atlas/init/atlas_init.py
index 680fea43..f8b6d062 100644
--- a/atlas/init/atlas_init.py
+++ b/atlas/init/atlas_init.py
@@ -62,7 +62,6 @@ def prepare_sample_table_for_atlas(
 
     sample_table["BinGroup"] = "All"
 
-
     validate_sample_table(sample_table)
 
     sample_table.to_csv(outfile, sep="\t")
diff --git a/atlas/make_config.py b/atlas/make_config.py
index dd8365c1..a56b5b9c 100644
--- a/atlas/make_config.py
+++ b/atlas/make_config.py
@@ -236,8 +236,6 @@ def make_config(
     )
 
 
-
-
 def validate_config(config, workflow):
     conf = load_configfile(config)
 
diff --git a/atlas/sample_table.py b/atlas/sample_table.py
index 1c127d1b..dba15a00 100644
--- a/atlas/sample_table.py
+++ b/atlas/sample_table.py
@@ -93,7 +93,7 @@ def validate_bingroup_size_cobinning(sampleTable, logger):
         logger.error(
             "If you want to use co-binning, you should have at least 5-10 samples per bin group. \n"
         )
-        BinGroupSizeError("BinGroup too small")
+        raise BinGroupSizeError("BinGroup too small")
 
 
 def validate_bingroup_size_metabat(sampleTable, logger):
diff --git a/test/dryrun.sh b/test/dryrun.sh
index f6c8d733..468f5bab 100755
--- a/test/dryrun.sh
+++ b/test/dryrun.sh
@@ -12,17 +12,37 @@ atlas run --help
 databaseDir="test/databases"
 WD='test/Dryrun'
 reads_dir='test/reads/empty'
+snakemake_args=" --quiet rules $@ --dryrun "
+test_script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
-echo "touch reads dir"
+
+create_reads_dir() {
+
+local reads_dir="$1"
+local N=$2
+
+echo "touch reads dir: $reads_dir"
+
+rm -rf $reads_dir
 mkdir -p $reads_dir
 
-for sample in Sample1 Sample2 Sample3 Sample4 Sample5 Sample6;
-  do
+
+for (( i=1; i<=$N; i++ )); do
+    sample="Sample$i"
+
   for fraction in R1 R2;
     do
     touch $reads_dir/${sample}_${fraction}.fastq.gz
   done
 done
+}
+
+# need at least 10 samples for cobinning
+
+create_reads_dir $reads_dir 10
+
+
 
 rm -fr $WD
@@ -37,10 +57,10 @@
 
 atlas init --db-dir $databaseDir --threads=$NThreads -w $WD $reads_dir
 
 echo "Dryrun all"
-atlas run all -w $WD --max-mem $MaxMem --jobs $NThreads --dryrun $@
+atlas run all -w $WD $snakemake_args
 
 echo "Dryrun strains"
-atlas run genomes strains -w $WD --max-mem $MaxMem --jobs $NThreads --dryrun $@
+atlas run genomes strains -w $WD $snakemake_args
 
 
 for binner in metabat SemiBin vamb DASTool ; do
@@ -49,7 +69,7 @@ for binner in metabat SemiBin vamb DASTool ; do
 
   echo "
       Dryrun Binner $binner
       "
-  atlas run genomes -w $WD --config final_binner=$binner --dryrun $@
+  atlas run binning -w $WD --config final_binner=$binner $snakemake_args
 
 done
@@ -60,20 +80,45 @@
 echo "
     Dryrun with skip QC and megahit
     "
 #
+
+rm -fr $WD
+
 WD=${WD}/noQC
 rm -fr $WD
-atlas init --db-dir $databaseDir --threads=$NThreads --skip-qc -w $WD --assembler megahit $reads_dir
+atlas init --db-dir $databaseDir --skip-qc -w $WD --assembler megahit $reads_dir
 
-atlas run all -w $WD --dryrun $@
+atlas run all -w $WD $snakemake_args
 
-echo
 
 echo "
     execution with profile
     "
 mkdir -p $WD/local
-  printf 'cores: 1\n' > $WD/local/config.yaml
+  printf 'cores: 2\n' > $WD/local/config.yaml
+
+  atlas run qc -w $WD --profile $WD/local $snakemake_args
+
+
+# clean up
+rm -rf $WD $reads_dir
+
+
+
+
+
+
+echo "
+  test with external genomes
+  "
+
+bash $test_script_dir/test_external_genomes.sh $snakemake_args
+
+
+
+echo "
+  test init with different samples
+  "
-  atlas run qc -w $WD --dryrun --max-mem $MaxMem --jobs $NThreads --profile $WD/local $@
+bash $test_script_dir/test_init_many_samples.sh $snakemake_args
\ No newline at end of file
diff --git a/test/test_ci.sh b/test/test_ci.sh
index dd9b72cf..01ced696 100755
--- a/test/test_ci.sh
+++ b/test/test_ci.sh
@@ -9,18 +9,20 @@ set -exuo pipefail
 
 atlas --version
 
 # get test reads
-#wget https://zenodo.org/record/3992790/files/test_reads.tar.gz
-#tar -xzf test_reads.tar.gz
+wget https://zenodo.org/record/3992790/files/test_reads.tar.gz
+tar -xzf test_reads.tar.gz
+
+ls -l test_reads
 
 
 databaseDir="databases"
 WD='test_ci'
-reads_dir="example_data/reads/test"
+reads_dir="test_reads" #"example_data/reads/test"
 
 rm -f $WD/samples.tsv
 #
-atlas init --db-dir --interleaved-fastq $databaseDir -w $WD $reads_dir
+atlas init $reads_dir --db-dir $databaseDir -w $WD #--interleaved-fastq
 
 atlas run None screen -w $WD qc $@
diff --git a/test/test_external_genomes.sh b/test/test_external_genomes.sh
index 1f0b123f..0d5abc06 100755
--- a/test/test_external_genomes.sh
+++ b/test/test_external_genomes.sh
@@ -12,7 +12,7 @@ MaxMem=3
 databaseDir="test/databases"
 WD='test/genome_quant'
 reads_dir='test/reads/empty'
-
+snakemake_args=" --quiet rules $@ --dryrun "
 
 echo "touch reads dir"
 mkdir -p $reads_dir
@@ -41,4 +41,7 @@ atlas init --db-dir $databaseDir --skip-qc -w $WD $reads_dir
 
 echo "Run"
 
-atlas run None quantify_genomes -w $WD --config genome_dir="other_genomes" --dryrun $@
\ No newline at end of file
+atlas run quantify_genomes -w $WD --config genome_dir="other_genomes" $snakemake_args
+
+
+rm -rf $WD $reads_dir
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 3e520ff6..4e2eb047 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -40,7 +40,7 @@ wildcard_constraints:
 include: "rules/sample_table.smk"
 include: "rules/download.smk"  # contains hard coded variables
 include: "rules/qc.smk"
-include: "rules/screen.smk" # expects function get_input_fastq defined in qc
+include: "rules/screen.smk"  # expects function get_input_fastq defined in qc
 include: "rules/assemble.smk"
 include: "rules/binning.smk"
 include: "rules/derep.smk"
@@ -201,9 +201,10 @@ rule qc:
     output:
         touch("finished_QC"),
 
+
 rule screen:
     input:
-        "QC/screen/sketch_comparison.tsv.gz"
+        "QC/screen/sketch_comparison.tsv.gz",
 
 
 # overwrite commands in rules/download.snakefile
@@ -222,14 +223,12 @@ onerror:
 
 
 for r in workflow.rules:
     if not "mem_mb" in r.resources:
-
         if "mem" in r.resources:
             r.resources["mem_mb"] = r.resources["mem"] * 1000
         else:
             # default
             r.resources["mem_mb"] = config["mem"] * 1000
 
-
    # add time if ot present.
   # Simple jobs use simple time
 
    if "time_min" not in r.resources:
diff --git a/workflow/rules/assemble.smk b/workflow/rules/assemble.smk
index ea88f14c..74e2e2f7 100644
--- a/workflow/rules/assemble.smk
+++ b/workflow/rules/assemble.smk
@@ -70,7 +70,6 @@ if SKIP_QC & (len(MULTIFILE_FRACTIONS) < 3):
             -Xmx{resources.java_mem}G 2> {log}
             """
 
-
 else:
 
     localrules:
@@ -97,7 +96,6 @@ else:
 
 
 
-
 # rule normalize_reads:
 #     input:
@@ -273,7 +271,6 @@ if config.get("assembler", "megahit") == "megahit":
         shell:
             "cat {input} > {output}"
 
-
     def megahit_input_parsing(input):
         Nfiles = len(input)
 
@@ -350,7 +347,6 @@ if config.get("assembler", "megahit") == "megahit":
         shell:
             "cp {input} {output}"
 
-
 else:
     if PAIRED_END:
         ASSEMBLY_FRACTIONS = ["R1", "R2"]
@@ -465,16 +461,17 @@ else:
             temp("{sample}/assembly/{sample}_raw_contigs.fasta"),
         shell:
             "cp {input} {output}"
 
-# standardizes header labels within contig FASTAs
 
+# standardizes header labels within contig FASTAs
+
 
 rule rename_contigs:
     input:
         "{sample}/assembly/{sample}_raw_contigs.fasta",
     output:
         fasta="{sample}/assembly/{sample}_prefilter_contigs.fasta",
-        mapping_table = "{sample}/assembly/old2new_contig_names.tsv"
+        mapping_table="{sample}/assembly/old2new_contig_names.tsv",
     threads: config.get("simplejob_threads", 1)
     resources:
         mem=config["simplejob_mem"],
@@ -489,8 +486,6 @@ rule rename_contigs:
         "../scripts/rename_assembly.py"
 
 
-
-
 if config["filter_contigs"]:
 
     ruleorder: align_reads_to_prefilter_contigs > align_reads_to_final_contigs
@@ -570,9 +565,10 @@ if config["filter_contigs"]:
                    minl={params.minl} \
                    trim={params.trim} \
                    -Xmx{resources.java_mem}G 2> {log}"""
 
-# HACK: this makes two copies of the same file
 
+# HACK: this makes two copies of the same file
+
 
 else:  # no filter
diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk
index a1082726..233d2da7 100644
--- a/workflow/rules/binning.smk
+++ b/workflow/rules/binning.smk
@@ -137,11 +137,13 @@ rule get_metabat_depth_file:
         "{sample}/binning/metabat/metabat.log",
     conda:
         "../envs/metabat.yaml"
-    threads: config["threads"] # multithreaded trough OMP_NUM_THREADS
+    threads: config["threads"]  # multithreaded trough OMP_NUM_THREADS
     resources:
-        mem_mb=config["mem"]*1000,
+        mem_mb=config["mem"] * 1000,
     params:
-        minid = lambda wc, input: config["cobinning_readmapping_id"] *100 if len(input.bams)>1 else 97
+        minid=lambda wc, input: config["cobinning_readmapping_id"] * 100
+        if len(input.bams) > 1
+        else 97,
     shell:
         "jgi_summarize_bam_contig_depths "
         " --percentIdentity {params.minid} "
diff --git a/workflow/rules/cobinning.smk b/workflow/rules/cobinning.smk
index d68d7d79..810ab7e0 100644
--- a/workflow/rules/cobinning.smk
+++ b/workflow/rules/cobinning.smk
@@ -30,14 +30,14 @@ rule filter_contigs:
 
 
 def get_samples_of_bingroup(wildcards):
-
-    samples_of_group= sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
+    samples_of_group = sampleTable.query(
+        f'BinGroup=="{wildcards.bingroup}"'
+    ).index.tolist()
 
     return samples_of_group
 
 
 def get_filtered_contigs_of_bingroup(wildcards):
-
     samples_of_group = get_samples_of_bingroup(wildcards)
 
     if len(samples_of_group) < 5:
@@ -51,7 +51,6 @@ def get_filtered_contigs_of_bingroup(wildcards):
 
 
 def get_bams_of_bingroup(wildcards):
-
     samples_of_group = get_samples_of_bingroup(wildcards)
 
     return expand(
@@ -63,7 +62,7 @@ def get_bams_of_bingroup(wildcards):
 
 rule combine_contigs:
     input:
-        fasta= get_filtered_contigs_of_bingroup,
+        fasta=get_filtered_contigs_of_bingroup,
     output:
         "Intermediate/cobinning/{bingroup}/combined_contigs.fasta.gz",
     log:
@@ -80,9 +79,9 @@ rule combine_contigs:
                 with gz.open(input_fasta, "rb") as fin:
                     for line in fin:
                         # if line is a header add sample name
-                        if line[0] == ord('>'):
+                        if line[0] == ord(">"):
                             line = f">{sample}{params.seperator}".encode() + line[1:]
-                        # write each line to the combined file
+                            # write each line to the combined file
                         fout.write(line)
 
 
@@ -95,7 +94,7 @@ rule minimap_index:
         index_size="12G",
     resources:
         mem=config["mem"],  # limited num of fatnodes (>200g)
-    threads: config["simplejob_threads"],
+    threads: config["simplejob_threads"]
     log:
         "logs/cobinning/{bingroup}/minimap_index.log",
     benchmark:
@@ -142,7 +141,9 @@ rule minimap:
     shell:
         """minimap2 -t {threads} -ax sr {input.mmi} {input.fq} | grep -v "^@" | cat {input.dict} - | samtools view -F 3584 -b - > {output.bam} 2>{log}"""
 
-    # samtools filters out secondary alignments
+
+# samtools filters out secondary alignments
+
 
 ruleorder: sort_bam > minimap
 
@@ -156,8 +157,8 @@ rule sort_bam:
         prefix="Intermediate/cobinning/{bingroup}/bams/tmp.{sample}",
     threads: 2
     resources:
-        mem_mb=config["simplejob_mem"] *1000,
-        time_min=int(config["runtime"]["simplejob"]*60),
+        mem_mb=config["simplejob_mem"] * 1000,
+        time_min=int(config["runtime"]["simplejob"] * 60),
     log:
         "logs/cobinning/{bingroup}/mapping/sortbam/{sample}.log",
     conda:
@@ -175,14 +176,14 @@ rule summarize_bam_contig_depths:
         "logs/cobinning/{bingroup}/combine_coverage.log",
     conda:
         "../envs/metabat.yaml"
-    threads: config["threads"] # multithreaded trough OMP_NUM_THREADS
+    threads: config["threads"]  # multithreaded trough OMP_NUM_THREADS
     benchmark:
         "logs/benchmarks/cobinning/{bingroup}/summarize_bam_contig_depths.tsv"
     resources:
-        mem_mb=config["mem"]*1000,
-        time_min = config["runtime"]["long"]*60
+        mem_mb=config["mem"] * 1000,
+        time_min=config["runtime"]["long"] * 60,
     params:
-        minid = config["cobinning_readmapping_id"] *100
+        minid=config["cobinning_readmapping_id"] * 100,
     shell:
         "jgi_summarize_bam_contig_depths "
         " --percentIdentity {params.minid} "
@@ -218,8 +219,8 @@ rule run_vamb:
         "../envs/vamb.yaml"
     threads: config["threads"]
     resources:
-        mem_mb=config["mem"]*1000,
-        time_min=config["runtime"]["long"]*60,
+        mem_mb=config["mem"] * 1000,
+        time_min=config["runtime"]["long"] * 60,
     log:
         "logs/cobinning/run_vamb/{bingroup}.log",
     benchmark:
@@ -258,7 +259,7 @@ rule parse_vamb_output:
         fasta_extension=".fna",
         output_path=lambda wc: vamb_cluster_attribution_path,  # path with {sample} to replace
         samples=SAMPLES,
-        bingroups = sampleTable.BinGroup.unique()
+        bingroups=sampleTable.BinGroup.unique(),
     conda:
         "../envs/fasta.yaml"
     script:
diff --git a/workflow/rules/derep.smk b/workflow/rules/derep.smk
index 60f0f352..63312277 100644
--- a/workflow/rules/derep.smk
+++ b/workflow/rules/derep.smk
@@ -42,7 +42,6 @@ rule skani_2_parquet:
     threads: 1
     run:
         try:
-
             skani_column_dtypes = {
                 "Ref_file": "category",
                 "Query_file": "category",
diff --git a/workflow/rules/genecatalog.smk b/workflow/rules/genecatalog.smk
index 313cd485..f82e0a2a 100644
--- a/workflow/rules/genecatalog.smk
+++ b/workflow/rules/genecatalog.smk
@@ -50,7 +50,6 @@ if config["genecatalog"]["source"] == "contigs":
             cat_files(input.fna, output.fna)
             cat_files(input.short, output.short)
 
-
 else:
 
     localrules:
@@ -175,15 +174,15 @@ if (config["genecatalog"]["clustermethod"] == "linclust") or (
             "logs/Genecatalog/clustering/generate_orf_info.log",
         script:
             "../scripts/generate_orf_info.py"
 
-# cluster genes with cd-hit-est
 
+# cluster genes with cd-hit-est
+
 elif config["genecatalog"]["clustermethod"] == "cd-hit-est":
 
     include: "cdhit.smk"
 
-
 else:
     raise Exception(
         "Didn't understood the genecatalog clustermethod: {}".format(
@@ -388,7 +387,6 @@ rule combine_gene_coverages:
         old_subset_folder = Path("Genecatalog/subsets/genes")
         new_subset_folder = "Intermediate/genecatalog/subsets"
         if old_subset_folder.exists():
-
             logger.info(f"I move {old_subset_folder} to {new_subset_folder}")
 
             import shutil
@@ -416,7 +414,6 @@ checkpoint gene_subsets:
 
 
 def get_subset_names(wildcards):
-
     dir_for_subsets = Path(checkpoints.gene_subsets.get(**wildcards).output[0])
 
     subset_names = glob_wildcards(str(dir_for_subsets / "{subset}.faa")).subset
@@ -518,7 +515,6 @@ rule combine_egg_nogg_annotations:
         time=config["runtime"]["default"],
     run:
         try:
-
             import pandas as pd
 
             Tables = [
@@ -538,7 +534,6 @@ rule combine_egg_nogg_annotations:
 
             combined.to_parquet(output[0], index=False)
         except Exception as e:
-
             import traceback
 
             with open(log[0], "w") as logfile:
@@ -566,7 +561,6 @@ rule convert_eggNOG_tsv2parquet:
 
             df.to_parquet(output[0], index=False)
         except Exception as e:
-
             import traceback
 
             with open(log[0], "w") as logfile:
@@ -618,7 +612,6 @@ rule DRAM_annotate_genecatalog:
 
 
 def combine_genecatalog_dram_input(wildcards):
-
     all_subsets = get_subset_names(wildcards)
 
     return expand(
diff --git a/workflow/rules/genomes.smk b/workflow/rules/genomes.smk
index c17db795..ea53d19d 100644
--- a/workflow/rules/genomes.smk
+++ b/workflow/rules/genomes.smk
@@ -90,7 +90,7 @@ rule get_contig2genomes:
             if ext == ".gz":
                 bin_name = os.path.splitext(bin_name)[0]
 
-        # write names of contigs in mapping file
+            # write names of contigs in mapping file
             with open(fasta) as f:
                 for line in f:
                     if line[0] == ">":
@@ -226,7 +226,6 @@ if config["genome_aligner"] == "minimap":
         wrapper:
             "v1.19.0/bio/minimap2/aligner"
 
-
 elif config["genome_aligner"] == "bwa":
 
     rule index_genomes:
@@ -262,7 +261,6 @@ elif config["genome_aligner"] == "bwa":
         wrapper:
             "v1.19.0/bio/bwa-mem2/mem"
 
-
 else:
     raise Exception(
         "'genome_aligner' not understood, it should be 'minimap' or 'bwa', got '{genome_aligner}'. check config file".format(
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 360e0860..1629ed85 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -184,7 +184,6 @@ if not SKIP_QC:
             " -Xmx{resources.java_mem}G "
             " 2> {log}"
 
-
     PROCESSED_STEPS.append("filtered")
 
     rule apply_quality_filter:
@@ -209,8 +208,8 @@ if not SKIP_QC:
         params:
             ref=(
                 "ref=%s" % config.get("preprocess_adapters")
-                if config.get("preprocess_adapters")
-                else ""
+                if config.get("preprocess_adapters")
+                else ""
             ),
             mink=(
                 ""
@@ -302,7 +301,6 @@ if not SKIP_QC:
             " -Xmx{resources.java_mem}G "
             " 2> {log}"
 
-
     # if there are no references, decontamination will be skipped
     if len(config.get("contaminant_references", {}).keys()) > 0:
         PROCESSED_STEPS.append("clean")
@@ -406,7 +404,6 @@ if not SKIP_QC:
                 -Xmx{resources.java_mem}G 2>> {log}
             """
 
-
     PROCESSED_STEPS.append("QC")
 
     localrules:
@@ -436,7 +433,7 @@ if not SKIP_QC:
                     with open(input.rrna_reads[i], "rb") as infile2:
                         shutil.copyfileobj(infile2, outFile)
 
-        # append to sample table
+            # append to sample table
             sample_table = load_sample_table(params.sample_table)
             qc_header = [f"Reads_QC_{fraction}" for fraction in MULTIFILE_FRACTIONS]
             sample_table.loc[wildcards.sample, qc_header] = output
@@ -444,7 +441,6 @@ if not SKIP_QC:
 
 
 
-
 #### STATS
 
 
@@ -489,7 +485,6 @@ if PAIRED_END:
             readlength.sh {params.inputs} out={output.read_length} 2>> {log}
             """
 
-
 else:
 
     rule calculate_read_length_hist:
@@ -541,7 +536,6 @@ rule combine_read_length_stats:
 
 
 
-
 # rule combine_cardinality:
 #     input:
 #         expand("{sample}/sequence_quality_control/read_stats/QC_cardinality.txt",sample=SAMPLES),
@@ -595,7 +589,6 @@ if PAIRED_END:
 
 
 
-
 localrules:
     combine_read_counts,
     write_read_counts,
diff --git a/workflow/rules/sample_table.smk b/workflow/rules/sample_table.smk
index ff0e05a3..d8155029 100644
--- a/workflow/rules/sample_table.smk
+++ b/workflow/rules/sample_table.smk
@@ -187,12 +187,11 @@ def get_assembly(wildcards):
 
     """
 
-    Header= "Assembly"
+    Header = "Assembly"
 
     try:
         return get_files_from_sampleTable(wildcards.sample, Header)
 
     except FileNotInSampleTableException:
         # return files as named by atlas pipeline
-        return "{sample}/{sample}_contigs.fasta".format(sample=wildcards.sample )
-
+        return "{sample}/{sample}_contigs.fasta".format(sample=wildcards.sample)
diff --git a/workflow/rules/screen.smk b/workflow/rules/screen.smk
index a94d185b..f60346a1 100644
--- a/workflow/rules/screen.smk
+++ b/workflow/rules/screen.smk
@@ -3,7 +3,7 @@ rule generate_sketch:
     input:
         unpack(get_input_fastq),
     output:
-        "Intermediate/screen/sketches/{sample}.sketch.gz"
+        "Intermediate/screen/sketches/{sample}.sketch.gz",
     log:
         "logs/screen/make_sketch/{sample}.log",
     conda:
@@ -14,20 +14,21 @@ rule generate_sketch:
         java_mem=int(config["simplejob_mem"] * JAVA_MEM_FRACTION),
     shell:
         "bbsketch.sh "
-        "in={input[0]}" # take only one read
+        "in={input[0]}"
         " samplerate=0.5"
         " minkeycount=2 "
         " out={output} "
         " blacklist=nt ssu=f name0={wildcards.sample} depth=t overwrite=t "
         " -Xmx{resources.java_mem}g "
         " &> {log}"
+        # take only one read
 
 
 rule compare_sketch:
     input:
-        expand( rules.generate_sketch.output, sample =SAMPLES ),
+        expand(rules.generate_sketch.output, sample=SAMPLES),
     output:
-        "QC/screen/sketch_comparison.tsv.gz"
+        "QC/screen/sketch_comparison.tsv.gz",
     priority: 100
     log:
         "logs/screen/compare_sketch.log",
@@ -46,5 +47,4 @@ rule compare_sketch:
         " &> {log}"
 
 
-
-# sendsketch.sh sample2.sketch printdepth2=t level=2 printqfname=f printvolume=t color=f out
\ No newline at end of file
+# sendsketch.sh sample2.sketch printdepth2=t level=2 printqfname=f printvolume=t color=f out
diff --git a/workflow/rules/semibin.smk b/workflow/rules/semibin.smk
index 761033c2..13be0cfa 100644
--- a/workflow/rules/semibin.smk
+++ b/workflow/rules/semibin.smk
@@ -73,7 +73,6 @@ rule semibin_train:
 
 
 def semibin_input(wildcards):
-
     bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"]
     samples_of_bingroup = sampleTable.query(
         f'BinGroup=="{bingroup_of_sample}"'
diff --git a/workflow/scripts/combine_dram_gene_annotations.py b/workflow/scripts/combine_dram_gene_annotations.py
index 90cc3073..0dad7690 100644
--- a/workflow/scripts/combine_dram_gene_annotations.py
+++ b/workflow/scripts/combine_dram_gene_annotations.py
@@ -72,7 +72,6 @@ def handle_exception(exc_type, exc_value, exc_traceback):
         cols = db_columns[db]
 
         if not df.columns.intersection(cols).empty:
-
             Tables[db].append(df[cols].dropna(axis=0, how="all"))
 
     del df
@@ -81,7 +80,6 @@ def handle_exception(exc_type, exc_value, exc_traceback):
 out_dir.mkdir()
 
 for db in Tables:
-
     combined = pd.concat(Tables[db], axis=0)
 
     combined.sort_index(inplace=True)
diff --git a/workflow/scripts/filter_genes.py b/workflow/scripts/filter_genes.py
index fb33afc9..b399db59 100644
--- a/workflow/scripts/filter_genes.py
+++ b/workflow/scripts/filter_genes.py
@@ -34,8 +34,8 @@ def handle_exception(exc_type, exc_value, exc_traceback):
 
 import pyfastx
 
-faa_iterator = pyfastx.Fastx(snakemake.input.faa, format="fasta",comment=True)
-fna_iterator = pyfastx.Fastx(snakemake.input.fna, format="fasta",comment=True)
+faa_iterator = pyfastx.Fastx(snakemake.input.faa, format="fasta", comment=True)
+fna_iterator = pyfastx.Fastx(snakemake.input.fna, format="fasta", comment=True)
 
 
 with open(snakemake.output.faa, "w") as out_faa, open(