diff --git a/.gitignore b/.gitignore
index c8bdb77..68263f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ auspice/
 build/
 logs/
 benchmarks/
+ingest/fauna/
+ingest/andersen-lab/
 
 # Sensitive environment variables
 environment*
diff --git a/ingest/README.md b/ingest/README.md
index d2507b0..a9662b9 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -28,7 +28,7 @@ nextstrain build \
     .
 ```
 
-This command produces one metadata file, `results/metadata.tsv`, and one sequences file per gene segment like `results/sequences_ha.fasta`.
+This command produces one metadata file, `fauna/results/metadata.tsv`, and one sequences file per gene segment like `fauna/results/sequences_ha.fasta`.
 Each file represents all available subtypes.
 
 Add the `upload_all` target to the command above to run the complete ingest pipeline _and_ upload results to AWS S3.
@@ -54,7 +54,7 @@ It does not merge or deduplicate the data with the fauna data used in the defaul
 nextstrain build . merge_andersen_segment_metadata
 ```
 
-The results will be available in `results/andersen-lab/`.
+The results will be available in `andersen-lab/results/`.
 
 ## Configuration
 
diff --git a/ingest/Snakefile b/ingest/Snakefile
index 4adcbea..af7412a 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -1,19 +1,22 @@
 path_to_fauna = '../fauna'
-config["s3_dst"] = "s3://nextstrain-data-private/files/workflows/avian-flu"
-config["segments"] = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
+
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "defaults/config.yaml"
 
 wildcard_constraints:
     segment = "|".join(config["segments"])
 
 rule all:
+    # As of 2024-05-16 the default ingest only ingests data from fauna
     input:
-        sequences=expand("results/metadata_{segment}.tsv", segment=config["segments"]),
-        metadata="results/metadata.tsv",
+        sequences=expand("fauna/results/sequences_{segment}.fasta", segment=config["segments"]),
+        metadata="fauna/results/metadata.tsv",
 
 rule upload_all:
+    # As of 2024-05-16 the default upload only uploads data from fauna
    input:
-        sequences=expand("s3/sequences_{segment}.done", segment=config["segments"]),
-        metadata="s3/metadata.done",
+        sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]),
+        metadata="fauna/s3/metadata.done",
 
 include: "rules/upload_from_fauna.smk"
 include: "rules/ingest_andersen_lab.smk"
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
new file mode 100644
index 0000000..c46739f
--- /dev/null
+++ b/ingest/defaults/config.yaml
@@ -0,0 +1,11 @@
+segments:
+  - pb2
+  - pb1
+  - pa
+  - ha
+  - np
+  - na
+  - mp
+  - ns
+
+s3_dst: "s3://nextstrain-data-private/files/workflows/avian-flu"
diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/rules/ingest_andersen_lab.smk
index eca87b6..023d547 100644
--- a/ingest/rules/ingest_andersen_lab.smk
+++ b/ingest/rules/ingest_andersen_lab.smk
@@ -6,7 +6,7 @@ from the Andersen Lab's avian-influenza repo
 
 rule fetch_andersen_lab_repo:
     output:
-        andersen_lab_repo = temp("data/andersen-lab-avian-influenza.tar.gz")
+        andersen_lab_repo = temp("andersen-lab/data/avian-influenza.tar.gz")
     shell:
         """
         curl -fsSL \
@@ -18,29 +18,33 @@ rule fetch_andersen_lab_repo:
 
 rule extract_metadata:
     input:
-        andersen_lab_repo = "data/andersen-lab-avian-influenza.tar.gz"
+        andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
     output:
-        metadata = "data/andersen-lab/PRJNA1102327_metadata.csv"
+        metadata = "andersen-lab/data/PRJNA1102327_metadata.csv"
+    params:
+        output_dir = lambda wildcards, output: Path(output.metadata).parent
     shell:
         """
         tar xz --file={input.andersen_lab_repo} \
            --strip-components=2 \
-           -C data/andersen-lab \
+           -C {params.output_dir} \
            --wildcards \
            "*/metadata/PRJNA1102327_metadata.csv"
         """
 
 rule extract_consensus_sequences:
     input:
-        andersen_lab_repo = "data/andersen-lab-avian-influenza.tar.gz"
+        andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
     output:
-        fasta = directory("data/andersen-lab/fasta"),
-        output_flag = touch("data/andersen-lab/extract_consensus_sequences.done")
+        fasta = directory("andersen-lab/data/fasta"),
+        output_flag = touch("andersen-lab/data/extract_consensus_sequences.done")
+    params:
+        output_dir = lambda wildcards, output: Path(output.fasta).parent
     shell:
         """
         tar xz --file={input.andersen_lab_repo} \
            --strip-components=1 \
-           -C data/andersen-lab \
+           -C {params.output_dir} \
            --wildcards \
            "*/fasta"
         """
@@ -51,14 +55,14 @@ rule rename_and_concatenate_segment_fastas:
     and concatenate FASTAs of the same segment
     """
     input:
-        extract_consensus_sequences_flag = "data/andersen-lab/extract_consensus_sequences.done"
+        extract_consensus_sequences_flag = "andersen-lab/data/extract_consensus_sequences.done"
     output:
-        fasta = "data/andersen-lab/{segment}.fasta"
+        fasta = "andersen-lab/data/{segment}.fasta"
     params:
         segment = lambda wildcards: wildcards.segment.upper()
     shell:
         """
-        for fasta in data/andersen-lab/fasta/*_{params.segment}_cns.fa; do
+        for fasta in andersen-lab/data/fasta/*_{params.segment}_cns.fa; do
             seqkit replace \
                 -p "Consensus_(SRR[0-9]+)_.*" \
                 -r '$1' \
@@ -69,12 +73,12 @@ rule rename_and_concatenate_segment_fastas:
 
 rule curate_metadata:
     input:
-        metadata = "data/andersen-lab/PRJNA1102327_metadata.csv",
+        metadata = "andersen-lab/data/PRJNA1102327_metadata.csv",
         geolocation_rules = "defaults/geolocation_rules.tsv"
     output:
-        metadata = "data/andersen-lab/metadata.tsv"
+        metadata = "andersen-lab/data/metadata.tsv"
     log:
-        "logs/curate_metadata.txt",
+        "andersen-lab/logs/curate_metadata.txt",
     shell:
         """
         augur curate normalize-strings \
@@ -92,13 +96,13 @@ rule match_metadata_and_segment_fasta:
     and outputs the matching metadata TSV and sequence FASTAs per segment.
     """
     input:
-        metadata = "data/andersen-lab/metadata.tsv",
-        fasta = "data/andersen-lab/{segment}.fasta"
+        metadata = "andersen-lab/data/metadata.tsv",
+        fasta = "andersen-lab/data/{segment}.fasta"
     output:
-        metadata = "results/andersen-lab/metadata_{segment}.tsv",
-        fasta = "results/andersen-lab/sequences_{segment}.fasta"
+        metadata = "andersen-lab/results/metadata_{segment}.tsv",
+        fasta = "andersen-lab/results/sequences_{segment}.fasta"
     log:
-        "logs/match_segment_metadata_and_fasta/{segment}.txt",
+        "andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt",
     shell:
         """
         augur curate passthru \
@@ -121,10 +125,10 @@ rule merge_andersen_segment_metadata:
     have sequence data (no QC performed).
     """
     input:
-        segments = expand("results/andersen-lab/metadata_{segment}.tsv", segment=config["segments"]),
-        metadata = "results/andersen-lab/metadata_ha.tsv",
+        segments = expand("andersen-lab/results/metadata_{segment}.tsv", segment=config["segments"]),
+        metadata = "andersen-lab/results/metadata_ha.tsv",
     output:
-        metadata = "results/andersen-lab/metadata.tsv",
+        metadata = "andersen-lab/results/metadata.tsv",
     shell:
         """
         python scripts/add_segment_counts.py \
diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk
index 86be0c1..d072738 100644
--- a/ingest/rules/upload_from_fauna.smk
+++ b/ingest/rules/upload_from_fauna.smk
@@ -1,10 +1,15 @@
+from pathlib import Path
+
+
 rule download_segment:
     output:
-        sequences = "data/{segment}.fasta",
+        sequences = "fauna/data/{segment}.fasta",
     params:
         fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
+        output_dir = lambda wildcards, output: Path(output.sequences).parent,
+        output_fstem = lambda wildcards, output: Path(output.sequences).stem,
     benchmark:
-        "benchmarks/download_segment_{segment}.txt"
+        "fauna/benchmarks/download_segment_{segment}.txt"
     shell:
         """
         python3 {path_to_fauna}/vdb/download.py \
@@ -12,16 +17,16 @@ rule download_segment:
             --virus avian_flu \
             --fasta_fields {params.fasta_fields} \
             --select locus:{wildcards.segment} \
-            --path data \
-            --fstem {wildcards.segment}
+            --path {params.output_dir} \
+            --fstem {params.output_fstem}
         """
 
 rule parse_segment:
     input:
-        sequences = "data/{segment}.fasta",
+        sequences = "fauna/data/{segment}.fasta",
     output:
-        sequences = "results/sequences_{segment}.fasta",
-        metadata = "results/metadata_{segment}.tsv",
+        sequences = "fauna/results/sequences_{segment}.fasta",
+        metadata = "fauna/results/metadata_{segment}.tsv",
     params:
         fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
         prettify_fields = "region country division location host originating_lab submitting_lab authors PMID"
@@ -44,10 +49,10 @@ rule merge_segment_metadata:
     for each segment, but that would be a nice improvement.
     """
    input:
-        segments = expand("results/metadata_{segment}.tsv", segment=config["segments"]),
-        metadata = "results/metadata_ha.tsv",
+        segments = expand("fauna/results/metadata_{segment}.tsv", segment=config["segments"]),
+        metadata = "fauna/results/metadata_ha.tsv",
     output:
-        metadata = "results/metadata.tsv",
+        metadata = "fauna/results/metadata.tsv",
     shell:
         """
         python scripts/add_segment_counts.py \
@@ -58,9 +63,9 @@ rule merge_segment_metadata:
 
 rule upload_sequences:
     input:
-        sequences="results/sequences_{segment}.fasta",
+        sequences="fauna/results/sequences_{segment}.fasta",
     output:
-        flag=touch("s3/sequences_{segment}.done"),
+        flag=touch("fauna/s3/sequences_{segment}.done"),
     params:
         s3_dst=config["s3_dst"],
     shell:
@@ -73,9 +78,9 @@ rule upload_sequences:
 
 rule upload_metadata:
     input:
-        metadata="results/metadata.tsv",
+        metadata="fauna/results/metadata.tsv",
     output:
-        flag=touch("s3/metadata.done"),
+        flag=touch("fauna/s3/metadata.done"),
     params:
         s3_dst=config["s3_dst"],
     shell:
diff --git a/ingest/scripts/add_segment_counts.py b/ingest/scripts/add_segment_counts.py
index b302d5e..c94d2d8 100644
--- a/ingest/scripts/add_segment_counts.py
+++ b/ingest/scripts/add_segment_counts.py
@@ -63,7 +63,7 @@ def summary(strain_count):
         row[column]=strain_count[row['strain']]
 
     with open(args.output, 'w') as fh:
-        writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
+        writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t', lineterminator='\n')
         writer.writeheader()
         for row in rows:
             writer.writerow(row)
diff --git a/rules/common.smk b/rules/common.smk
index 2603f9b..944ecf9 100644
--- a/rules/common.smk
+++ b/rules/common.smk
@@ -15,7 +15,7 @@ if LOCAL_INGEST:
         output:
            sequences = "data/{segment}/sequences.fasta",
        params:
-            sequences = lambda w: f"ingest/results/sequences_{w.segment}.fasta"
+            sequences = lambda w: f"ingest/fauna/results/sequences_{w.segment}.fasta"
        shell:
            """
            cp {params.sequences} {output.sequences}
@@ -26,7 +26,7 @@ if LOCAL_INGEST:
            metadata = "data/metadata.tsv",
        shell:
            """
-            cp ingest/results/metadata.tsv {output.metadata}
+            cp ingest/fauna/results/metadata.tsv {output.metadata}
            """
 
 else: