From 6b9f1cf0354c5e93c35dbbe817089ecb8e728922 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 12:12:18 +1200 Subject: [PATCH 1/6] Fix typo --- ingest/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/Snakefile b/ingest/Snakefile index 4adcbea..c657a6a 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -7,7 +7,7 @@ wildcard_constraints: rule all: input: - sequences=expand("results/metadata_{segment}.tsv", segment=config["segments"]), + sequences=expand("results/sequences_{segment}.fasta", segment=config["segments"]), metadata="results/metadata.tsv", rule upload_all: From 30b2640d676bfbdd0cb924e10a40c893b1aae69d Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 13:23:30 +1200 Subject: [PATCH 2/6] use LF not CRLF for metadata The default line endings for `csv.DictWriter` are CRLF (amazingly) --- ingest/scripts/add_segment_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/scripts/add_segment_counts.py b/ingest/scripts/add_segment_counts.py index b302d5e..c94d2d8 100644 --- a/ingest/scripts/add_segment_counts.py +++ b/ingest/scripts/add_segment_counts.py @@ -63,7 +63,7 @@ def summary(strain_count): row[column]=strain_count[row['strain']] with open(args.output, 'w') as fh: - writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t') + writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t', lineterminator='\n') writer.writeheader() for row in rows: writer.writerow(row) From 05dd9ff807797eeaeb6ad47c5cf4fc0fac5f9591 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 12:50:47 +1200 Subject: [PATCH 3/6] namespace fauna ingest files in preparation for the subsequent commit which will add another ingest source --- ingest/README.md | 2 +- ingest/Snakefile | 10 ++++++---- ingest/rules/upload_from_fauna.smk | 25 +++++++++++++------------ rules/common.smk | 4 ++-- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git 
a/ingest/README.md b/ingest/README.md index d2507b0..eb8c25c 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -28,7 +28,7 @@ nextstrain build \ . ``` -This command produces one metadata file, `results/metadata.tsv`, and one sequences file per gene segment like `results/sequences_ha.fasta`. +This command produces one metadata file, `results/fauna/metadata.tsv`, and one sequences file per gene segment like `results/fauna/sequences_ha.fasta`. Each file represents all available subtypes. Add the `upload_all` target to the command above to run the complete ingest pipeline _and_ upload results to AWS S3. diff --git a/ingest/Snakefile b/ingest/Snakefile index c657a6a..210a934 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -6,14 +6,16 @@ wildcard_constraints: segment = "|".join(config["segments"]) rule all: + # As of 2024-05-16 the default ingest only ingests data from fauna input: - sequences=expand("results/sequences_{segment}.fasta", segment=config["segments"]), - metadata="results/metadata.tsv", + sequences=expand("results/fauna/sequences_{segment}.fasta", segment=config["segments"]), + metadata="results/fauna/metadata.tsv", rule upload_all: + # As of 2024-05-16 the default upload only uploads data from fauna input: - sequences=expand("s3/sequences_{segment}.done", segment=config["segments"]), - metadata="s3/metadata.done", + sequences=expand("s3/fauna/sequences_{segment}.done", segment=config["segments"]), + metadata="s3/fauna/metadata.done", include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index 86be0c1..eb9ef1e 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -1,8 +1,9 @@ rule download_segment: output: - sequences = "data/{segment}.fasta", + sequences = "data/fauna/{segment}.fasta", params: fasta_fields = "strain virus accession collection_date region country division location host 
domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", + output_dir = "data/fauna", benchmark: "benchmarks/download_segment_{segment}.txt" shell: @@ -12,16 +13,16 @@ rule download_segment: --virus avian_flu \ --fasta_fields {params.fasta_fields} \ --select locus:{wildcards.segment} \ - --path data \ + --path {params.output_dir} \ --fstem {wildcards.segment} """ rule parse_segment: input: - sequences = "data/{segment}.fasta", + sequences = "data/fauna/{segment}.fasta", output: - sequences = "results/sequences_{segment}.fasta", - metadata = "results/metadata_{segment}.tsv", + sequences = "results/fauna/sequences_{segment}.fasta", + metadata = "results/fauna/metadata_{segment}.tsv", params: fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" @@ -44,10 +45,10 @@ rule merge_segment_metadata: for each segment, but that would be a nice improvement. 
""" input: - segments = expand("results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "results/metadata_ha.tsv", + segments = expand("results/fauna/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "results/fauna/metadata_ha.tsv", output: - metadata = "results/metadata.tsv", + metadata = "results/fauna/metadata.tsv", shell: """ python scripts/add_segment_counts.py \ @@ -58,9 +59,9 @@ rule merge_segment_metadata: rule upload_sequences: input: - sequences="results/sequences_{segment}.fasta", + sequences="results/fauna/sequences_{segment}.fasta", output: - flag=touch("s3/sequences_{segment}.done"), + flag=touch("s3/fauna/sequences_{segment}.done"), params: s3_dst=config["s3_dst"], shell: @@ -73,9 +74,9 @@ rule upload_sequences: rule upload_metadata: input: - metadata="results/metadata.tsv", + metadata="results/fauna/metadata.tsv", output: - flag=touch("s3/metadata.done"), + flag=touch("s3/fauna/metadata.done"), params: s3_dst=config["s3_dst"], shell: diff --git a/rules/common.smk b/rules/common.smk index 2603f9b..dd95a6f 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -15,7 +15,7 @@ if LOCAL_INGEST: output: sequences = "data/{segment}/sequences.fasta", params: - sequences = lambda w: f"ingest/results/sequences_{w.segment}.fasta" + sequences = lambda w: f"ingest/results/fauna/sequences_{w.segment}.fasta" shell: """ cp {params.sequences} {output.sequences} @@ -26,7 +26,7 @@ if LOCAL_INGEST: metadata = "data/metadata.tsv", shell: """ - cp ingest/results/metadata.tsv {output.metadata} + cp ingest/results/fauna/metadata.tsv {output.metadata} """ else: From 5e903033875bfcca53a61f6a33436ef5d048ff67 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 20 May 2024 13:28:02 -0700 Subject: [PATCH 4/6] ingest/upload_from_fauna: parse output.sequences Parse the `output.sequences` path for the `output_dir` and the `output_fstem` that are passed to the fauna script to ensure we don't run into out of sync issues if we ever change the 
output. --- ingest/rules/upload_from_fauna.smk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index eb9ef1e..a181821 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -1,9 +1,13 @@ +from pathlib import Path + + rule download_segment: output: sequences = "data/fauna/{segment}.fasta", params: fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", - output_dir = "data/fauna", + output_dir = lambda wildcards, output: Path(output.sequences).parent, + output_fstem = lambda wildcards, output: Path(output.sequences).stem, benchmark: "benchmarks/download_segment_{segment}.txt" shell: @@ -14,7 +18,7 @@ rule download_segment: --fasta_fields {params.fasta_fields} \ --select locus:{wildcards.segment} \ --path {params.output_dir} \ - --fstem {wildcards.segment} + --fstem {params.output_fstem} """ rule parse_segment: From 42c5a5e15b4681aa8e4f01bfe3c0811267483900 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 20 May 2024 13:50:41 -0700 Subject: [PATCH 5/6] ingest: move data source namespace up one level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we will need to namespace every data source within ingest, it makes more sense for the data source namespace to be up one level. The ingest build directory structure will look like: ``` . 
└── ingest/ ├── fauna/ │ ├── data │ ├── results │ └── s3 └── andersen-lab/ ├── data ├── results └── s3 ``` Based on discussion in --- .gitignore | 2 ++ ingest/README.md | 4 +-- ingest/Snakefile | 8 ++--- ingest/rules/ingest_andersen_lab.smk | 48 +++++++++++++++------------- ingest/rules/upload_from_fauna.smk | 24 +++++++------- rules/common.smk | 4 +-- 6 files changed, 48 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index c8bdb77..68263f1 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ auspice/ build/ logs/ benchmarks/ +ingest/fauna/ +ingest/andersen-lab/ # Sensitive environment variables environment* diff --git a/ingest/README.md b/ingest/README.md index eb8c25c..a9662b9 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -28,7 +28,7 @@ nextstrain build \ . ``` -This command produces one metadata file, `results/fauna/metadata.tsv`, and one sequences file per gene segment like `results/fauna/sequences_ha.fasta`. +This command produces one metadata file, `fauna/results/metadata.tsv`, and one sequences file per gene segment like `fauna/results/sequences_ha.fasta`. Each file represents all available subtypes. Add the `upload_all` target to the command above to run the complete ingest pipeline _and_ upload results to AWS S3. @@ -54,7 +54,7 @@ It does not merge or deduplicate the data with the fauna data used in the defaul nextstrain build . merge_andersen_segment_metadata ``` -The results will be available in `results/andersen-lab/`. +The results will be available in `andersen-lab/results/`. 
## Configuration diff --git a/ingest/Snakefile b/ingest/Snakefile index 210a934..b4cfee8 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -8,14 +8,14 @@ wildcard_constraints: rule all: # As of 2024-05-16 the default ingest only ingests data from fauna input: - sequences=expand("results/fauna/sequences_{segment}.fasta", segment=config["segments"]), - metadata="results/fauna/metadata.tsv", + sequences=expand("fauna/results/sequences_{segment}.fasta", segment=config["segments"]), + metadata="fauna/results/metadata.tsv", rule upload_all: # As of 2024-05-16 the default upload only uploads data from fauna input: - sequences=expand("s3/fauna/sequences_{segment}.done", segment=config["segments"]), - metadata="s3/fauna/metadata.done", + sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]), + metadata="fauna/s3/metadata.done", include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" diff --git a/ingest/rules/ingest_andersen_lab.smk b/ingest/rules/ingest_andersen_lab.smk index eca87b6..023d547 100644 --- a/ingest/rules/ingest_andersen_lab.smk +++ b/ingest/rules/ingest_andersen_lab.smk @@ -6,7 +6,7 @@ from the Andersen Lab's avian-influenza repo rule fetch_andersen_lab_repo: output: - andersen_lab_repo = temp("data/andersen-lab-avian-influenza.tar.gz") + andersen_lab_repo = temp("andersen-lab/data/avian-influenza.tar.gz") shell: """ curl -fsSL \ @@ -18,29 +18,33 @@ rule fetch_andersen_lab_repo: rule extract_metadata: input: - andersen_lab_repo = "data/andersen-lab-avian-influenza.tar.gz" + andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz" output: - metadata = "data/andersen-lab/PRJNA1102327_metadata.csv" + metadata = "andersen-lab/data/PRJNA1102327_metadata.csv" + params: + output_dir = lambda wildcards, output: Path(output.metadata).parent shell: """ tar xz --file={input.andersen_lab_repo} \ --strip-components=2 \ - -C data/andersen-lab \ + -C {params.output_dir} \ --wildcards \ 
"*/metadata/PRJNA1102327_metadata.csv" """ rule extract_consensus_sequences: input: - andersen_lab_repo = "data/andersen-lab-avian-influenza.tar.gz" + andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz" output: - fasta = directory("data/andersen-lab/fasta"), - output_flag = touch("data/andersen-lab/extract_consensus_sequences.done") + fasta = directory("andersen-lab/data/fasta"), + output_flag = touch("andersen-lab/data/extract_consensus_sequences.done") + params: + output_dir = lambda wildcards, output: Path(output.fasta).parent shell: """ tar xz --file={input.andersen_lab_repo} \ --strip-components=1 \ - -C data/andersen-lab \ + -C {params.output_dir} \ --wildcards \ "*/fasta" """ @@ -51,14 +55,14 @@ rule rename_and_concatenate_segment_fastas: and concatenate FASTAs of the same segment """ input: - extract_consensus_sequences_flag = "data/andersen-lab/extract_consensus_sequences.done" + extract_consensus_sequences_flag = "andersen-lab/data/extract_consensus_sequences.done" output: - fasta = "data/andersen-lab/{segment}.fasta" + fasta = "andersen-lab/data/{segment}.fasta" params: segment = lambda wildcards: wildcards.segment.upper() shell: """ - for fasta in data/andersen-lab/fasta/*_{params.segment}_cns.fa; do + for fasta in andersen-lab/data/fasta/*_{params.segment}_cns.fa; do seqkit replace \ -p "Consensus_(SRR[0-9]+)_.*" \ -r '$1' \ @@ -69,12 +73,12 @@ rule rename_and_concatenate_segment_fastas: rule curate_metadata: input: - metadata = "data/andersen-lab/PRJNA1102327_metadata.csv", + metadata = "andersen-lab/data/PRJNA1102327_metadata.csv", geolocation_rules = "defaults/geolocation_rules.tsv" output: - metadata = "data/andersen-lab/metadata.tsv" + metadata = "andersen-lab/data/metadata.tsv" log: - "logs/curate_metadata.txt", + "andersen-lab/logs/curate_metadata.txt", shell: """ augur curate normalize-strings \ @@ -92,13 +96,13 @@ rule match_metadata_and_segment_fasta: and outputs the matching metadata TSV and sequence FASTAs per segment. 
""" input: - metadata = "data/andersen-lab/metadata.tsv", - fasta = "data/andersen-lab/{segment}.fasta" + metadata = "andersen-lab/data/metadata.tsv", + fasta = "andersen-lab/data/{segment}.fasta" output: - metadata = "results/andersen-lab/metadata_{segment}.tsv", - fasta = "results/andersen-lab/sequences_{segment}.fasta" + metadata = "andersen-lab/results/metadata_{segment}.tsv", + fasta = "andersen-lab/results/sequences_{segment}.fasta" log: - "logs/match_segment_metadata_and_fasta/{segment}.txt", + "andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt", shell: """ augur curate passthru \ @@ -121,10 +125,10 @@ rule merge_andersen_segment_metadata: have sequence data (no QC performed). """ input: - segments = expand("results/andersen-lab/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "results/andersen-lab/metadata_ha.tsv", + segments = expand("andersen-lab/results/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "andersen-lab/results/metadata_ha.tsv", output: - metadata = "results/andersen-lab/metadata.tsv", + metadata = "andersen-lab/results/metadata.tsv", shell: """ python scripts/add_segment_counts.py \ diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index a181821..d072738 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -3,13 +3,13 @@ from pathlib import Path rule download_segment: output: - sequences = "data/fauna/{segment}.fasta", + sequences = "fauna/data/{segment}.fasta", params: fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", output_dir = lambda wildcards, output: Path(output.sequences).parent, output_fstem = lambda wildcards, output: Path(output.sequences).stem, benchmark: - "benchmarks/download_segment_{segment}.txt" + "fauna/benchmarks/download_segment_{segment}.txt" shell: """ python3 
{path_to_fauna}/vdb/download.py \ @@ -23,10 +23,10 @@ rule download_segment: rule parse_segment: input: - sequences = "data/fauna/{segment}.fasta", + sequences = "fauna/data/{segment}.fasta", output: - sequences = "results/fauna/sequences_{segment}.fasta", - metadata = "results/fauna/metadata_{segment}.tsv", + sequences = "fauna/results/sequences_{segment}.fasta", + metadata = "fauna/results/metadata_{segment}.tsv", params: fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" @@ -49,10 +49,10 @@ rule merge_segment_metadata: for each segment, but that would be a nice improvement. """ input: - segments = expand("results/fauna/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "results/fauna/metadata_ha.tsv", + segments = expand("fauna/results/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "fauna/results/metadata_ha.tsv", output: - metadata = "results/fauna/metadata.tsv", + metadata = "fauna/results/metadata.tsv", shell: """ python scripts/add_segment_counts.py \ @@ -63,9 +63,9 @@ rule merge_segment_metadata: rule upload_sequences: input: - sequences="results/fauna/sequences_{segment}.fasta", + sequences="fauna/results/sequences_{segment}.fasta", output: - flag=touch("s3/fauna/sequences_{segment}.done"), + flag=touch("fauna/s3/sequences_{segment}.done"), params: s3_dst=config["s3_dst"], shell: @@ -78,9 +78,9 @@ rule upload_sequences: rule upload_metadata: input: - metadata="results/fauna/metadata.tsv", + metadata="fauna/results/metadata.tsv", output: - flag=touch("s3/fauna/metadata.done"), + flag=touch("fauna/s3/metadata.done"), params: s3_dst=config["s3_dst"], shell: diff --git a/rules/common.smk b/rules/common.smk index dd95a6f..944ecf9 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -15,7 +15,7 @@ if 
LOCAL_INGEST: output: sequences = "data/{segment}/sequences.fasta", params: - sequences = lambda w: f"ingest/results/fauna/sequences_{w.segment}.fasta" + sequences = lambda w: f"ingest/fauna/results/sequences_{w.segment}.fasta" shell: """ cp {params.sequences} {output.sequences} @@ -26,7 +26,7 @@ if LOCAL_INGEST: metadata = "data/metadata.tsv", shell: """ - cp ingest/results/fauna/metadata.tsv {output.metadata} + cp ingest/fauna/results/metadata.tsv {output.metadata} """ else: From daeac83235cf3fa4dbcfb2a015457c2f64d61356 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 20 May 2024 15:42:25 -0700 Subject: [PATCH 6/6] ingest: move config values to defaults/config.yaml Make it easier to override the default configs for testing by providing the configs through a default config file. --- ingest/Snakefile | 5 +++-- ingest/defaults/config.yaml | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 ingest/defaults/config.yaml diff --git a/ingest/Snakefile b/ingest/Snakefile index b4cfee8..af7412a 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -1,6 +1,7 @@ path_to_fauna = '../fauna' -config["s3_dst"] = "s3://nextstrain-data-private/files/workflows/avian-flu" -config["segments"] = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"] + +# Use default configuration values. Override with Snakemake's --configfile/--config options. +configfile: "defaults/config.yaml" wildcard_constraints: segment = "|".join(config["segments"]) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml new file mode 100644 index 0000000..c46739f --- /dev/null +++ b/ingest/defaults/config.yaml @@ -0,0 +1,11 @@ +segments: + - pb2 + - pb1 + - pa + - ha + - np + - na + - mp + - ns + +s3_dst: "s3://nextstrain-data-private/files/workflows/avian-flu"