Skip to content

Commit

Permalink
Merge pull request #36 from nextstrain/james/ingest-fixes
Browse files Browse the repository at this point in the history
Ingest fixes
  • Loading branch information
joverlee521 authored May 21, 2024
2 parents 96b74e6 + daeac83 commit 14f0758
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 47 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ auspice/
build/
logs/
benchmarks/
ingest/fauna/
ingest/andersen-lab/

# Sensitive environment variables
environment*
Expand Down
4 changes: 2 additions & 2 deletions ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ nextstrain build \
.
```

This command produces one metadata file, `results/metadata.tsv`, and one sequences file per gene segment like `results/sequences_ha.fasta`.
This command produces one metadata file, `fauna/results/metadata.tsv`, and one sequences file per gene segment like `fauna/results/sequences_ha.fasta`.
Each file represents all available subtypes.

Add the `upload_all` target to the command above to run the complete ingest pipeline _and_ upload results to AWS S3.
Expand All @@ -54,7 +54,7 @@ It does not merge or deduplicate the data with the fauna data used in the defaul
nextstrain build . merge_andersen_segment_metadata
```

The results will be available in `results/andersen-lab/`.
The results will be available in `andersen-lab/results/`.

## Configuration

Expand Down
15 changes: 9 additions & 6 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
path_to_fauna = '../fauna'
config["s3_dst"] = "s3://nextstrain-data-private/files/workflows/avian-flu"
config["segments"] = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]

# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

wildcard_constraints:
segment = "|".join(config["segments"])

rule all:
# As of 2024-05-16 the default ingest only ingests data from fauna
input:
sequences=expand("results/metadata_{segment}.tsv", segment=config["segments"]),
metadata="results/metadata.tsv",
sequences=expand("fauna/results/sequences_{segment}.fasta", segment=config["segments"]),
metadata="fauna/results/metadata.tsv",

rule upload_all:
# As of 2024-05-16 the default upload only uploads data from fauna
input:
sequences=expand("s3/sequences_{segment}.done", segment=config["segments"]),
metadata="s3/metadata.done",
sequences=expand("fauna/s3/sequences_{segment}.done", segment=config["segments"]),
metadata="fauna/s3/metadata.done",

include: "rules/upload_from_fauna.smk"
include: "rules/ingest_andersen_lab.smk"
11 changes: 11 additions & 0 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
segments:
- pb2
- pb1
- pa
- ha
- np
- na
- mp
- ns

s3_dst: "s3://nextstrain-data-private/files/workflows/avian-flu"
48 changes: 26 additions & 22 deletions ingest/rules/ingest_andersen_lab.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from the Andersen Lab's avian-influenza repo

rule fetch_andersen_lab_repo:
output:
andersen_lab_repo = temp("data/andersen-lab-avian-influenza.tar.gz")
andersen_lab_repo = temp("andersen-lab/data/avian-influenza.tar.gz")
shell:
"""
curl -fsSL \
Expand All @@ -18,29 +18,33 @@ rule fetch_andersen_lab_repo:

rule extract_metadata:
input:
andersen_lab_repo = "data/andersen-lab-avian-influenza.tar.gz"
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
output:
metadata = "data/andersen-lab/PRJNA1102327_metadata.csv"
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv"
params:
output_dir = lambda wildcards, output: Path(output.metadata).parent
shell:
"""
tar xz --file={input.andersen_lab_repo} \
--strip-components=2 \
-C data/andersen-lab \
-C {params.output_dir} \
--wildcards \
"*/metadata/PRJNA1102327_metadata.csv"
"""

rule extract_consensus_sequences:
input:
andersen_lab_repo = "data/andersen-lab-avian-influenza.tar.gz"
andersen_lab_repo = "andersen-lab/data/avian-influenza.tar.gz"
output:
fasta = directory("data/andersen-lab/fasta"),
output_flag = touch("data/andersen-lab/extract_consensus_sequences.done")
fasta = directory("andersen-lab/data/fasta"),
output_flag = touch("andersen-lab/data/extract_consensus_sequences.done")
params:
output_dir = lambda wildcards, output: Path(output.fasta).parent
shell:
"""
tar xz --file={input.andersen_lab_repo} \
--strip-components=1 \
-C data/andersen-lab \
-C {params.output_dir} \
--wildcards \
"*/fasta"
"""
Expand All @@ -51,14 +55,14 @@ rule rename_and_concatenate_segment_fastas:
and concatenate FASTAs of the same segment
"""
input:
extract_consensus_sequences_flag = "data/andersen-lab/extract_consensus_sequences.done"
extract_consensus_sequences_flag = "andersen-lab/data/extract_consensus_sequences.done"
output:
fasta = "data/andersen-lab/{segment}.fasta"
fasta = "andersen-lab/data/{segment}.fasta"
params:
segment = lambda wildcards: wildcards.segment.upper()
shell:
"""
for fasta in data/andersen-lab/fasta/*_{params.segment}_cns.fa; do
for fasta in andersen-lab/data/fasta/*_{params.segment}_cns.fa; do
seqkit replace \
-p "Consensus_(SRR[0-9]+)_.*" \
-r '$1' \
Expand All @@ -69,12 +73,12 @@ rule rename_and_concatenate_segment_fastas:

rule curate_metadata:
input:
metadata = "data/andersen-lab/PRJNA1102327_metadata.csv",
metadata = "andersen-lab/data/PRJNA1102327_metadata.csv",
geolocation_rules = "defaults/geolocation_rules.tsv"
output:
metadata = "data/andersen-lab/metadata.tsv"
metadata = "andersen-lab/data/metadata.tsv"
log:
"logs/curate_metadata.txt",
"andersen-lab/logs/curate_metadata.txt",
shell:
"""
augur curate normalize-strings \
Expand All @@ -92,13 +96,13 @@ rule match_metadata_and_segment_fasta:
and outputs the matching metadata TSV and sequence FASTAs per segment.
"""
input:
metadata = "data/andersen-lab/metadata.tsv",
fasta = "data/andersen-lab/{segment}.fasta"
metadata = "andersen-lab/data/metadata.tsv",
fasta = "andersen-lab/data/{segment}.fasta"
output:
metadata = "results/andersen-lab/metadata_{segment}.tsv",
fasta = "results/andersen-lab/sequences_{segment}.fasta"
metadata = "andersen-lab/results/metadata_{segment}.tsv",
fasta = "andersen-lab/results/sequences_{segment}.fasta"
log:
"logs/match_segment_metadata_and_fasta/{segment}.txt",
"andersen-lab/logs/match_segment_metadata_and_fasta/{segment}.txt",
shell:
"""
augur curate passthru \
Expand All @@ -121,10 +125,10 @@ rule merge_andersen_segment_metadata:
have sequence data (no QC performed).
"""
input:
segments = expand("results/andersen-lab/metadata_{segment}.tsv", segment=config["segments"]),
metadata = "results/andersen-lab/metadata_ha.tsv",
segments = expand("andersen-lab/results/metadata_{segment}.tsv", segment=config["segments"]),
metadata = "andersen-lab/results/metadata_ha.tsv",
output:
metadata = "results/andersen-lab/metadata.tsv",
metadata = "andersen-lab/results/metadata.tsv",
shell:
"""
python scripts/add_segment_counts.py \
Expand Down
33 changes: 19 additions & 14 deletions ingest/rules/upload_from_fauna.smk
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
from pathlib import Path


rule download_segment:
output:
sequences = "data/{segment}.fasta",
sequences = "fauna/data/{segment}.fasta",
params:
fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
output_dir = lambda wildcards, output: Path(output.sequences).parent,
output_fstem = lambda wildcards, output: Path(output.sequences).stem,
benchmark:
"benchmarks/download_segment_{segment}.txt"
"fauna/benchmarks/download_segment_{segment}.txt"
shell:
"""
python3 {path_to_fauna}/vdb/download.py \
--database vdb \
--virus avian_flu \
--fasta_fields {params.fasta_fields} \
--select locus:{wildcards.segment} \
--path data \
--fstem {wildcards.segment}
--path {params.output_dir} \
--fstem {params.output_fstem}
"""

rule parse_segment:
input:
sequences = "data/{segment}.fasta",
sequences = "fauna/data/{segment}.fasta",
output:
sequences = "results/sequences_{segment}.fasta",
metadata = "results/metadata_{segment}.tsv",
sequences = "fauna/results/sequences_{segment}.fasta",
metadata = "fauna/results/metadata_{segment}.tsv",
params:
fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
prettify_fields = "region country division location host originating_lab submitting_lab authors PMID"
Expand All @@ -44,10 +49,10 @@ rule merge_segment_metadata:
for each segment, but that would be a nice improvement.
"""
input:
segments = expand("results/metadata_{segment}.tsv", segment=config["segments"]),
metadata = "results/metadata_ha.tsv",
segments = expand("fauna/results/metadata_{segment}.tsv", segment=config["segments"]),
metadata = "fauna/results/metadata_ha.tsv",
output:
metadata = "results/metadata.tsv",
metadata = "fauna/results/metadata.tsv",
shell:
"""
python scripts/add_segment_counts.py \
Expand All @@ -58,9 +63,9 @@ rule merge_segment_metadata:

rule upload_sequences:
input:
sequences="results/sequences_{segment}.fasta",
sequences="fauna/results/sequences_{segment}.fasta",
output:
flag=touch("s3/sequences_{segment}.done"),
flag=touch("fauna/s3/sequences_{segment}.done"),
params:
s3_dst=config["s3_dst"],
shell:
Expand All @@ -73,9 +78,9 @@ rule upload_sequences:

rule upload_metadata:
input:
metadata="results/metadata.tsv",
metadata="fauna/results/metadata.tsv",
output:
flag=touch("s3/metadata.done"),
flag=touch("fauna/s3/metadata.done"),
params:
s3_dst=config["s3_dst"],
shell:
Expand Down
2 changes: 1 addition & 1 deletion ingest/scripts/add_segment_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def summary(strain_count):
row[column]=strain_count[row['strain']]

with open(args.output, 'w') as fh:
writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t', lineterminator='\n')
writer.writeheader()
for row in rows:
writer.writerow(row)
Expand Down
4 changes: 2 additions & 2 deletions rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ if LOCAL_INGEST:
output:
sequences = "data/{segment}/sequences.fasta",
params:
sequences = lambda w: f"ingest/results/sequences_{w.segment}.fasta"
sequences = lambda w: f"ingest/fauna/results/sequences_{w.segment}.fasta"
shell:
"""
cp {params.sequences} {output.sequences}
Expand All @@ -26,7 +26,7 @@ if LOCAL_INGEST:
metadata = "data/metadata.tsv",
shell:
"""
cp ingest/results/metadata.tsv {output.metadata}
cp ingest/fauna/results/metadata.tsv {output.metadata}
"""

else:
Expand Down

0 comments on commit 14f0758

Please sign in to comment.