From 472fc5c737736b9e3708b6a439188730dc61ab30 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 8 Sep 2022 10:45:38 -0400
Subject: [PATCH 01/20] add script to download reference files

---
 get_refs.sh | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100755 get_refs.sh

diff --git a/get_refs.sh b/get_refs.sh
new file mode 100755
index 00000000..d3d2d44d
--- /dev/null
+++ b/get_refs.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+set -euo pipefail
+
+dest_dir=${1:-"scpca-references"}
+
+aws_root="https://scpca-references.s3.amazonaws.com"
+ref_dir="homo_sapiens/ensembl-104"
+ref_paths=(
+  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
+  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+)
+
+salmon_index_files=(
+  "complete_ref_lens.bin"
+  "ctable.bin"
+  "ctg_offsets.bin"
+  "duplicate_clusters.tsv"
+  "info.json"
+  "mphf.bin"
+  "pos.bin"
+  "pre_indexing.log"
+  "rank.bin"
+  "ref_indexing.log"
+  "refAccumLengths.bin"
+  "reflengths.bin"
+  "refseq.bin"
+  "seq.bin"
+  "versionInfo.json"
+)
+
+salmon_index_dirs=(
+  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
+  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+)
+for dir in ${salmon_index_dirs[@]}
+do
+  for file in ${salmon_index_files[@]}
+  do
+    ref_paths+=("${dir}/${file}")
+  done
+done
+
+
+star_index_files=(
+  "chrLength.txt"
+  "chrName.txt"
+  "chrNameLength.txt"
+  "chrStart.txt"
+  "exonGeTrInfo.tab"
+  "exonInfo.tab"
+  "geneInfo.tab"
+  "Genome"
+  "genomeParameters.txt"
+  "Log.out"
+  "SA"
+  "SAindex"
+  "sjdbInfo.txt"
+  "sjdbList.fromGTF.out.tab"
+  "sjdbList.out.tab"
+  "transcriptInfo.tab"
+)
+
+star_dir="${ref_dir}/star_index/Homo_sapiens.GRCh38.104.star_idx"
+for file in ${star_index_files[@]}
+do
+  ref_paths+=("${star_dir}/${file}")
+done
+
+
+barcode_files=(
+  "3M-february-2018.txt"
+  "737K-august-2016.txt"
+  "cellranger_mit_license.txt"
+  "visium-v1.txt"
+  "visium-v1.txt"
+)
+
+barcode_dir="barcodes/10X"
+for file in ${barcode_files[@]}
+do
+  ref_paths+=("${barcode_dir}/${file}")
+done
+
+for path in ${ref_paths[@]}
+do
+  echo "Getting $path"
+  curl -s --create-dirs "$aws_root/$path" -o "$dest_dir/$path"
+done

From 5ed43b4bef6ae7185d4119629eb111b0584183e3 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 8 Sep 2022 10:46:28 -0400
Subject: [PATCH 02/20] update reference files to use params

should make local files a bit easier
---
 config/reference_paths.config | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/config/reference_paths.config b/config/reference_paths.config
index 8b9e9f0c..76d97fec 100644
--- a/config/reference_paths.config
+++ b/config/reference_paths.config
@@ -1,20 +1,21 @@
-// reference params and files 
+// reference params and files
 assembly          = 'Homo_sapiens.GRCh38.104'
-ref_dir           = 's3://scpca-references/homo_sapiens/ensembl-104'
-ref_fasta         = "$ref_dir/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
-ref_fasta_index   = "$ref_dir/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
-ref_gtf           = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
+ref_rootdir       = 's3://scpca-references'
+ref_dir           = "${params.ref_rootdir}/homo_sapiens/ensembl-104"
+ref_fasta         = "${params.ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
+ref_fasta_index   = "${params.ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
+ref_gtf           = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
 
-// index files 
-splici_index      = "$ref_dir/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
-bulk_index        = "$ref_dir/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
-cellranger_index  = "$ref_dir/cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
-star_index        = "$ref_dir/star_index/Homo_sapiens.GRCh38.104.star_idx"
+// index files
+splici_index      = "${params.ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
+bulk_index        = "${params.ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+cellranger_index  = "${params.ref_dir}/cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
+star_index        = "${params.ref_dir}/star_index/Homo_sapiens.GRCh38.104.star_idx"
 
-// annotation files 
-mito_file         = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
-t2g_3col_path     = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
-t2g_bulk_path     = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+// annotation files
+mito_file         = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
+t2g_3col_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
+t2g_bulk_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
 
 // barcode files
-barcode_dir       = 's3://scpca-references/barcodes/10X'
+barcode_dir       = '${params.ref_rootdir}/barcodes/10X'

From f26a50849dd9d0fa5ca0dfd36c5001f6aeb8287b Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 8 Sep 2022 12:20:41 -0400
Subject: [PATCH 03/20] Fix barcode file

---
 config/reference_paths.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/reference_paths.config b/config/reference_paths.config
index 76d97fec..7d54fb8d 100644
--- a/config/reference_paths.config
+++ b/config/reference_paths.config
@@ -18,4 +18,4 @@ t2g_3col_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced
 t2g_bulk_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
 
 // barcode files
-barcode_dir       = '${params.ref_rootdir}/barcodes/10X'
+barcode_dir       = "${params.ref_rootdir}/barcodes/10X"

From 9b8c91225a3c95bbbb3b46375a20f76e704ea8f6 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Fri, 30 Sep 2022 12:57:03 -0400
Subject: [PATCH 04/20] add containerfile

---
 get_refs.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/get_refs.sh b/get_refs.sh
index d3d2d44d..41210840 100755
--- a/get_refs.sh
+++ b/get_refs.sh
@@ -3,6 +3,8 @@ set -euo pipefail
 
 dest_dir=${1:-"scpca-references"}
 
+
+containerfile_url="https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config"
 aws_root="https://scpca-references.s3.amazonaws.com"
 ref_dir="homo_sapiens/ensembl-104"
 ref_paths=(
@@ -91,3 +93,9 @@ do
   echo "Getting $path"
   curl -s --create-dirs "$aws_root/$path" -o "$dest_dir/$path"
 done
+
+
+containers=`curl -s https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config \
+  | grep CONTAINER \
+  | cut -d"'" -f 2 \
+  | grep -v "^$" `
From 850772d2bfa195ce2a98a158504379690fc39263 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 11:25:07 -0400
Subject: [PATCH 05/20] Add python reference file

---
 get_refs.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100755 get_refs.py

diff --git a/get_refs.py b/get_refs.py
new file mode 100755
index 00000000..310ef0f3
--- /dev/null
+++ b/get_refs.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+import argparse
+import pathlib
+from signal import SIG_DFL
+import sys
+import urllib.request
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--refdir", type=str,
+                    default="scpca-references",
+                    help = "destination directory for downloaded reference files")
+parser.add_argument("--replace", action = argparse.BooleanOptionalAction,
+                    default = False,
+                    help = "replace previously downloaded files")
+parser.add_argument("--revision", type=str,
+                    default="main",
+                    metavar = "vX.X.X",
+                    help = "tag for a specific workflow version (defaults to latest revision)")
+
+args = parser.parse_args()
+
+aws_root = "https://scpca-references.s3.amazonaws.com"
+containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
+
+# genome reference files
+genome_dir = pathlib.Path("homo_sapiens/ensembl-104")
+ref_subdirs =[
+    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
+    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai",
+    "annotation/Homo_sapiens.GRCh38.104.gtf.gz",
+    "annotation/Homo_sapiens.GRCh38.104.mitogenes.txt",
+    "annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv",
+    "annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+]
+
+ref_paths = [genome_dir / sd for sd in ref_subdirs]
+
+# salmon index files
+salmon_index_files = [
+    "complete_ref_lens.bin",
+    "ctable.bin",
+    "ctg_offsets.bin",
+    "duplicate_clusters.tsv",
+    "info.json",
+    "mphf.bin",
+    "pos.bin",
+    "pre_indexing.log",
+    "rank.bin",
+    "ref_indexing.log",
+    "refAccumLengths.bin",
+    "reflengths.bin",
+    "refseq.bin",
+    "seq.bin",
+    "versionInfo.json"
+]
+
+salmon_index_dirs = [
+    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome",
+    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+]
+for sa_dir in salmon_index_dirs:
+    ref_paths += [sa_dir / f for f in salmon_index_files]
+
+# star index files
+star_index_files = [
+    "chrLength.txt",
+    "chrName.txt",
+    "chrNameLength.txt",
+    "chrStart.txt",
+    "exonGeTrInfo.tab",
+    "exonInfo.tab",
+    "geneInfo.tab",
+    "Genome",
+    "genomeParameters.txt",
+    "Log.out",
+    "SA",
+    "SAindex",
+    "sjdbInfo.txt",
+    "sjdbList.fromGTF.out.tab",
+    "sjdbList.out.tab",
+    "transcriptInfo.tab"
+]
+
+star_dir = genome_dir/ "star_index/Homo_sapiens.GRCh38.104.star_idx"
+ref_paths += [star_dir / f for f in star_index_files]
+
+# get barcode file paths
+barcode_dir = pathlib.Path("barcodes/10X")
+barcode_files = [
+    "3M-february-2018.txt",
+    "737K-august-2016.txt",
+    "cellranger_mit_license.txt",
+    "visium-v1.txt",
+    "visium-v2.txt"
+]
+
+ref_paths += [barcode_dir / f for f in barcode_files]
+
+# download all the files and put them in the correct locations
+print("Downloading reference files:")
+for path in ref_paths[0:2]:
+    outfile = args.refdir / path
+    if outfile.exists() and not args.replace:
+        continue
+    print(f"Getting {path}")
+    # make parents
+    outfile.parent.mkdir(exist_ok=True, parents = True)
+    # download and write
+    file_url = f"{aws_root}/{path}"
+    try:
+        urllib.request.urlretrieve(file_url, outfile)
+    except urllib.error.URLError:
+        print(f"The file download failed for {file_url}, please check the URL for errors",
+              file = sys.stderr)
+        exit(1)
+
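The download loop introduced above skips files that already exist unless `--replace` is passed, creates parent directories before writing, and stops on the first failed URL. A minimal standalone sketch of that same skip/mkdir/retrieve pattern is shown below; the function name and example paths are illustrative only and are not part of the workflow:

    # Sketch only: mirrors the skip/replace/download pattern used in get_refs.py above.
    import pathlib
    import sys
    import urllib.error
    import urllib.request

    def fetch(url: str, outfile: pathlib.Path, replace: bool = False) -> None:
        """Download url to outfile, skipping files that already exist unless replace is True."""
        if outfile.exists() and not replace:
            return
        # create any missing parent directories before writing the file
        outfile.parent.mkdir(exist_ok=True, parents=True)
        try:
            urllib.request.urlretrieve(url, outfile)
        except urllib.error.URLError as e:
            print(f"Download failed for {url}: {e.reason}", file=sys.stderr)
            raise

    # Example call (hypothetical destination directory):
    # fetch("https://scpca-references.s3.amazonaws.com/barcodes/10X/3M-february-2018.txt",
    #       pathlib.Path("scpca-references/barcodes/10X/3M-february-2018.txt"))
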
From 48ed950f8b95424cc088eab12375a62f10a2a64c Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 12:09:35 -0400
Subject: [PATCH 06/20] write params file

---
 get_refs.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 310ef0f3..6051f509 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
+import os
 import pathlib
-from signal import SIG_DFL
+import shutil
 import sys
 import urllib.request
 
@@ -10,9 +11,12 @@
 parser.add_argument("--refdir", type=str,
                     default="scpca-references",
                     help = "destination directory for downloaded reference files")
-parser.add_argument("--replace", action = argparse.BooleanOptionalAction,
-                    default = False,
+parser.add_argument("--replace",
+                    action = "store_true",
                     help = "replace previously downloaded files")
+parser.add_argument("--paramfile", type=str,
+                    default="",
+                    help = "nextflow param file to write")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
@@ -24,6 +28,7 @@
 containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
 
 # genome reference files
+assembly = "Homo_sapiens.GRCh38.104"
 genome_dir = pathlib.Path("homo_sapiens/ensembl-104")
 ref_subdirs =[
     "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
@@ -98,7 +103,7 @@
 ref_paths += [barcode_dir / f for f in barcode_files]
 
 # download all the files and put them in the correct locations
-print("Downloading reference files:")
+print("Downloading reference files...")
 for path in ref_paths[0:2]:
     outfile = args.refdir / path
     if outfile.exists() and not args.replace:
@@ -114,4 +119,24 @@
         print(f"The file download failed for {file_url}, please check the URL for errors",
               file = sys.stderr)
         exit(1)
 
+print("Done with reference file downloads\n"
+      f"Reference files can be found at '{args.refdir}'\n")
+
+
+
+if args.paramfile:
+    pfile = pathlib.Path(args.paramfile)
+    # check if paramfile exists & move old if needed
+    if pfile.exists():
+        print(f"A file already exists at `{pfile}`, renaming previous file to `{pfile.name}.bak`")
+        shutil.move(pfile, str(pfile) + ".bak")
+    # create parameter dictionary
+    nf_params = {
+        'assembly': assembly,
+        'ref_rootdir': os.path.abspath(args.refdir)
+    }
+    with open(pfile, 'w') as f:
+        f.write("# local nextflow reference file parameters, generated by `get_refs.py`\n\n")
+        for key, value in nf_params.items():
+            f.write(f"{key}: {value}\n")
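The param file written by this commit is a set of plain `key: value` lines (YAML-compatible), which is the format a Nextflow run would typically consume via a `-params-file` option. A small sketch of reading such a file back into a dict, without any YAML dependency, is below; the file name is just an example and not a fixed output of the script at this point in the series:

    # Sketch only: parses the simple "key: value" lines emitted by the paramfile writer above.
    def read_params(path):
        params = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue  # skip comments and blank lines
                key, _, value = line.partition(":")
                params[key.strip()] = value.strip()
        return params

    # Example (hypothetical file name):
    # read_params("localref_params.yaml")
    # -> {'assembly': 'Homo_sapiens.GRCh38.104', 'ref_rootdir': '/abs/path/to/scpca-references'}
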
From fcb267376b2ec1421a60c42e3440e3a493ef2118 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 13:56:43 -0400
Subject: [PATCH 07/20] add docker and singularity pulls

---
 get_refs.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 6051f509..2bd84d6d 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -3,7 +3,9 @@
 import argparse
 import os
 import pathlib
+import re
 import shutil
+import subprocess
 import sys
 import urllib.request
 
@@ -18,9 +20,19 @@
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
                     help = "tag for a specific workflow version (defaults to latest revision)")
-
+parser.add_argument("--docker",
+                    action = "store_true",
+                    help = "pull and cache images for docker")
+parser.add_argument("--singularity",
+                    action = "store_true",
+                    help = "pull and cache images for singularity")
+parser.add_argument("--singularity_cache", type=str,
+                    metavar = "CACHE_DIR",
+                    help = "cache directory for singularity"
+)
 args = parser.parse_args()
 
+# scpca-nf resource urls
 aws_root = "https://scpca-references.s3.amazonaws.com"
 containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
 
@@ -115,15 +127,14 @@
     file_url = f"{aws_root}/{path}"
     try:
         urllib.request.urlretrieve(file_url, outfile)
-    except urllib.error.URLError:
+    except urllib.error.URLError as e:
+        print(e.reason)
         print(f"The file download failed for {file_url}, please check the URL for errors",
               file = sys.stderr)
         exit(1)
 print("Done with reference file downloads\n"
       f"Reference files can be found at '{args.refdir}'\n")
-
-
 if args.paramfile:
     pfile = pathlib.Path(args.paramfile)
     # check if paramfile exists & move old if needed
@@ -140,3 +151,35 @@
         f.write("# local nextflow reference file parameters, generated by `get_refs.py`\n\n")
         for key, value in nf_params.items():
             f.write(f"{key}: {value}\n")
+
+if args.singularity or args.docker:
+    print("getting list of required containers")
+    containers = {}
+    try:
+        container_file = urllib.request.urlopen(containerfile_url)
+    except urllib.error.URLError as e:
+        print(e.reason)
+        print(f"The file download failed for {container_url}, please check the URL for errors")
+        print(f"Is `{args.revision}` a valid release tag?")
+        exit(1)
+
+    # pattern match to find container id & location
+    container_re = re.compile(r'(?P<id>.+_CONTAINER)\s*=\s*([\'"])(?P<loc>.+)\2')
+    for line in container_file:
+        match = container_re.search(line.decode())
+        if match:
+            containers[match.group('id')] = match.group('loc')
+
+# pull docker images
+if args.docker:
+    for loc in containers.values():
+        subprocess.run(["docker", "pull", loc])
+
+# pull singularity images (with cache location)
+if args.singularity:
+    if args.singularity_cache:
+        os.environ['SINGULARITY_CACHEDIR'] = args.singularity_cache
+    for loc in containers.values():
+        subprocess.run(
+            ["singularity", "pull", f"docker://{loc}"],
+            env = os.environ
+        )

From 4f9232f173258d77afff1ae6208475d4a52fecd1 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 14:01:52 -0400
Subject: [PATCH 08/20] default param file name

---
 get_refs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 2bd84d6d..43b70186 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -17,8 +17,8 @@
                     action = "store_true",
                     help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
-                    default="",
-                    help = "nextflow param file to write")
+                    default="local_refs.params",
+                    help = "nextflow param file to write (default: `local_refs.params`)")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
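The container discovery added in PATCH 07 relies on a regular expression that expects `containers.config` entries of the form `SOMETHING_CONTAINER = 'registry/image:tag'`. A self-contained check of that pattern is sketched below; the sample config line (tool name and tag) is made up for illustration and is not taken from the real containers.config:

    # Sketch only: exercises the same regex used in get_refs.py against a made-up config line.
    import re

    container_re = re.compile(r'(?P<id>.+_CONTAINER)\s*=\s*([\'"])(?P<loc>.+)\2')

    sample = "SALMON_CONTAINER = 'quay.io/biocontainers/salmon:1.9.0'"  # hypothetical entry
    match = container_re.search(sample)
    if match:
        # named groups capture the variable name and the quoted image location
        print(match.group("id"), "->", match.group("loc"))
        # SALMON_CONTAINER -> quay.io/biocontainers/salmon:1.9.0

The backreference `\2` requires the closing quote to match the opening one, so both single- and double-quoted entries are handled.
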
From 407db7db48d265d79c1ad785e66f3f8183f8df47 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 14:38:56 -0400
Subject: [PATCH 09/20] Add messages for containers

---
 get_refs.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 43b70186..a72afe73 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -152,7 +152,7 @@
             f.write(f"{key}: {value}\n")
 
 if args.singularity or args.docker:
-    print("getting list of required containers")
+    print("Getting list of required containers")
     containers = {}
     try:
         container_file = urllib.request.urlopen(containerfile_url)
@@ -171,15 +171,22 @@
 # pull docker images
 if args.docker:
+    print("Pulling docker images...")
     for loc in containers.values():
         subprocess.run(["docker", "pull", loc])
+    print("Done pulling docker images\n")
 
-# pull singularity images (with cache location)
+# pull singularity images (to optionally specified cache location)
 if args.singularity:
+    print("Pulling singularity images...")
     if args.singularity_cache:
-        os.environ['SINGULARITY_CACHEDIR'] = args.singularity_cache
+        os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(args.singularity_cache)
     for loc in containers.values():
         subprocess.run(
             ["singularity", "pull", f"docker://{loc}"],
             env = os.environ
         )
+    print("Done pulling singularity images")
+    if args.singularity_cache:
+        print(f"Singularity images located at {os.environ['SINGULARITY_CACHEDIR']}")
+    print()

From 83126004628c36262d9afa54b9e5f05be99c8b28 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 14:39:19 -0400
Subject: [PATCH 10/20] make star index optional

---
 get_refs.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/get_refs.py b/get_refs.py
index a72afe73..77c0f78a 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -23,6 +23,9 @@
                     default="main",
                     metavar = "vX.X.X",
                     help = "tag for a specific workflow version (defaults to latest revision)")
+parser.add_argument("--star_index",
+                    action = "store_true",
+                    help = "get STAR index (required for genetic demultiplexing)")
 parser.add_argument("--docker",
                     action = "store_true",
                     help = "pull and cache images for docker")
@@ -100,7 +103,8 @@
 ]
 
 star_dir = genome_dir/ "star_index/Homo_sapiens.GRCh38.104.star_idx"
-ref_paths += [star_dir / f for f in star_index_files]
+if args.star_index:
+    ref_paths += [star_dir / f for f in star_index_files]
 
 # get barcode file paths
 barcode_dir = pathlib.Path("barcodes/10X")

From 72d343ad9c875f4c00b40c4324929141f135ec11 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 15:09:29 -0400
Subject: [PATCH 11/20] Force singularity

---
 get_refs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/get_refs.py b/get_refs.py
index 77c0f78a..db67ff5d 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -187,7 +187,7 @@
         os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(args.singularity_cache)
     for loc in containers.values():
         subprocess.run(
-            ["singularity", "pull", f"docker://{loc}"],
+            ["singularity", "pull", "--force", f"docker://{loc}"],
             env = os.environ
         )
     print("Done pulling singularity images")

From aaab20ff9d01f951ea2c5b1c2f35c89db870ab43 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 15:28:32 -0400
Subject: [PATCH 12/20] missing makeJson

---
 modules/spaceranger.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/spaceranger.nf b/modules/spaceranger.nf
index 253e0409..36c7fabc 100644
--- a/modules/spaceranger.nf
+++ b/modules/spaceranger.nf
@@ -16,6 +16,7 @@ process spaceranger{
   script:
     out_id = file(meta.spaceranger_results_dir).name
     meta.cellranger_index = index.fileName
+    meta_json = Utils.makeJson(meta)
     """
     spaceranger count \
       --id=${out_id} \
From f1d36bbd599a092a79e6180e1a3295b5c29254fc Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 27 Oct 2022 13:05:42 -0400
Subject: [PATCH 13/20] Add cell ranger option & download

---
 get_refs.py | 45 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index db67ff5d..7f188f8c 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -17,7 +17,7 @@
                     action = "store_true",
                     help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
-                    default="local_refs.params",
+                    default="local_refs.yaml",
                     help = "nextflow param file to write (default: `local_refs.params`)")
 parser.add_argument("--revision", type=str,
                     default="main",
@@ -26,6 +26,9 @@
 parser.add_argument("--star_index",
                     action = "store_true",
                     help = "get STAR index (required for genetic demultiplexing)")
+parser.add_argument("--cellranger_index",
+                    action = "store_true",
+                    help = "get Cell Ranger index (required for spatial data)")
 parser.add_argument("--docker",
                     action = "store_true",
                     help = "pull and cache images for docker")
@@ -34,8 +37,7 @@
                     help = "pull and cache images for singularity")
 parser.add_argument("--singularity_cache", type=str,
                     metavar = "CACHE_DIR",
-                    help = "cache directory for singularity"
-)
+                    help = "cache directory for singularity")
 args = parser.parse_args()
 
 # scpca-nf resource urls
@@ -53,7 +55,6 @@
     "annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv",
     "annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
 ]
-
 ref_paths = [genome_dir / sd for sd in ref_subdirs]
 
 # salmon index files
@@ -102,11 +103,37 @@
     "transcriptInfo.tab"
 ]
 
-star_dir = genome_dir/ "star_index/Homo_sapiens.GRCh38.104.star_idx"
+star_dir = genome_dir / "star_index/Homo_sapiens.GRCh38.104.star_idx"
 if args.star_index:
     ref_paths += [star_dir / f for f in star_index_files]
 
-# get barcode file paths
+# Cell Ranger index files
+cr_index_files = [
+    "reference.json",
+    "fasta/genome.fa",
+    "fasta/genome.fa.fai",
+    "genes/genes.gtf.gz",
+    "star/chrLength.txt",
+    "star/chrName.txt",
+    "star/chrNameLength.txt",
+    "star/chrStart.txt",
+    "star/exonGeTrInfo.tab",
+    "star/exonInfo.tab",
+    "star/geneInfo.tab",
+    "star/Genome",
+    "star/genomeParameters.txt",
+    "star/SA",
+    "star/SAindex",
+    "star/sjdbInfo.txt",
+    "star/sjdbList.fromGTF.out.tab",
+    "star/sjdbList.out.tab",
+    "star/transcriptInfo.tab"
+]
+cr_dir = genome_dir / "cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
+if args.cellranger_index:
+    ref_paths += [cr_dir / f for f in cr_index_files]
+
+# barcode file paths
 barcode_dir = pathlib.Path("barcodes/10X")
 barcode_files = [
     "3M-february-2018.txt",
@@ -118,7 +145,7 @@
 
 ref_paths += [barcode_dir / f for f in barcode_files]
 
-# download all the files and put them in the correct locations
+## download all the files and put them in the correct locations ##
 print("Downloading reference files...")
 for path in ref_paths[0:2]:
     outfile = args.refdir / path

From 1a1a2af1b770ca1a1d6dbfad4ef65f0fde312527 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 27 Oct 2022 13:23:06 -0400
Subject: [PATCH 14/20] remove bash script

---
 get_refs.sh | 101 ----------------------------------------------------
 1 file changed, 101 deletions(-)
 delete mode 100755 get_refs.sh

diff --git a/get_refs.sh b/get_refs.sh
deleted file mode 100755
index 41210840..00000000
--- a/get_refs.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-dest_dir=${1:-"scpca-references"}
-
-
-containerfile_url="https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config"
-aws_root="https://scpca-references.s3.amazonaws.com"
-ref_dir="homo_sapiens/ensembl-104"
-ref_paths=(
-  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
-  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
-)
-
-salmon_index_files=(
-  "complete_ref_lens.bin"
-  "ctable.bin"
-  "ctg_offsets.bin"
-  "duplicate_clusters.tsv"
-  "info.json"
-  "mphf.bin"
-  "pos.bin"
-  "pre_indexing.log"
-  "rank.bin"
-  "ref_indexing.log"
-  "refAccumLengths.bin"
-  "reflengths.bin"
-  "refseq.bin"
-  "seq.bin"
-  "versionInfo.json"
-)
-
-salmon_index_dirs=(
-  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
-  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
-)
-for dir in ${salmon_index_dirs[@]}
-do
-  for file in ${salmon_index_files[@]}
-  do
-    ref_paths+=("${dir}/${file}")
-  done
-done
-
-
-star_index_files=(
-  "chrLength.txt"
-  "chrName.txt"
-  "chrNameLength.txt"
-  "chrStart.txt"
-  "exonGeTrInfo.tab"
-  "exonInfo.tab"
-  "geneInfo.tab"
-  "Genome"
-  "genomeParameters.txt"
-  "Log.out"
-  "SA"
-  "SAindex"
-  "sjdbInfo.txt"
-  "sjdbList.fromGTF.out.tab"
-  "sjdbList.out.tab"
-  "transcriptInfo.tab"
-)
-
-star_dir="${ref_dir}/star_index/Homo_sapiens.GRCh38.104.star_idx"
-for file in ${star_index_files[@]}
-do
-  ref_paths+=("${star_dir}/${file}")
-done
-
-
-barcode_files=(
-  "3M-february-2018.txt"
-  "737K-august-2016.txt"
-  "cellranger_mit_license.txt"
-  "visium-v1.txt"
-  "visium-v1.txt"
-)
-
-barcode_dir="barcodes/10X"
-for file in ${barcode_files[@]}
-do
-  ref_paths+=("${barcode_dir}/${file}")
-done
-
-for path in ${ref_paths[@]}
-do
-  echo "Getting $path"
-  curl -s --create-dirs "$aws_root/$path" -o "$dest_dir/$path"
-done
-
-
-containers=`curl -s https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config \
-  | grep CONTAINER \
-  | cut -d"'" -f 2 \
-  | grep -v "^$" `

From c910a14a7bcfe816d81e2bbeb50cd8015ca5f176 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Fri, 28 Oct 2022 14:18:27 -0400
Subject: [PATCH 15/20] parse reference file locations from repo

---
 get_refs.py | 108 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 79 insertions(+), 29 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 7f188f8c..ff850ffc 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -2,13 +2,14 @@
 
 import argparse
 import os
-import pathlib
 import re
 import shutil
 import subprocess
 import sys
 import urllib.request
 
+from pathlib import Path
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--refdir", type=str,
                     default="scpca-references",
@@ -16,9 +17,9 @@
 parser.add_argument("--replace",
                     action = "store_true",
                     help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
-                    default="local_refs.yaml",
-                    help = "nextflow param file to write (default: `local_refs.params`)")
+                    default="localref_params.yaml",
+                    help = "nextflow param file to write (default: `localref_params.yaml`)")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
@@ -41,19 +42,63 @@
 args = parser.parse_args()
 
 # scpca-nf resource urls
-aws_root = "https://scpca-references.s3.amazonaws.com"
+reffile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/reference_paths.config"
 containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
 
-# genome reference files
-assembly = "Homo_sapiens.GRCh38.104"
-genome_dir = pathlib.Path("homo_sapiens/ensembl-104")
-ref_subdirs =[
-    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
-    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai",
-    "annotation/Homo_sapiens.GRCh38.104.gtf.gz",
-    "annotation/Homo_sapiens.GRCh38.104.mitogenes.txt",
-    "annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv",
-    "annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+# download reference file
+print("Getting list of required reference files")
+refs = {}
+try:
+    ref_file = urllib.request.urlopen(reffile_url)
+except urllib.error.URLError as e:
+    print(e.reason)
+    print(f"The file download failed for {reffile_url}, please check the URL for errors")
+    print(f"Is `{args.revision}` a valid release tag?")
+    exit(1)
+
+# parse reference file
+ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
+for line in ref_file:
+    match = ref_re.search(line.decode())
+    if match:
+        refs[match.group('id')] = match.group('loc')
+
+# regular expressions for parameter expansion
+root_re = re.compile(r'\$\{?(params.)?ref_rootdir\}?$')
+refdir_re = re.compile(r'\$\{?(params.)?ref_dir\}?$')
+
+# get assembly and root location
+assembly = refs.get("assembly", "NA")
+root_parts = refs.get("ref_rootdir").split('://')
+if root_parts[0] == 's3':
+    url_root = f"https://{root_parts[1]}.s3.amazonaws.com"
+elif root_parts[0] in ['http', 'https', 'ftp']:
+    url_root = refs.get("ref_rootdir")
+else:
+    print("The `ref_rootdir` is not a supported remote location.")
+    exit(1)
+
+
+
+# set the base directory (usually corresponding to a genome version)
+genome_dir = Path(refs.get("ref_dir"))
+# remove the first element if it is a variable
+if root_re.match(genome_dir.parts[0]):
+    genome_dir = genome_dir.relative_to(genome_dir.parts[0])
+
+# single-file references
+ref_keys =[
+    "ref_fasta",
+    "ref_fasta_index",
+    "ref_gtf",
+    "mito_file",
+    "t2g_3col_path",
+    "t2g_bulk_path"
 ]
-ref_paths = [genome_dir / sd for sd in ref_subdirs]
+ref_paths = [Path(refs.get(k)) for k in ref_keys]
+# replace initial part of path if it is `$params.ref_dir` or similar
+ref_paths = [genome_dir / p.relative_to(p.parts[0])
+             if refdir_re.match(p.parts[0]) else p
+             for p in ref_paths]
 
 # salmon index files
 salmon_index_files = [
@@ -75,12 +120,14 @@
     "seq.bin",
     "versionInfo.json"
 ]
-
-salmon_index_dirs = [
-    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome",
-    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+salmon_keys = [
+    "splici_index",
+    "bulk_index"
 ]
-for sa_dir in salmon_index_dirs:
+for k in salmon_keys:
+    sa_dir = Path(refs.get(k))
+    if refdir_re.match(sa_dir.parts[0]):
+        sa_dir = genome_dir / sa_dir.relative_to(sa_dir.parts[0])
     ref_paths += [sa_dir / f for f in salmon_index_files]
 
 # star index files
@@ -102,8 +149,9 @@
     "sjdbList.out.tab",
     "transcriptInfo.tab"
 ]
-
-star_dir = genome_dir / "star_index/Homo_sapiens.GRCh38.104.star_idx"
+star_dir = Path(refs.get("star_index"))
+if refdir_re.match(star_dir.parts[0]):
+    star_dir = genome_dir / star_dir.relative_to(star_dir.parts[0])
 if args.star_index:
     ref_paths += [star_dir / f for f in star_index_files]
 
@@ -128,12 +176,13 @@
     "star/sjdbList.out.tab",
     "star/transcriptInfo.tab"
 ]
-cr_dir = genome_dir / "cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
+cr_dir = Path(refs.get("cellranger_index"))
+if refdir_re.match(cr_dir.parts[0]):
+    cr_dir = genome_dir / cr_dir.relative_to(cr_dir.parts[0])
 if args.cellranger_index:
     ref_paths += [cr_dir / f for f in cr_index_files]
 
-# barcode file paths
-barcode_dir = pathlib.Path("barcodes/10X")
+# barcode files
 barcode_files = [
     "3M-february-2018.txt",
     "737K-august-2016.txt",
@@ -141,12 +190,14 @@
     "visium-v1.txt",
     "visium-v2.txt"
 ]
-
+barcode_dir = Path(refs.get("barcode_dir"))
+if root_re.match(barcode_dir.parts[0]):
+    barcode_dir = barcode_dir.relative_to(barcode_dir.parts[0])
 ref_paths += [barcode_dir / f for f in barcode_files]
 
 ## download all the files and put them in the correct locations ##
 print("Downloading reference files...")
-for path in ref_paths[0:2]:
+for path in ref_paths:
     outfile = args.refdir / path
     if outfile.exists() and not args.replace:
         continue
@@ -155,7 +205,7 @@
     # make parents
     outfile.parent.mkdir(exist_ok=True, parents = True)
     # download and write
-    file_url = f"{aws_root}/{path}"
+    file_url = f"{url_root}/{path}"
     try:
         urllib.request.urlretrieve(file_url, outfile)
     except urllib.error.URLError as e:
@@ -168,7 +218,7 @@
 print("Done with reference file downloads\n"
       f"Reference files can be found at '{args.refdir}'\n")
 
+# write param file if requested
 if args.paramfile:
-    pfile = pathlib.Path(args.paramfile)
+    pfile = Path(args.paramfile)
     # check if paramfile exists & move old if needed
     if pfile.exists():
         print(f"A file already exists at `{pfile}`, renaming previous file to `{pfile.name}.bak`")
From 293fded3e23ed3a88f5730632c265e21b755ace8 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:25:31 -0400
Subject: [PATCH 16/20] Improve root URI parsing

---
 get_refs.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index ff850ffc..2af1252f 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -69,13 +69,19 @@
 # get assembly and root location
 assembly = refs.get("assembly", "NA")
-root_parts = refs.get("ref_rootdir").split('://')
+# split out protocol from the root URI
+root_parts = refs.get("ref_rootdir").split('://', maxsplit = 1)
 if root_parts[0] == 's3':
-    url_root = f"https://{root_parts[1]}.s3.amazonaws.com"
+    # if S3, convert bucket path to https:// url
+    bucket_path = root_parts[1].split("/", maxsplit = 1)
+    url_root = f"https://{bucket_path[0]}.s3.amazonaws.com"
+    if len(bucket_path) > 1:
+        url_root += f"/{bucket_path[1]}"
 elif root_parts[0] in ['http', 'https', 'ftp']:
+    # otherwise, just get the location
     url_root = refs.get("ref_rootdir")
 else:
-    print("The `ref_rootdir` is not a supported remote location.")
+    print("`ref_rootdir` is not a supported remote location.")
     exit(1)
 
 

From f2d71dce8dd62b81142e60e56ede61557f431fe5 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:26:13 -0400
Subject: [PATCH 17/20] Updates from review (semicolons, arg names)

---
 get_refs.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 2af1252f..5c9b38c2 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -14,12 +14,12 @@
 parser.add_argument("--refdir", type=str,
                     default="scpca-references",
                     help = "destination directory for downloaded reference files")
-parser.add_argument("--replace",
-                    action = "store_true",
-                    help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
                     default="localref_params.yaml",
-                    help = "nextflow param file to write (default: `localref_params.yaml`)")
+                    help = "path to nextflow param file to write (default: `localref_params.yaml`)")
+parser.add_argument("--overwrite_refs",
+                    action = "store_true",
+                    help = "replace previously downloaded files")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
@@ -51,7 +51,7 @@
     ref_file = urllib.request.urlopen(reffile_url)
 except urllib.error.URLError as e:
     print(e.reason)
-    print(f"The file download failed for {reffile_url}, please check the URL for errors")
+    print(f"The file download failed for {reffile_url}; please check the URL for errors")
     print(f"Is `{args.revision}` a valid release tag?")
     exit(1)
 
@@ -202,10 +202,10 @@
 ref_paths += [barcode_dir / f for f in barcode_files]
 
 ## download all the files and put them in the correct locations ##
-print("Downloading reference files...")
+print("Downloading reference files... (This might take a while)")
 for path in ref_paths:
     outfile = args.refdir / path
-    if outfile.exists() and not args.replace:
+    if outfile.exists() and not args.overwrite_refs:
         continue
     print(f"Getting {path}")
     # make parents
@@ -216,7 +216,7 @@
         urllib.request.urlretrieve(file_url, outfile)
     except urllib.error.URLError as e:
         print(e.reason)
-        print(f"The file download failed for {file_url}, please check the URL for errors",
+        print(f"The file download failed for {file_url}; please check the URL for errors",
               file = sys.stderr)
         exit(1)
 print("Done with reference file downloads\n"
@@ -227,7 +227,7 @@
     pfile = Path(args.paramfile)
     # check if paramfile exists & move old if needed
     if pfile.exists():
-        print(f"A file already exists at `{pfile}`, renaming previous file to `{pfile.name}.bak`")
+        print(f"A file already exists at `{pfile}`; renaming previous file to `{pfile.name}.bak`")
         shutil.move(pfile, str(pfile) + ".bak")
     # create parameter dictionary
     nf_params = {
@@ -247,7 +247,7 @@
         container_file = urllib.request.urlopen(containerfile_url)
     except urllib.error.URLError as e:
         print(e.reason)
-        print(f"The file download failed for {container_url}, please check the URL for errors")
+        print(f"The file download failed for {container_url}; please check the URL for errors")
         print(f"Is `{args.revision}` a valid release tag?")
         exit(1)
 
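PATCH 15 strips a leading `${params.ref_dir}`-style variable from each configured path, and PATCH 16 converts an `s3://bucket[/prefix]` root URI into a plain HTTPS URL for download. The two steps are condensed into one self-contained sketch below; the helper function names are illustrative, the mitogenes path is taken from the config above, and the bucket/prefix case is only an assumed example:

    # Sketch only: mirrors the refdir-stripping and s3-to-https logic used in get_refs.py.
    import re
    from pathlib import Path

    refdir_re = re.compile(r'\$\{?(params.)?ref_dir\}?$')
    genome_dir = Path("homo_sapiens/ensembl-104")

    def strip_refdir(raw: str) -> Path:
        """Replace a leading ${params.ref_dir} component with the local genome directory."""
        p = Path(raw)
        if refdir_re.match(p.parts[0]):
            p = genome_dir / p.relative_to(p.parts[0])
        return p

    def root_to_url(ref_rootdir: str) -> str:
        """Convert an s3:// root into an https URL; pass through http(s)/ftp roots."""
        scheme, _, rest = ref_rootdir.partition("://")
        if scheme == "s3":
            bucket, _, prefix = rest.partition("/")
            url = f"https://{bucket}.s3.amazonaws.com"
            return f"{url}/{prefix}" if prefix else url
        if scheme in ("http", "https", "ftp"):
            return ref_rootdir
        raise ValueError("`ref_rootdir` is not a supported remote location.")

    print(strip_refdir("${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"))
    # homo_sapiens/ensembl-104/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt
    print(root_to_url("s3://scpca-references"))
    # https://scpca-references.s3.amazonaws.com
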
From 9ba2bcb14538ddaf1f44dfb463741da568b1419e Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:28:45 -0400
Subject: [PATCH 18/20] move refs intialization

---
 get_refs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/get_refs.py b/get_refs.py
index 5c9b38c2..a23d1f3d 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -47,7 +47,6 @@
 # download reference file
 print("Getting list of required reference files")
-refs = {}
 try:
     ref_file = urllib.request.urlopen(reffile_url)
 except urllib.error.URLError as e:
     print(e.reason)
@@ -56,6 +55,7 @@
     exit(1)
 
 # parse reference file
+refs = {}
 ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
 for line in ref_file:
     match = ref_re.search(line.decode())
     if match:
From 154b1b6dcf6b737516ee00e3d60d9b846c4de001 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:47:15 -0400
Subject: [PATCH 19/20] a few more comments

---
 get_refs.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index a23d1f3d..b89da388 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -56,6 +56,8 @@
     exit(1)
 
 # parse reference file
+# gets all of the `param` variables that are set in `reference_paths.config`
+# and stores then in a dict
 refs = {}
 ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
 for line in ref_file:
@@ -93,6 +95,7 @@
     genome_dir = genome_dir.relative_to(genome_dir.parts[0])
 
 # single-file references
+# the keys here are the param variables we will be downloading
 ref_keys =[
     "ref_fasta",
     "ref_fasta_index",
@@ -107,7 +110,7 @@
     if refdir_re.match(p.parts[0]) else p
     for p in ref_paths]
 
-# salmon index files
+# salmon index files within index dir (must be downloaded individually through http)
 salmon_index_files = [
     "complete_ref_lens.bin",
     "ctable.bin",
@@ -125,6 +128,7 @@
     "seq.bin",
     "versionInfo.json"
 ]
+# param variables that are salmon index directories
 salmon_keys = [
     "splici_index",
     "bulk_index"
@@ -135,7 +139,7 @@
         sa_dir = genome_dir / sa_dir.relative_to(sa_dir.parts[0])
     ref_paths += [sa_dir / f for f in salmon_index_files]
 
-# star index files
+# star index files within index dir (must be downloaded individually through http)
 star_index_files = [
     "chrLength.txt",
     "chrName.txt",
@@ -160,7 +164,7 @@
 if args.star_index:
     ref_paths += [star_dir / f for f in star_index_files]
 
-# Cell Ranger index files
+# Cell Ranger index files within index dir (must be downloaded individually through http)
 cr_index_files = [
     "reference.json",
     "fasta/genome.fa",
@@ -188,7 +192,7 @@
 if args.cellranger_index:
     ref_paths += [cr_dir / f for f in cr_index_files]
 
-# barcode files
+# barcode files on S3 within the barcode_dir (must be downloaded individually through http)
 barcode_files = [
     "3M-february-2018.txt",
     "737K-august-2016.txt",
@@ -258,14 +262,14 @@
         if match:
             containers[match.group('id')] = match.group('loc')
 
-# pull docker images
+# pull docker images if requested
 if args.docker:
     print("Pulling docker images...")
     for loc in containers.values():
         subprocess.run(["docker", "pull", loc])
     print("Done pulling docker images\n")
 
-# pull singularity images (to optionally specified cache location)
+# pull singularity images if requested (to optionally specified cache location)
 if args.singularity:
     print("Pulling singularity images...")
     if args.singularity_cache:
From d64f0753024d67146cc703d7518f253a0da7bef8 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Tue, 1 Nov 2022 09:29:22 -0400
Subject: [PATCH 20/20] Add header comments

---
 get_refs.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/get_refs.py b/get_refs.py
index b89da388..d66804f5 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -1,5 +1,13 @@
 #!/usr/bin/env python3
 
+# Download reference files for the scpca-nf nextflow workflow to enable running
+# the workflow without internet access by compute nodes. Optionally pulls
+# container images for singularity or docker.
+#
+# Example usage:
+# python3 get_refs.py --singularity
+
+
 import argparse
 import os
 import re
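With the header comments in place, the script is meant to be run on a machine with internet access before launching the workflow offline. Since the docker and singularity pulls in the script shell out to those tools, one optional pre-flight check is to verify the executables exist before attempting any pulls. The sketch below is not part of get_refs.py; it is only an assumed add-on showing how such a check could look:

    # Sketch only: a hypothetical pre-flight check before calling get_refs.py with
    # --docker or --singularity; not part of the workflow itself.
    import shutil
    import sys

    def require_tool(name: str) -> None:
        """Exit with a message if the named executable is not on PATH."""
        if shutil.which(name) is None:
            print(f"`{name}` was not found on PATH; install it or drop the corresponding flag.",
                  file=sys.stderr)
            sys.exit(1)

    # e.g. before running `python3 get_refs.py --singularity`:
    # require_tool("singularity")
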