From 472fc5c737736b9e3708b6a439188730dc61ab30 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 8 Sep 2022 10:45:38 -0400
Subject: [PATCH 01/20] add script to download reference files

---
 get_refs.sh | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100755 get_refs.sh

diff --git a/get_refs.sh b/get_refs.sh
new file mode 100755
index 00000000..d3d2d44d
--- /dev/null
+++ b/get_refs.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+set -euo pipefail
+
+dest_dir=${1:-"scpca-references"}
+
+aws_root="https://scpca-references.s3.amazonaws.com"
+ref_dir="homo_sapiens/ensembl-104"
+ref_paths=(
+  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
+  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
+  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+)
+
+salmon_index_files=(
+  "complete_ref_lens.bin"
+  "ctable.bin"
+  "ctg_offsets.bin"
+  "duplicate_clusters.tsv"
+  "info.json"
+  "mphf.bin"
+  "pos.bin"
+  "pre_indexing.log"
+  "rank.bin"
+  "ref_indexing.log"
+  "refAccumLengths.bin"
+  "reflengths.bin"
+  "refseq.bin"
+  "seq.bin"
+  "versionInfo.json"
+)
+
+salmon_index_dirs=(
+  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
+  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+)
+for dir in ${salmon_index_dirs[@]}
+do
+  for file in ${salmon_index_files[@]}
+  do
+    ref_paths+=("${dir}/${file}")
+  done
+done
+
+
+star_index_files=(
+  "chrLength.txt"
+  "chrName.txt"
+  "chrNameLength.txt"
+  "chrStart.txt"
+  "exonGeTrInfo.tab"
+  "exonInfo.tab"
+  "geneInfo.tab"
+  "Genome"
+  "genomeParameters.txt"
+  "Log.out"
+  "SA"
+  "SAindex"
+  "sjdbInfo.txt"
+  "sjdbList.fromGTF.out.tab"
+  "sjdbList.out.tab"
+  "transcriptInfo.tab"
+)
+
+star_dir="${ref_dir}/star_index/Homo_sapiens.GRCh38.104.star_idx"
+for file in ${star_index_files[@]}
+do
+  ref_paths+=("${star_dir}/${file}")
+done
+
+
+barcode_files=(
+  "3M-february-2018.txt"
+  "737K-august-2016.txt"
+  "cellranger_mit_license.txt"
+  "visium-v1.txt"
+  "visium-v1.txt"
+)
+
+barcode_dir="barcodes/10X"
+for file in ${barcode_files[@]}
+do
+  ref_paths+=("${barcode_dir}/${file}")
+done
+
+for path in ${ref_paths[@]}
+do
+  echo "Getting $path"
+  curl -s --create-dirs "$aws_root/$path" -o "$dest_dir/$path"
+done

From 5ed43b4bef6ae7185d4119629eb111b0584183e3 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 8 Sep 2022 10:46:28 -0400
Subject: [PATCH 02/20] update reference files to use params

should make local files a bit easier
---
 config/reference_paths.config | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/config/reference_paths.config b/config/reference_paths.config
index 8b9e9f0c..76d97fec 100644
--- a/config/reference_paths.config
+++ b/config/reference_paths.config
@@ -1,20 +1,21 @@
-// reference params and files 
+// reference params and files
 assembly          = 'Homo_sapiens.GRCh38.104'
-ref_dir           = 's3://scpca-references/homo_sapiens/ensembl-104'
-ref_fasta         = "$ref_dir/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
-ref_fasta_index   = "$ref_dir/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
-ref_gtf           = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
+ref_rootdir       = 's3://scpca-references'
+ref_dir           = "${params.ref_rootdir}/homo_sapiens/ensembl-104"
+ref_fasta         = "${params.ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
+ref_fasta_index   = "${params.ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
+ref_gtf           = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
 
-// index files 
-splici_index      = "$ref_dir/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
-bulk_index        = "$ref_dir/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
-cellranger_index  = "$ref_dir/cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
-star_index        = "$ref_dir/star_index/Homo_sapiens.GRCh38.104.star_idx"
+// index files
+splici_index      = "${params.ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
+bulk_index        = "${params.ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+cellranger_index  = "${params.ref_dir}/cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
+star_index        = "${params.ref_dir}/star_index/Homo_sapiens.GRCh38.104.star_idx"
 
-// annotation files 
-mito_file         = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
-t2g_3col_path     = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
-t2g_bulk_path     = "$ref_dir/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+// annotation files
+mito_file         = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
+t2g_3col_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
+t2g_bulk_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
 
 // barcode files
-barcode_dir       = 's3://scpca-references/barcodes/10X'
+barcode_dir       = '${params.ref_rootdir}/barcodes/10X'

From f26a50849dd9d0fa5ca0dfd36c5001f6aeb8287b Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 8 Sep 2022 12:20:41 -0400
Subject: [PATCH 03/20] Fix barcode file

---
 config/reference_paths.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/reference_paths.config b/config/reference_paths.config
index 76d97fec..7d54fb8d 100644
--- a/config/reference_paths.config
+++ b/config/reference_paths.config
@@ -18,4 +18,4 @@ t2g_3col_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced
 t2g_bulk_path     = "${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
 
 // barcode files
-barcode_dir       = '${params.ref_rootdir}/barcodes/10X'
+barcode_dir       = "${params.ref_rootdir}/barcodes/10X"

From 9b8c91225a3c95bbbb3b46375a20f76e704ea8f6 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Fri, 30 Sep 2022 12:57:03 -0400
Subject: [PATCH 04/20] add containerfile

---
 get_refs.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/get_refs.sh b/get_refs.sh
index d3d2d44d..41210840 100755
--- a/get_refs.sh
+++ b/get_refs.sh
@@ -3,6 +3,8 @@ set -euo pipefail
 
 dest_dir=${1:-"scpca-references"}
 
+
+containerfile_url="https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config"
 aws_root="https://scpca-references.s3.amazonaws.com"
 ref_dir="homo_sapiens/ensembl-104"
 ref_paths=(
@@ -91,3 +93,9 @@ do
   echo "Getting $path"
   curl -s --create-dirs "$aws_root/$path" -o "$dest_dir/$path"
 done
+
+
+containers=`curl -s https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config \
+  | grep CONTAINER \
+  | cut -d"'" -f 2 \
+  | grep -v "^$" `
From 850772d2bfa195ce2a98a158504379690fc39263 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 11:25:07 -0400
Subject: [PATCH 05/20] Add python reference file

---
 get_refs.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100755 get_refs.py

diff --git a/get_refs.py b/get_refs.py
new file mode 100755
index 00000000..310ef0f3
--- /dev/null
+++ b/get_refs.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+import argparse
+import pathlib
+from signal import SIG_DFL
+import sys
+import urllib.request
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--refdir", type=str,
+                    default="scpca-references",
+                    help = "destination directory for downloaded reference files")
+parser.add_argument("--replace", action = argparse.BooleanOptionalAction,
+                    default = False,
+                    help = "replace previously downloaded files")
+parser.add_argument("--revision", type=str,
+                    default="main",
+                    metavar = "vX.X.X",
+                    help = "tag for a specific workflow version (defaults to latest revision)")
+
+args = parser.parse_args()
+
+aws_root = "https://scpca-references.s3.amazonaws.com"
+containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
+
+# genome reference files
+genome_dir = pathlib.Path("homo_sapiens/ensembl-104")
+ref_subdirs =[
+    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
+    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai",
+    "annotation/Homo_sapiens.GRCh38.104.gtf.gz",
+    "annotation/Homo_sapiens.GRCh38.104.mitogenes.txt",
+    "annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv",
+    "annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+]
+
+ref_paths = [genome_dir / sd for sd in ref_subdirs]
+
+# salmon index files
+salmon_index_files = [
+    "complete_ref_lens.bin",
+    "ctable.bin",
+    "ctg_offsets.bin",
+    "duplicate_clusters.tsv",
+    "info.json",
+    "mphf.bin",
+    "pos.bin",
+    "pre_indexing.log",
+    "rank.bin",
+    "ref_indexing.log",
+    "refAccumLengths.bin",
+    "reflengths.bin",
+    "refseq.bin",
+    "seq.bin",
+    "versionInfo.json"
+]
+
+salmon_index_dirs = [
+    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome",
+    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+]
+for sa_dir in salmon_index_dirs:
+    ref_paths += [sa_dir / f for f in salmon_index_files]
+
+# star index files
+star_index_files = [
+    "chrLength.txt",
+    "chrName.txt",
+    "chrNameLength.txt",
+    "chrStart.txt",
+    "exonGeTrInfo.tab",
+    "exonInfo.tab",
+    "geneInfo.tab",
+    "Genome",
+    "genomeParameters.txt",
+    "Log.out",
+    "SA",
+    "SAindex",
+    "sjdbInfo.txt",
+    "sjdbList.fromGTF.out.tab",
+    "sjdbList.out.tab",
+    "transcriptInfo.tab"
+]
+
+star_dir = genome_dir/ "star_index/Homo_sapiens.GRCh38.104.star_idx"
+ref_paths += [star_dir / f for f in star_index_files]
+
+# get barcode file paths
+barcode_dir = pathlib.Path("barcodes/10X")
+barcode_files = [
+    "3M-february-2018.txt",
+    "737K-august-2016.txt",
+    "cellranger_mit_license.txt",
+    "visium-v1.txt",
+    "visium-v2.txt"
+]
+
+ref_paths += [barcode_dir / f for f in barcode_files]
+
+# download all the files and put them in the correct locations
+print("Downloading reference files:")
+for path in ref_paths[0:2]:
+    outfile = args.refdir / path
+    if outfile.exists() and not args.replace:
+        continue
+    print(f"Getting {path}")
+    # make parents
+    outfile.parent.mkdir(exist_ok=True, parents = True)
+    # download and write
+    file_url = f"{aws_root}/{path}"
+    try:
+        urllib.request.urlretrieve(file_url, outfile)
+    except urllib.error.URLError:
+        print(f"The file download failed for {file_url}, please check the URL for errors",
+              file = sys.stderr)
+        exit(1)
+
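The download loop introduced above skips files that already exist unless `--replace` is passed, creates parent directories before writing, and stops on the first failed URL. A minimal standalone sketch of that same skip/mkdir/retrieve pattern is shown below; the function name and example paths are illustrative only and are not part of the workflow:

    # Sketch only: mirrors the skip/replace/download pattern used in get_refs.py above.
    import pathlib
    import sys
    import urllib.error
    import urllib.request

    def fetch(url: str, outfile: pathlib.Path, replace: bool = False) -> None:
        """Download url to outfile, skipping files that already exist unless replace is True."""
        if outfile.exists() and not replace:
            return
        # create any missing parent directories before writing the file
        outfile.parent.mkdir(exist_ok=True, parents=True)
        try:
            urllib.request.urlretrieve(url, outfile)
        except urllib.error.URLError as e:
            print(f"Download failed for {url}: {e.reason}", file=sys.stderr)
            raise

    # Example call (hypothetical destination directory):
    # fetch("https://scpca-references.s3.amazonaws.com/barcodes/10X/3M-february-2018.txt",
    #       pathlib.Path("scpca-references/barcodes/10X/3M-february-2018.txt"))
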
From 48ed950f8b95424cc088eab12375a62f10a2a64c Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 12:09:35 -0400
Subject: [PATCH 06/20] write params file

---
 get_refs.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 310ef0f3..6051f509 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
+import os
 import pathlib
-from signal import SIG_DFL
+import shutil
 import sys
 import urllib.request
 
@@ -10,9 +11,12 @@
 parser.add_argument("--refdir", type=str,
                     default="scpca-references",
                     help = "destination directory for downloaded reference files")
-parser.add_argument("--replace", action = argparse.BooleanOptionalAction,
-                    default = False,
+parser.add_argument("--replace",
+                    action = "store_true",
                     help = "replace previously downloaded files")
+parser.add_argument("--paramfile", type=str,
+                    default="",
+                    help = "nextflow param file to write")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
@@ -24,6 +28,7 @@
 containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
 
 # genome reference files
+assembly = "Homo_sapiens.GRCh38.104"
 genome_dir = pathlib.Path("homo_sapiens/ensembl-104")
 ref_subdirs =[
     "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
@@ -98,7 +103,7 @@
 ref_paths += [barcode_dir / f for f in barcode_files]
 
 # download all the files and put them in the correct locations
-print("Downloading reference files:")
+print("Downloading reference files...")
 for path in ref_paths[0:2]:
     outfile = args.refdir / path
     if outfile.exists() and not args.replace:
@@ -114,4 +119,24 @@
         print(f"The file download failed for {file_url}, please check the URL for errors",
               file = sys.stderr)
         exit(1)
 
+print("Done with reference file downloads\n"
+      f"Reference files can be found at '{args.refdir}'\n")
+
+
+
+if args.paramfile:
+    pfile = pathlib.Path(args.paramfile)
+    # check if paramfile exists & move old if needed
+    if pfile.exists():
+        print(f"A file already exists at `{pfile}`, renaming previous file to `{pfile.name}.bak`")
+        shutil.move(pfile, str(pfile) + ".bak")
+    # create parameter dictionary
+    nf_params = {
+        'assembly': assembly,
+        'ref_rootdir': os.path.abspath(args.refdir)
+    }
+    with open(pfile, 'w') as f:
+        f.write("# local nextflow reference file parameters, generated by `get_refs.py`\n\n")
+        for key, value in nf_params.items():
+            f.write(f"{key}: {value}\n")
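The param file written by this commit is a set of plain `key: value` lines (YAML-compatible), which is the format a Nextflow run would typically consume via a `-params-file` option. A small sketch of reading such a file back into a dict, without any YAML dependency, is below; the file name is just an example and not a fixed output of the script at this point in the series:

    # Sketch only: parses the simple "key: value" lines emitted by the paramfile writer above.
    def read_params(path):
        params = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue  # skip comments and blank lines
                key, _, value = line.partition(":")
                params[key.strip()] = value.strip()
        return params

    # Example (hypothetical file name):
    # read_params("localref_params.yaml")
    # -> {'assembly': 'Homo_sapiens.GRCh38.104', 'ref_rootdir': '/abs/path/to/scpca-references'}
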
From fcb267376b2ec1421a60c42e3440e3a493ef2118 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 13:56:43 -0400
Subject: [PATCH 07/20] add docker and singularity pulls

---
 get_refs.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 6051f509..2bd84d6d 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -3,7 +3,9 @@
 import argparse
 import os
 import pathlib
+import re
 import shutil
+import subprocess
 import sys
 import urllib.request
 
@@ -18,9 +20,19 @@
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
                     help = "tag for a specific workflow version (defaults to latest revision)")
-
+parser.add_argument("--docker",
+                    action = "store_true",
+                    help = "pull and cache images for docker")
+parser.add_argument("--singularity",
+                    action = "store_true",
+                    help = "pull and cache images for singularity")
+parser.add_argument("--singularity_cache", type=str,
+                    metavar = "CACHE_DIR",
+                    help = "cache directory for singularity"
+)
 args = parser.parse_args()
 
+# scpca-nf resource urls
 aws_root = "https://scpca-references.s3.amazonaws.com"
 containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
 
@@ -115,15 +127,14 @@
     file_url = f"{aws_root}/{path}"
     try:
         urllib.request.urlretrieve(file_url, outfile)
-    except urllib.error.URLError:
+    except urllib.error.URLError as e:
+        print(e.reason)
         print(f"The file download failed for {file_url}, please check the URL for errors",
               file = sys.stderr)
         exit(1)
 print("Done with reference file downloads\n"
       f"Reference files can be found at '{args.refdir}'\n")
-
-
 if args.paramfile:
     pfile = pathlib.Path(args.paramfile)
     # check if paramfile exists & move old if needed
@@ -140,3 +151,35 @@
         f.write("# local nextflow reference file parameters, generated by `get_refs.py`\n\n")
         for key, value in nf_params.items():
             f.write(f"{key}: {value}\n")
+
+if args.singularity or args.docker:
+    print("getting list of required containers")
+    containers = {}
+    try:
+        container_file = urllib.request.urlopen(containerfile_url)
+    except urllib.error.URLError as e:
+        print(e.reason)
+        print(f"The file download failed for {container_url}, please check the URL for errors")
+        print(f"Is `{args.revision}` a valid release tag?")
+        exit(1)
+
+    # pattern match to find container id & location
+    container_re = re.compile(r'(?P<id>.+_CONTAINER)\s*=\s*([\'"])(?P<loc>.+)\2')
+    for line in container_file:
+        match = container_re.search(line.decode())
+        if match:
+            containers[match.group('id')] = match.group('loc')
+
+# pull docker images
+if args.docker:
+    for loc in containers.values():
+        subprocess.run(["docker", "pull", loc])
+
+# pull singularity images (with cache location)
+if args.singularity:
+    if args.singularity_cache:
+        os.environ['SINGULARITY_CACHEDIR'] = args.singularity_cache
+    for loc in containers.values():
+        subprocess.run(
+            ["singularity", "pull", f"docker://{loc}"],
+            env = os.environ
+        )

From 4f9232f173258d77afff1ae6208475d4a52fecd1 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 14:01:52 -0400
Subject: [PATCH 08/20] default param file name

---
 get_refs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 2bd84d6d..43b70186 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -17,8 +17,8 @@
                     action = "store_true",
                     help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
-                    default="",
-                    help = "nextflow param file to write")
+                    default="local_refs.params",
+                    help = "nextflow param file to write (default: `local_refs.params`)")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
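The container discovery added in PATCH 07 relies on a regular expression that expects `containers.config` entries of the form `SOMETHING_CONTAINER = 'registry/image:tag'`. A self-contained check of that pattern is sketched below; the sample config line (tool name and tag) is made up for illustration and is not taken from the real containers.config:

    # Sketch only: exercises the same regex used in get_refs.py against a made-up config line.
    import re

    container_re = re.compile(r'(?P<id>.+_CONTAINER)\s*=\s*([\'"])(?P<loc>.+)\2')

    sample = "SALMON_CONTAINER = 'quay.io/biocontainers/salmon:1.9.0'"  # hypothetical entry
    match = container_re.search(sample)
    if match:
        # named groups capture the variable name and the quoted image location
        print(match.group("id"), "->", match.group("loc"))
        # SALMON_CONTAINER -> quay.io/biocontainers/salmon:1.9.0

The backreference `\2` requires the closing quote to match the opening one, so both single- and double-quoted entries are handled.
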
From 407db7db48d265d79c1ad785e66f3f8183f8df47 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 14:38:56 -0400
Subject: [PATCH 09/20] Add messages for containers

---
 get_refs.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 43b70186..a72afe73 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -152,7 +152,7 @@
             f.write(f"{key}: {value}\n")
 
 if args.singularity or args.docker:
-    print("getting list of required containers")
+    print("Getting list of required containers")
     containers = {}
     try:
         container_file = urllib.request.urlopen(containerfile_url)
@@ -171,15 +171,22 @@
 # pull docker images
 if args.docker:
+    print("Pulling docker images...")
     for loc in containers.values():
         subprocess.run(["docker", "pull", loc])
+    print("Done pulling docker images\n")
 
-# pull singularity images (with cache location)
+# pull singularity images (to optionally specified cache location)
 if args.singularity:
+    print("Pulling singularity images...")
     if args.singularity_cache:
-        os.environ['SINGULARITY_CACHEDIR'] = args.singularity_cache
+        os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(args.singularity_cache)
     for loc in containers.values():
         subprocess.run(
             ["singularity", "pull", f"docker://{loc}"],
             env = os.environ
         )
+    print("Done pulling singularity images")
+    if args.singularity_cache:
+        print(f"Singularity images located at {os.environ['SINGULARITY_CACHEDIR']}")
+    print()

From 83126004628c36262d9afa54b9e5f05be99c8b28 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 14:39:19 -0400
Subject: [PATCH 10/20] make star index optional

---
 get_refs.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/get_refs.py b/get_refs.py
index a72afe73..77c0f78a 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -23,6 +23,9 @@
                     default="main",
                     metavar = "vX.X.X",
                     help = "tag for a specific workflow version (defaults to latest revision)")
+parser.add_argument("--star_index",
+                    action = "store_true",
+                    help = "get STAR index (required for genetic demultiplexing)")
 parser.add_argument("--docker",
                     action = "store_true",
                     help = "pull and cache images for docker")
@@ -100,7 +103,8 @@
 ]
 
 star_dir = genome_dir/ "star_index/Homo_sapiens.GRCh38.104.star_idx"
-ref_paths += [star_dir / f for f in star_index_files]
+if args.star_index:
+    ref_paths += [star_dir / f for f in star_index_files]
 
 # get barcode file paths
 barcode_dir = pathlib.Path("barcodes/10X")

From 72d343ad9c875f4c00b40c4324929141f135ec11 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 15:09:29 -0400
Subject: [PATCH 11/20] Force singularity

---
 get_refs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/get_refs.py b/get_refs.py
index 77c0f78a..db67ff5d 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -187,7 +187,7 @@
         os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(args.singularity_cache)
     for loc in containers.values():
         subprocess.run(
-            ["singularity", "pull", f"docker://{loc}"],
+            ["singularity", "pull", "--force", f"docker://{loc}"],
             env = os.environ
         )
     print("Done pulling singularity images")

From aaab20ff9d01f951ea2c5b1c2f35c89db870ab43 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Wed, 26 Oct 2022 15:28:32 -0400
Subject: [PATCH 12/20] missing makeJson

---
 modules/spaceranger.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/spaceranger.nf b/modules/spaceranger.nf
index 253e0409..36c7fabc 100644
--- a/modules/spaceranger.nf
+++ b/modules/spaceranger.nf
@@ -16,6 +16,7 @@ process spaceranger{
   script:
     out_id = file(meta.spaceranger_results_dir).name
     meta.cellranger_index = index.fileName
+    meta_json = Utils.makeJson(meta)
     """
     spaceranger count \
       --id=${out_id} \
From f1d36bbd599a092a79e6180e1a3295b5c29254fc Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 27 Oct 2022 13:05:42 -0400
Subject: [PATCH 13/20] Add cell ranger option & download

---
 get_refs.py | 45 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index db67ff5d..7f188f8c 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -17,7 +17,7 @@
                     action = "store_true",
                     help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
-                    default="local_refs.params",
+                    default="local_refs.yaml",
                     help = "nextflow param file to write (default: `local_refs.params`)")
 parser.add_argument("--revision", type=str,
                     default="main",
@@ -26,6 +26,9 @@
 parser.add_argument("--star_index",
                     action = "store_true",
                     help = "get STAR index (required for genetic demultiplexing)")
+parser.add_argument("--cellranger_index",
+                    action = "store_true",
+                    help = "get Cell Ranger index (required for spatial data)")
 parser.add_argument("--docker",
                     action = "store_true",
                     help = "pull and cache images for docker")
@@ -34,8 +37,7 @@
                     help = "pull and cache images for singularity")
 parser.add_argument("--singularity_cache", type=str,
                     metavar = "CACHE_DIR",
-                    help = "cache directory for singularity"
-)
+                    help = "cache directory for singularity")
 args = parser.parse_args()
 
 # scpca-nf resource urls
@@ -53,7 +55,6 @@
     "annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv",
     "annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
 ]
-
 ref_paths = [genome_dir / sd for sd in ref_subdirs]
 
 # salmon index files
@@ -102,11 +103,37 @@
     "transcriptInfo.tab"
 ]
 
-star_dir = genome_dir/ "star_index/Homo_sapiens.GRCh38.104.star_idx"
+star_dir = genome_dir / "star_index/Homo_sapiens.GRCh38.104.star_idx"
 if args.star_index:
     ref_paths += [star_dir / f for f in star_index_files]
 
-# get barcode file paths
+# Cell Ranger index files
+cr_index_files = [
+    "reference.json",
+    "fasta/genome.fa",
+    "fasta/genome.fa.fai",
+    "genes/genes.gtf.gz",
+    "star/chrLength.txt",
+    "star/chrName.txt",
+    "star/chrNameLength.txt",
+    "star/chrStart.txt",
+    "star/exonGeTrInfo.tab",
+    "star/exonInfo.tab",
+    "star/geneInfo.tab",
+    "star/Genome",
+    "star/genomeParameters.txt",
+    "star/SA",
+    "star/SAindex",
+    "star/sjdbInfo.txt",
+    "star/sjdbList.fromGTF.out.tab",
+    "star/sjdbList.out.tab",
+    "star/transcriptInfo.tab"
+]
+cr_dir = genome_dir / "cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
+if args.cellranger_index:
+    ref_paths += [cr_dir / f for f in cr_index_files]
+
+# barcode file paths
 barcode_dir = pathlib.Path("barcodes/10X")
 barcode_files = [
     "3M-february-2018.txt",
@@ -118,7 +145,7 @@
 
 ref_paths += [barcode_dir / f for f in barcode_files]
 
-# download all the files and put them in the correct locations
+## download all the files and put them in the correct locations ##
 print("Downloading reference files...")
 for path in ref_paths[0:2]:
     outfile = args.refdir / path

From 1a1a2af1b770ca1a1d6dbfad4ef65f0fde312527 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Thu, 27 Oct 2022 13:23:06 -0400
Subject: [PATCH 14/20] remove bash script

---
 get_refs.sh | 101 ----------------------------------------------------
 1 file changed, 101 deletions(-)
 delete mode 100755 get_refs.sh

diff --git a/get_refs.sh b/get_refs.sh
deleted file mode 100755
index 41210840..00000000
--- a/get_refs.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-dest_dir=${1:-"scpca-references"}
-
-
-containerfile_url="https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config"
-aws_root="https://scpca-references.s3.amazonaws.com"
-ref_dir="homo_sapiens/ensembl-104"
-ref_paths=(
-  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
-  "${ref_dir}/fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv"
-  "${ref_dir}/annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
-)
-
-salmon_index_files=(
-  "complete_ref_lens.bin"
-  "ctable.bin"
-  "ctg_offsets.bin"
-  "duplicate_clusters.tsv"
-  "info.json"
-  "mphf.bin"
-  "pos.bin"
-  "pre_indexing.log"
-  "rank.bin"
-  "ref_indexing.log"
-  "refAccumLengths.bin"
-  "reflengths.bin"
-  "refseq.bin"
-  "seq.bin"
-  "versionInfo.json"
-)
-
-salmon_index_dirs=(
-  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome"
-  "${ref_dir}/salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
-)
-for dir in ${salmon_index_dirs[@]}
-do
-  for file in ${salmon_index_files[@]}
-  do
-    ref_paths+=("${dir}/${file}")
-  done
-done
-
-
-star_index_files=(
-  "chrLength.txt"
-  "chrName.txt"
-  "chrNameLength.txt"
-  "chrStart.txt"
-  "exonGeTrInfo.tab"
-  "exonInfo.tab"
-  "geneInfo.tab"
-  "Genome"
-  "genomeParameters.txt"
-  "Log.out"
-  "SA"
-  "SAindex"
-  "sjdbInfo.txt"
-  "sjdbList.fromGTF.out.tab"
-  "sjdbList.out.tab"
-  "transcriptInfo.tab"
-)
-
-star_dir="${ref_dir}/star_index/Homo_sapiens.GRCh38.104.star_idx"
-for file in ${star_index_files[@]}
-do
-  ref_paths+=("${star_dir}/${file}")
-done
-
-
-barcode_files=(
-  "3M-february-2018.txt"
-  "737K-august-2016.txt"
-  "cellranger_mit_license.txt"
-  "visium-v1.txt"
-  "visium-v1.txt"
-)
-
-barcode_dir="barcodes/10X"
-for file in ${barcode_files[@]}
-do
-  ref_paths+=("${barcode_dir}/${file}")
-done
-
-for path in ${ref_paths[@]}
-do
-  echo "Getting $path"
-  curl -s --create-dirs "$aws_root/$path" -o "$dest_dir/$path"
-done
-
-
-containers=`curl -s https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/main/config/containers.config \
-  | grep CONTAINER \
-  | cut -d"'" -f 2 \
-  | grep -v "^$" `

From c910a14a7bcfe816d81e2bbeb50cd8015ca5f176 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Fri, 28 Oct 2022 14:18:27 -0400
Subject: [PATCH 15/20] parse reference file locations from repo

---
 get_refs.py | 108 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 79 insertions(+), 29 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 7f188f8c..ff850ffc 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -2,13 +2,14 @@
 
 import argparse
 import os
-import pathlib
 import re
 import shutil
 import subprocess
 import sys
 import urllib.request
 
+from pathlib import Path
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--refdir", type=str,
                     default="scpca-references",
@@ -16,9 +17,9 @@
 parser.add_argument("--replace",
                     action = "store_true",
                     help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
-                    default="local_refs.yaml",
-                    help = "nextflow param file to write (default: `local_refs.params`)")
+                    default="localref_params.yaml",
+                    help = "nextflow param file to write (default: `localref_params.yaml`)")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
@@ -41,19 +42,63 @@
 args = parser.parse_args()
 
 # scpca-nf resource urls
-aws_root = "https://scpca-references.s3.amazonaws.com"
+reffile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/reference_paths.config"
 containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"
 
-# genome reference files
-assembly = "Homo_sapiens.GRCh38.104"
-genome_dir = pathlib.Path("homo_sapiens/ensembl-104")
-ref_subdirs =[
-    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
-    "fasta/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai",
-    "annotation/Homo_sapiens.GRCh38.104.gtf.gz",
-    "annotation/Homo_sapiens.GRCh38.104.mitogenes.txt",
-    "annotation/Homo_sapiens.GRCh38.104.spliced_intron.tx2gene_3col.tsv",
-    "annotation/Homo_sapiens.GRCh38.104.spliced_cdna.tx2gene.tsv"
+# download reference file
+print("Getting list of required reference files")
+refs = {}
+try:
+    ref_file = urllib.request.urlopen(reffile_url)
+except urllib.error.URLError as e:
+    print(e.reason)
+    print(f"The file download failed for {reffile_url}, please check the URL for errors")
+    print(f"Is `{args.revision}` a valid release tag?")
+    exit(1)
+
+# parse reference file
+ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
+for line in ref_file:
+    match = ref_re.search(line.decode())
+    if match:
+        refs[match.group('id')] = match.group('loc')
+
+# regular expressions for parameter expansion
+root_re = re.compile(r'\$\{?(params.)?ref_rootdir\}?$')
+refdir_re = re.compile(r'\$\{?(params.)?ref_dir\}?$')
+
+# get assembly and root location
+assembly = refs.get("assembly", "NA")
+root_parts = refs.get("ref_rootdir").split('://')
+if root_parts[0] == 's3':
+    url_root = f"https://{root_parts[1]}.s3.amazonaws.com"
+elif root_parts[0] in ['http', 'https', 'ftp']:
+    url_root = refs.get("ref_rootdir")
+else:
+    print("The `ref_rootdir` is not a supported remote location.")
+    exit(1)
+
+
+
+# set the base directory (usually corresponding to a genome version)
+genome_dir = Path(refs.get("ref_dir"))
+# remove the first element if it is a variable
+if root_re.match(genome_dir.parts[0]):
+    genome_dir = genome_dir.relative_to(genome_dir.parts[0])
+
+# single-file references
+ref_keys =[
+    "ref_fasta",
+    "ref_fasta_index",
+    "ref_gtf",
+    "mito_file",
+    "t2g_3col_path",
+    "t2g_bulk_path"
 ]
-ref_paths = [genome_dir / sd for sd in ref_subdirs]
+ref_paths = [Path(refs.get(k)) for k in ref_keys]
+# replace initial part of path if it is `$params.ref_dir` or similar
+ref_paths = [genome_dir / p.relative_to(p.parts[0])
+             if refdir_re.match(p.parts[0]) else p
+             for p in ref_paths]
 
 # salmon index files
 salmon_index_files = [
@@ -75,12 +120,14 @@
     "seq.bin",
     "versionInfo.json"
 ]
-
-salmon_index_dirs = [
-    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_intron.txome",
-    genome_dir / "salmon_index/Homo_sapiens.GRCh38.104.spliced_cdna.txome"
+salmon_keys = [
+    "splici_index",
+    "bulk_index"
 ]
-for sa_dir in salmon_index_dirs:
+for k in salmon_keys:
+    sa_dir = Path(refs.get(k))
+    if refdir_re.match(sa_dir.parts[0]):
+        sa_dir = genome_dir / sa_dir.relative_to(sa_dir.parts[0])
     ref_paths += [sa_dir / f for f in salmon_index_files]
 
 # star index files
@@ -102,8 +149,9 @@
     "sjdbList.out.tab",
     "transcriptInfo.tab"
 ]
-
-star_dir = genome_dir / "star_index/Homo_sapiens.GRCh38.104.star_idx"
+star_dir = Path(refs.get("star_index"))
+if refdir_re.match(star_dir.parts[0]):
+    star_dir = genome_dir / star_dir.relative_to(star_dir.parts[0])
 if args.star_index:
     ref_paths += [star_dir / f for f in star_index_files]
 
@@ -128,12 +176,13 @@
     "star/sjdbList.out.tab",
     "star/transcriptInfo.tab"
 ]
-cr_dir = genome_dir / "cellranger_index/Homo_sapiens.GRCh38.104_cellranger_full"
+cr_dir = Path(refs.get("cellranger_index"))
+if refdir_re.match(cr_dir.parts[0]):
+    cr_dir = genome_dir / cr_dir.relative_to(cr_dir.parts[0])
 if args.cellranger_index:
     ref_paths += [cr_dir / f for f in cr_index_files]
 
-# barcode file paths
-barcode_dir = pathlib.Path("barcodes/10X")
+# barcode files
 barcode_files = [
     "3M-february-2018.txt",
     "737K-august-2016.txt",
@@ -141,12 +190,14 @@
     "visium-v1.txt",
     "visium-v2.txt"
 ]
-
+barcode_dir = Path(refs.get("barcode_dir"))
+if root_re.match(barcode_dir.parts[0]):
+    barcode_dir = barcode_dir.relative_to(barcode_dir.parts[0])
 ref_paths += [barcode_dir / f for f in barcode_files]
 
 ## download all the files and put them in the correct locations ##
 print("Downloading reference files...")
-for path in ref_paths[0:2]:
+for path in ref_paths:
     outfile = args.refdir / path
     if outfile.exists() and not args.replace:
         continue
@@ -155,7 +205,7 @@
     # make parents
     outfile.parent.mkdir(exist_ok=True, parents = True)
     # download and write
-    file_url = f"{aws_root}/{path}"
+    file_url = f"{url_root}/{path}"
     try:
         urllib.request.urlretrieve(file_url, outfile)
     except urllib.error.URLError as e:
@@ -168,7 +218,7 @@
 print("Done with reference file downloads\n"
       f"Reference files can be found at '{args.refdir}'\n")
 
+# write param file if requested
 if args.paramfile:
-    pfile = pathlib.Path(args.paramfile)
+    pfile = Path(args.paramfile)
     # check if paramfile exists & move old if needed
     if pfile.exists():
         print(f"A file already exists at `{pfile}`, renaming previous file to `{pfile.name}.bak`")
From 293fded3e23ed3a88f5730632c265e21b755ace8 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:25:31 -0400
Subject: [PATCH 16/20] Improve root URI parsing

---
 get_refs.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index ff850ffc..2af1252f 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -69,13 +69,19 @@
 # get assembly and root location
 assembly = refs.get("assembly", "NA")
-root_parts = refs.get("ref_rootdir").split('://')
+# split out protocol from the root URI
+root_parts = refs.get("ref_rootdir").split('://', maxsplit = 1)
 if root_parts[0] == 's3':
-    url_root = f"https://{root_parts[1]}.s3.amazonaws.com"
+    # if S3, convert bucket path to https:// url
+    bucket_path = root_parts[1].split("/", maxsplit = 1)
+    url_root = f"https://{bucket_path[0]}.s3.amazonaws.com"
+    if len(bucket_path) > 1:
+        url_root += f"/{bucket_path[1]}"
 elif root_parts[0] in ['http', 'https', 'ftp']:
+    # otherwise, just get the location
     url_root = refs.get("ref_rootdir")
 else:
-    print("The `ref_rootdir` is not a supported remote location.")
+    print("`ref_rootdir` is not a supported remote location.")
     exit(1)
 
 

From f2d71dce8dd62b81142e60e56ede61557f431fe5 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:26:13 -0400
Subject: [PATCH 17/20] Updates from review (semicolons, arg names)

---
 get_refs.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index 2af1252f..5c9b38c2 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -14,12 +14,12 @@
 parser.add_argument("--refdir", type=str,
                     default="scpca-references",
                     help = "destination directory for downloaded reference files")
-parser.add_argument("--replace",
-                    action = "store_true",
-                    help = "replace previously downloaded files")
 parser.add_argument("--paramfile", type=str,
                     default="localref_params.yaml",
-                    help = "nextflow param file to write (default: `localref_params.yaml`)")
+                    help = "path to nextflow param file to write (default: `localref_params.yaml`)")
+parser.add_argument("--overwrite_refs",
+                    action = "store_true",
+                    help = "replace previously downloaded files")
 parser.add_argument("--revision", type=str,
                     default="main",
                     metavar = "vX.X.X",
@@ -51,7 +51,7 @@
     ref_file = urllib.request.urlopen(reffile_url)
 except urllib.error.URLError as e:
     print(e.reason)
-    print(f"The file download failed for {reffile_url}, please check the URL for errors")
+    print(f"The file download failed for {reffile_url}; please check the URL for errors")
     print(f"Is `{args.revision}` a valid release tag?")
     exit(1)
 
@@ -202,10 +202,10 @@
 ref_paths += [barcode_dir / f for f in barcode_files]
 
 ## download all the files and put them in the correct locations ##
-print("Downloading reference files...")
+print("Downloading reference files... (This might take a while)")
 for path in ref_paths:
     outfile = args.refdir / path
-    if outfile.exists() and not args.replace:
+    if outfile.exists() and not args.overwrite_refs:
         continue
     print(f"Getting {path}")
     # make parents
@@ -216,7 +216,7 @@
         urllib.request.urlretrieve(file_url, outfile)
     except urllib.error.URLError as e:
         print(e.reason)
-        print(f"The file download failed for {file_url}, please check the URL for errors",
+        print(f"The file download failed for {file_url}; please check the URL for errors",
               file = sys.stderr)
         exit(1)
 print("Done with reference file downloads\n"
@@ -227,7 +227,7 @@
     pfile = Path(args.paramfile)
     # check if paramfile exists & move old if needed
     if pfile.exists():
-        print(f"A file already exists at `{pfile}`, renaming previous file to `{pfile.name}.bak`")
+        print(f"A file already exists at `{pfile}`; renaming previous file to `{pfile.name}.bak`")
         shutil.move(pfile, str(pfile) + ".bak")
     # create parameter dictionary
     nf_params = {
@@ -247,7 +247,7 @@
         container_file = urllib.request.urlopen(containerfile_url)
     except urllib.error.URLError as e:
         print(e.reason)
-        print(f"The file download failed for {container_url}, please check the URL for errors")
+        print(f"The file download failed for {container_url}; please check the URL for errors")
         print(f"Is `{args.revision}` a valid release tag?")
         exit(1)
 
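PATCH 15 strips a leading `${params.ref_dir}`-style variable from each configured path, and PATCH 16 converts an `s3://bucket[/prefix]` root URI into a plain HTTPS URL for download. The two steps are condensed into one self-contained sketch below; the helper function names are illustrative, the mitogenes path is taken from the config above, and the bucket/prefix case is only an assumed example:

    # Sketch only: mirrors the refdir-stripping and s3-to-https logic used in get_refs.py.
    import re
    from pathlib import Path

    refdir_re = re.compile(r'\$\{?(params.)?ref_dir\}?$')
    genome_dir = Path("homo_sapiens/ensembl-104")

    def strip_refdir(raw: str) -> Path:
        """Replace a leading ${params.ref_dir} component with the local genome directory."""
        p = Path(raw)
        if refdir_re.match(p.parts[0]):
            p = genome_dir / p.relative_to(p.parts[0])
        return p

    def root_to_url(ref_rootdir: str) -> str:
        """Convert an s3:// root into an https URL; pass through http(s)/ftp roots."""
        scheme, _, rest = ref_rootdir.partition("://")
        if scheme == "s3":
            bucket, _, prefix = rest.partition("/")
            url = f"https://{bucket}.s3.amazonaws.com"
            return f"{url}/{prefix}" if prefix else url
        if scheme in ("http", "https", "ftp"):
            return ref_rootdir
        raise ValueError("`ref_rootdir` is not a supported remote location.")

    print(strip_refdir("${params.ref_dir}/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt"))
    # homo_sapiens/ensembl-104/annotation/Homo_sapiens.GRCh38.104.mitogenes.txt
    print(root_to_url("s3://scpca-references"))
    # https://scpca-references.s3.amazonaws.com
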
From 9ba2bcb14538ddaf1f44dfb463741da568b1419e Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:28:45 -0400
Subject: [PATCH 18/20] move refs intialization

---
 get_refs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/get_refs.py b/get_refs.py
index 5c9b38c2..a23d1f3d 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -47,7 +47,6 @@
 # download reference file
 print("Getting list of required reference files")
-refs = {}
 try:
     ref_file = urllib.request.urlopen(reffile_url)
 except urllib.error.URLError as e:
     print(e.reason)
@@ -56,6 +55,7 @@
     exit(1)
 
 # parse reference file
+refs = {}
 ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
 for line in ref_file:
     match = ref_re.search(line.decode())
     if match:
From 154b1b6dcf6b737516ee00e3d60d9b846c4de001 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Mon, 31 Oct 2022 13:47:15 -0400
Subject: [PATCH 19/20] a few more comments

---
 get_refs.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/get_refs.py b/get_refs.py
index a23d1f3d..b89da388 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -56,6 +56,8 @@
     exit(1)
 
 # parse reference file
+# gets all of the `param` variables that are set in `reference_paths.config`
+# and stores then in a dict
 refs = {}
 ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
 for line in ref_file:
@@ -93,6 +95,7 @@
     genome_dir = genome_dir.relative_to(genome_dir.parts[0])
 
 # single-file references
+# the keys here are the param variables we will be downloading
 ref_keys =[
     "ref_fasta",
     "ref_fasta_index",
@@ -107,7 +110,7 @@
     if refdir_re.match(p.parts[0]) else p
     for p in ref_paths]
 
-# salmon index files
+# salmon index files within index dir (must be downloaded individually through http)
 salmon_index_files = [
     "complete_ref_lens.bin",
     "ctable.bin",
@@ -125,6 +128,7 @@
     "seq.bin",
     "versionInfo.json"
 ]
+# param variables that are salmon index directories
 salmon_keys = [
     "splici_index",
     "bulk_index"
@@ -135,7 +139,7 @@
         sa_dir = genome_dir / sa_dir.relative_to(sa_dir.parts[0])
     ref_paths += [sa_dir / f for f in salmon_index_files]
 
-# star index files
+# star index files within index dir (must be downloaded individually through http)
 star_index_files = [
     "chrLength.txt",
     "chrName.txt",
@@ -160,7 +164,7 @@
 if args.star_index:
     ref_paths += [star_dir / f for f in star_index_files]
 
-# Cell Ranger index files
+# Cell Ranger index files within index dir (must be downloaded individually through http)
 cr_index_files = [
     "reference.json",
     "fasta/genome.fa",
@@ -188,7 +192,7 @@
 if args.cellranger_index:
     ref_paths += [cr_dir / f for f in cr_index_files]
 
-# barcode files
+# barcode files on S3 within the barcode_dir (must be downloaded individually through http)
 barcode_files = [
     "3M-february-2018.txt",
     "737K-august-2016.txt",
@@ -258,14 +262,14 @@
         if match:
             containers[match.group('id')] = match.group('loc')
 
-# pull docker images
+# pull docker images if requested
 if args.docker:
     print("Pulling docker images...")
     for loc in containers.values():
         subprocess.run(["docker", "pull", loc])
     print("Done pulling docker images\n")
 
-# pull singularity images (to optionally specified cache location)
+# pull singularity images if requested (to optionally specified cache location)
 if args.singularity:
     print("Pulling singularity images...")
     if args.singularity_cache:
From d64f0753024d67146cc703d7518f253a0da7bef8 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro
Date: Tue, 1 Nov 2022 09:29:22 -0400
Subject: [PATCH 20/20] Add header comments

---
 get_refs.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/get_refs.py b/get_refs.py
index b89da388..d66804f5 100755
--- a/get_refs.py
+++ b/get_refs.py
@@ -1,5 +1,13 @@
 #!/usr/bin/env python3
 
+# Download reference files for the scpca-nf nextflow workflow to enable running
+# the workflow without internet access by compute nodes. Optionally pulls
+# container images for singularity or docker.
+#
+# Example usage:
+# python3 get_refs.py --singularity
+
+
 import argparse
 import os
 import re
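With the header comments in place, the script is meant to be run on a machine with internet access before launching the workflow offline. Since the docker and singularity pulls in the script shell out to those tools, one optional pre-flight check is to verify the executables exist before attempting any pulls. The sketch below is not part of get_refs.py; it is only an assumed add-on showing how such a check could look:

    # Sketch only: a hypothetical pre-flight check before calling get_refs.py with
    # --docker or --singularity; not part of the workflow itself.
    import shutil
    import sys

    def require_tool(name: str) -> None:
        """Exit with a message if the named executable is not on PATH."""
        if shutil.which(name) is None:
            print(f"`{name}` was not found on PATH; install it or drop the corresponding flag.",
                  file=sys.stderr)
            sys.exit(1)

    # e.g. before running `python3 get_refs.py --singularity`:
    # require_tool("singularity")
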