-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3-copy_fastqs
111 lines (92 loc) · 4.03 KB
/
3-copy_fastqs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# This script initializes the filesystem of this project for one GEM well (--gem_id):
# It creates a "jobs" folder (if missing) which contains one subdirectory per gem_id processed
# For the requested gem_id directory, the following files/folders are expected:
# 1. fastq: dir with the symlinks pointing to the fastq files (created here)
# 2. log: dir which contains standard error and output of cellranger (created here, left empty)
# 3. (sample_id).cmd: job script to compute the features-barcode matrix using cellranger
#    (NOTE: no .cmd file is written by the code in this file)
# Import required packages
import numpy as np
import pandas as pd
import os
import argparse
import subprocess
import re
import sys
import config_vars as cfg
from utils import *
# Command-line interface.
# Every value option is stored as a string under the attribute named after its
# flag and is None when the flag is not given; --verbose is a boolean switch
# that is False unless present.
parser = argparse.ArgumentParser(
    description="options to initialize the filesystem and scripts of this project"
)
parser.add_argument(
    "--subproject",
    help="Subproject we are working on (i.e. BCLLATLAS_10)",
)
parser.add_argument(
    "--gem_id",
    help="Gel Beads in Emulsion id",
)
parser.add_argument(
    "--verbose",
    action="store_true",
    help="Print log in standard error",
)
parser.add_argument(
    "--metadata",
    help="Metadata csv file for the tonsil atlas project",
)
parser.add_argument(
    "--fastq_paths",
    help="File that contains the paths of the fastqs for the subproject libraries",
)
def create_fastq_symlink_nh(gem_id, fastq_path_df, symlink_path):
    """Create symbolic links to the fastq files of one GEM well.

    Each distinct ``pair_id`` is mapped to a lane number (1, 2, ...) following
    the sorted order of the pair ids, and every fastq of that pair is linked as
    ``<symlink_path>/<gem_id>_S1_L<lane>_<read>.fastq.gz`` where ``<lane>`` is
    zero-padded to three digits and ``<read>`` is the value of the ``read``
    column with every "R" removed (e.g. "R1" -> "1").

    Args:
      gem_id: identifier of the Gelbeads-in-Emulsion (GEM) well that will be
        used as prefix in the symlink
      fastq_path_df: pandas dataframe with the fastq paths for that gem_id; the
        columns "pair_id", "read" and "fastq_path" are read
      symlink_path: string specifying where to create the symlinks

    Returns:
      None
    """
    # np.unique returns the ids sorted, which makes the lane assignment
    # deterministic regardless of the row order of the dataframe.
    pair_ids = np.unique(fastq_path_df["pair_id"])
    for lane, pair_id in enumerate(pair_ids, start=1):
        pair_df = fastq_path_df.loc[fastq_path_df["pair_id"] == pair_id, :]
        for row in pair_df.index:
            fastq_path = pair_df.loc[row, "fastq_path"]
            read = pair_df.loc[row, "read"].replace("R", "")
            # Zero-pad the lane to three digits so that lanes >= 10 yield
            # "L010" rather than the malformed "L0010".
            link_name = "{}/{}_S1_L{:03d}_{}.fastq.gz".format(
                symlink_path, gem_id, lane, read
            )
            # Best effort, as before: a failing `ln` (e.g. link already
            # exists) reports on stderr but does not abort the run.
            subprocess.run(["ln", "-s", fastq_path, link_name])
# Parse the command line and fail early, with a readable message, when one of
# the options the script cannot work without is missing.
options = parser.parse_args()
subproject = options.subproject
gem_id = options.gem_id
metadata_path = options.metadata
fastq_paths = options.fastq_paths
missing_options = [
    flag
    for flag, value in [
        ("--subproject", subproject),
        ("--gem_id", gem_id),
        ("--metadata", metadata_path),
        ("--fastq_paths", fastq_paths),
    ]
    if value is None
]
if missing_options:
    parser.error("missing required option(s): {}".format(", ".join(missing_options)))

# Read data
project_dir = "/home/groups/singlecell/mabdalfttah/projects/{}".format(subproject)
fastq_path_df = pd.read_csv(fastq_paths, sep = "\t", header = 0)
metadata_df = pd.read_csv(metadata_path, sep = ",", header = 0)
if options.verbose:
    sys.stderr.write("Files read successfully!\n")

# Keep only the metadata rows of the requested GEM well; stop before touching
# the filesystem if the gem_id is unknown.
filt = (metadata_df["gem_id"] == gem_id)
metadata_df = metadata_df.loc[filt]
if metadata_df.empty:
    sys.exit("gem_id '{}' not found in metadata file {}".format(gem_id, metadata_path))

# Create directories: <project_dir>/jobs/<gem_id>/{fastq,log}
# The project directory itself must already exist, so that a mistyped
# --subproject is reported instead of silently creating a new tree.
if not os.path.isdir(project_dir):
    sys.exit("Project directory does not exist: {}".format(project_dir))
subproject_dir = "{}/jobs/{}".format(project_dir, gem_id)
fastq_dir = "{}/fastq".format(subproject_dir)
log_dir = "{}/log".format(subproject_dir)
for direct in [fastq_dir, log_dir]:
    # makedirs also creates the intermediate "jobs" and gem_id directories
    # and is a no-op when they are already there.
    os.makedirs(direct, exist_ok = True)

# Define variables and subset dataframes
# metadata_df is already restricted to this gem_id, so the column is read directly.
library_id = metadata_df["library_id"]
fastq_sub_df = fastq_path_df.loc[fastq_path_df["library_id"].isin(library_id), :]
# Value of the "type" column for the first row of this gem_id
# (named library_type to avoid shadowing the builtin `type`).
library_type = metadata_df["type"].values[0]

# Create symlinks to fastq files
create_fastq_symlink_nh(gem_id, fastq_sub_df, fastq_dir)