-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbam2bedgraph.sh
executable file
·326 lines (246 loc) · 11.2 KB
/
bam2bedgraph.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#!/bin/bash
#/ Convert BAM files to bedGraphs
set -e -o pipefail
LC_ALL=C
export VERSION=1.1
#/ Help section:
usage(){
echo "
## Bam2Bigwig.sh Version: ${VERSION}
=> Options:
------------------------------------------------------------------------------------------------------------------
-h | --help : show this message {}
-b | --bams : space-delimited string of input bam files in double quotes or *.bam {}
-m | --mode : <single> or <paired>, if paired will use -pc option in bedtools genomecov {}
-a | --atacseq : ATAC-seq mode, will use cutsites rather than reads {FALSE}
-e | --extend : numeric value to extend reads to fragment size, see details {0}
-n | --normalize : if set then normalizes data based on the scaling_factors.txt file {FALSE}
-j | --njobs : number of parallel (GNU parallel) jobs to create bedGraphs, see details. {1}
-q | --sortthreads : number of threads for sorting the bedGraph {1}
-w | --sortmem : memory for sorting the bedGraph {1G}
-t | --nobigwig : if set then do not output bigwig files along with the bedGraphs {FALSE}
-g | --average : if set then average samples of the same group, see details {FALSE}
------------------------------------------------------------------------------------------------------------------
Details: * Option --extend is simply -fs in bedtools genomecov. If --atacseq is set then this can be
used to smooth the coverage of the cutting sites for visualization. It will then use
bedtools slop -b to extend the cutting sites to both directions, so if a 100bp window is
desired for smoothing one should set it to 50 to have 50bp extension to each direction.
* Option --normalize will make use of the scaling_factors.txt file which must be present in the same
directory. This file must have $1 with the exact file names (of the bam files) and $2 a scaling factor,
e.g. from DESeq2 which will be used to divide the raw counts (so 4th column of bedGraph file) by.
* Option --njobs is -j of GNU parallel. One job requires up to 5 threads depending on whether --atacseq is
set or not plus the --sortthreads which are the threads for sorting the bedGraphs so be sure to set this
rather higher than lower. Memory requirement is basically what the sort (--sortmem) needs, everything +
else is mostly simple file/text parsing operations.
* By default all all .bedGraph files in the directory will be also written to bigwig. Set --nobigwig to
turn that off. As this is a simple script that does not know whether a bedGraph that is in the directory
comes from this script or just happens to be there from any other process it is recommended to make a
separate directory for every run of this script and simply symlink the bam files into it to have a clean
environment.
* Towards --average: If samples are named as group_rep1*, _rep2*, rep3* etc so using '_rep' as first
delimiter after the groupwise basename and --average is set then all bedGraph files of that group
will be averaged. If no "_rep" is found then this is deactivated regardless whether it is set or not.
"
}; if [[ -z "$1" ]] || [[ $1 == -h ]] || [[ $1 == --help ]]; then usage; exit; fi
#------------------------------------------------------------------------------------------------------------------------
#/ Set args:
for arg in "$@"; do
shift
case "$arg" in
"--bams") set -- "$@" "-b" ;;
"--mode") set -- "$@" "-m" ;;
"--atacseq") set -- "$@" "-a" ;;
"--extend") set -- "$@" "-e" ;;
"--normalize") set -- "$@" "-n" ;;
"--njobs") set -- "$@" "-j" ;;
"--sortthreads") set -- "$@" "-q" ;;
"--sortmem") set -- "$@" "-w" ;;
"--nobigwig") set -- "$@" "-t" ;;
"--average") set -- "$@" "-g" ;;
*) set -- "$@" "$arg" ;;
esac
done
#/ Set defaults and check sanity:
bams=""
mode=""
atacseq="FALSE"
extend="0"
normalize="FALSE"
njobs="1"
sortthreads="1"
sortmem="1G"
nobigwig="FALSE"
average="FALSE"
echo ''
#/ getopts and export:
while getopts b:m:e:j:q:w:antg OPT
do
case ${OPT} in
b) bams="${OPTARG}" ;;
m) mode="${OPTARG}" ;;
a) atacseq="TRUE" ;;
e) extend="${OPTARG}" ;;
n) normalize="TRUE" ;;
j) njobs="${OPTARG}" ;;
q) sortthreads="${OPTARG}" ;;
w) sortmem="${OPTARG}" ;;
t) nobigwig="TRUE" ;;
g) average="TRUE" ;;
esac
done
if [[ "${bams}" == "" ]]; then
echo 'Either --bam or --bigwig must be specified'
exit 1
fi
if [[ "${mode}" != "single" ]] && [[ "${mode}" != "paired" ]]; then
if [[ "${atacseq}" == "FALSE" ]]; then
echo 'Argument --mode must be specified and must be <single> or <paired>'
exit 1
fi
fi
if [[ "${atacseq}" == "TRUE" ]]; then
if [[ "${mode}" == "paired" ]]; then
echo '[Info] --atacseq is set, setting --mode to single to use cutsites rather than reads'
mode='single'
fi
fi
OPTS=(bams mode extend njobs atacseq \
normalize sortthreads sortmem customsuffix)
for i in ${OPTS[*]}; do export $i; done
#/ Print summary:
echo ''
echo '---------------------------------------------------------------------------------------------'
echo '[Version] ::: '"${VERSION}"
echo ''
echo '[Info] Running with these parameters:'
echo ' --bams = '"${bams}"
echo ' --mode = '"${mode}"
echo ' --extend = '"${extend}"
echo ' --njobs = '"${njobs}"
echo ' --atacseq = '"${atacseq}"
echo ' --normalize = '"${normalize}"
echo ' --sortthreads = '"${sortthreads}"
echo ' --sortmem = '"${sortmem}"
echo ' --nobigwig = '"${nobigwig}"
echo ' --average = '"${average}"
echo '---------------------------------------------------------------------------------------------'
echo ''
#------------------------------------------------------------------------------------------------------------------------
#/ Function that checks if required tools are callable:
function PathCheck {
if [[ $(command -v $1 | wc -l | xargs) == 0 ]]; then
echo ${1} >> missing_tools.txt
fi
}; export -f PathCheck
Tools=(bedtools mawk parallel samtools)
#/ if that argument is set check for tools, then exit:
if [[ -e missing_tools.txt ]]; then rm missing_tools.txt; fi
for i in $(echo ${Tools[*]}); do PathCheck $i; done
if [[ -e missing_tools.txt ]]; then
echo '[Error] Tools missing in PATH, see missing_tools.txt'
exit 1
fi
#/ If --normalize then scaling_factors.txt must be present:
if [[ "${normalize}" == "TRUE" ]]; then
if [[ ! -e scaling_factors.txt ]]; then
echo 'Argument --normalize is set but no file <scaling_factors.txt> can be found!'
exit 1
fi
fi
#------------------------------------------------------------------------------------------------------------------------
#/ Function to produce BigWig from Bam:
function Bam2Bw {
#/ Input is one bam file per sample:
singlebam="${1}"
Basename=${singlebam%.bam}
#/ Get chromsizes based on idxstats
samtools view -H "${singlebam}" | grep 'SN:' | cut -f2,3 | awk '{gsub("SN:|LN:", "");print}' > "${singlebam}".chromsize
#---------------------------------------------
#/ Normalize by using the scaling factors from scaling_factors.txt
if [[ "${normalize}" == "TRUE" ]]; then
#/ take the reciprocal since the scale factor is intended for division but genomecov multiplies:
SF=$(bc <<< "scale=10; $(grep -w ${singlebam} scaling_factors.txt | cut -f2)^-1")
if [[ "${SF}" == "" ]]; then
echo "[Error]" "${singlebam}" "not found in scaling_factors.txt"
exit 1
fi
else SF=1
fi
#---------------------------------------------
#/ function to get cutsites in case of --atacseq
getCutsite() {
if (( $(head -n1 $1 | awk -F'\t' 'NR==1{print NF}') < 6 ));
then
echo '[Error]' $1 "has fewer than six columns!"
exit 1
fi
mawk 'OFS="\t" {if ($6 == "+") print $1,$2+4,$2+5,".",".",$6}
{if ($6 == "-") print $1,$3-5,$3-4,".",".",$6}' $1
}
#---------------------------------------------
#/ chain the commands into a generic call:
if [[ "${atacseq}" == "TRUE" ]]; then
do_bamtobed='bedtools bamtobed -i "${singlebam}" | getCutsite /dev/stdin |'
if [[ "${extend}" != 0 ]]; then
do_slop='bedtools slop -g "${singlebam}".chromsize -b "${extend}" -i - |'
suffix='_extended'
else do_slop=''; suffix=''
fi
input='-i -'
csize='-g "${singlebam}".chromsize'
is_paired=''
else
do_bamtobed=''
do_slop=''
input='-ibam "${singlebam}"'
csize=''
if [[ "${extend}" != 0 ]]; then
do_extend='-fs "${extend}"'
suffix='_extended'
else
do_extend=''
suffix=''
fi
if [[ "${mode}" == "paired" ]]; then is_paired='-pc'; else is_paired=''; fi
fi
do_scale='-scale "${SF}"'
#/ put it together:
eval \
"${do_bamtobed}" \
"${do_slop}" \
bedtools genomecov -bga "${input}" "${csize}" "${do_extend}" "${is_paired}" "${do_scale}" \
| sort -k1,1 -k2,2n --parallel="${sortthreads}" -S "${sortmem}" > "${Basename}""${suffix}".bedGraph
export sfx=123
}; export -f Bam2Bw
#/ function for optional averaging:
function AveragebedGraph {
bedtools unionbedg -i $(ls "${1}"*.bedGraph | tr "\n" " ") \
| mawk 'OFS="\t" {sum=0; for (col=4; col<=NF; col++) sum += $col; print $1, $2, $3, sum/(NF-4+1); }'
}; export -f AveragebedGraph
#-----------------------------------------------
#/ to bedGraph
echo ${bams} | tr " " "\n" | mawk NF | sort -u | parallel --will-cite -j "${njobs}" "Bam2Bw {}"
#/ optionally average if "_rep" nomenclature:
if [[ "${extend}" != 0 ]]; then
suffix='_extended'
else
suffix=''
fi
if [[ "${average}" == "TRUE" ]]; then
if [[ $(find . -maxdepth 1 -name "*_rep*" | wc -l) == 0 ]]; then
echo '[Info] --average is set but no bedGraphs with *_rep* as delimiter can be found -- continuing without that option'
else
find . -maxdepth 1 -name "*_rep*.bedGraph" \
| awk -F "_rep" '{print $1 | "sort -u"}' \
| parallel --will-cite -j "${njobs}" "AveragebedGraph {}_rep > {}${suffix}_averaged.bedGraph"
fi
fi
#/ optionally (by default true) to bigwig
if [[ "${nobigwig}" == "FALSE" ]]; then
ls *chromsize | head -n 1 | xargs cat > chromsizes_tobigwig.txt
if [[ $(cat chromsizes_tobigwig.txt | wc -l) == 0 ]]; then
echo '[Error] Cannot create bigwigs as chromsize file cannot be parsed'
exit 1
fi
ls *.bedGraph | parallel --will-cite -j "${njobs}" "bedGraphToBigWig {} chromsizes_tobigwig.txt {.}.bigwig"
fi