-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path3_redkmer_kmerGenerate.bash
executable file
·131 lines (96 loc) · 4.92 KB
/
3_redkmer_kmerGenerate.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
#PBS -N redkmer3
#PBS -l walltime=72:00:00
#PBS -l select=1:ncpus=24:mem=32gb:tmpspace=400gb
#PBS -e /home/nikiwind/reports
#PBS -o /home/nikiwind/reports
if [ -z ${PBS_ENVIRONMENT+x} ]
then
echo "---> running on the Perugia numbercruncher..."
source redkmer.cfg
else
echo "---> running on HPC cluster..."
source $PBS_O_WORKDIR/redkmer.cfg
fi
printf "======= calculating library sizes =======\n"
illLIBMsize=$(wc -l $illM | awk '{print ($1/4)}')
illLIBFsize=$(wc -l $illF | awk '{print ($1/4)}')
illnorm=$((($illLIBMsize+$illLIBFsize)/2))
printf "======= using jellyfish to create kmers of lenght 30 from male and female illumina libraries =======\n"
if [ -z ${PBS_ENVIRONMENT+x} ]
then
$JFISH count -C -L 2 -m 25 $illM -o $CWD/kmers/rawdata/m -c 3 -s 1000000000 -t $CORES
$JFISH count -C -L 2 -m 25 $illF -o $CWD/kmers/rawdata/f -c 3 -s 1000000000 -t $CORES
$JFISH dump $CWD/kmers/rawdata/m -c -L 2 -o $CWD/kmers/rawdata/m.counts
$JFISH dump $CWD/kmers/rawdata/f -c -L 2 -o $CWD/kmers/rawdata/f.counts
printf "======= sorting and counting kmer libraries =======\n"
time sort -k1b,1 --parallel=8 -T $CWD/temp --buffer-size=$BUFFERSIZE $CWD/kmers/rawdata/m.counts > $CWD/kmers/rawdata/m.sorted &
time sort -k1b,1 --parallel=8 -T $CWD/temp --buffer-size=$BUFFERSIZE $CWD/kmers/rawdata/f.counts > $CWD/kmers/rawdata/f.sorted &
wait $(jobs -p)
else
cat > ${CWD}/qsubscripts/femalejelly.bashX <<EOF
#!/bin/bash
#PBS -N redkmer_f_jf
#PBS -l walltime=72:00:00
#PBS -l select=1:ncpus=24:mem=32gb:tmpspace=400gb
#PBS -e /home/nikiwind/reports
#PBS -o /home/nikiwind/reports
module load jellyfish
cp $illF $TMPDIR
$JFISH count -C -L 2 -m 25 XXXXX/f.fastq -o XXXXX/m -c 3 -s 1000000000 -t $CORES
$JFISH dump XXXXX/f -c -L 2 -o XXXXX/f.counts
printf "======= sorting and counting female kmer libraries =======\n"
sort -k1b,1 -T XXXXX/temp --buffer-size=$BUFFERSIZE XXXXX/f.counts > XXXXX/f.sorted
cp XXXXX/f.sorted $CWD/kmers/rawdata/
EOF
sed 's/XXXXX/$TMPDIR/g' ${CWD}/qsubscripts/femalejelly.bashX > ${CWD}/qsubscripts/femalejelly.bash
cat > ${CWD}/qsubscripts/malejelly.bashX <<EOF
#!/bin/bash
#PBS -N redkmer_m_jf
#PBS -l walltime=72:00:00
#PBS -l select=1:ncpus=24:mem=32gb:tmpspace=400gb
#PBS -e /home/nikiwind/reports
#PBS -o /home/nikiwind/reports
module load jellyfish
cp $illM $TMPDIR
$JFISH count -C -L 2 -m 25 XXXXX/m.fastq -o XXXXX/m -c 3 -s 1000000000 -t $CORES
$JFISH dump XXXXX/m -c -L 2 -o XXXXX/m.counts
printf "======= sorting and counting male kmer libraries =======\n"
sort -k1b,1 -T XXXXX/temp --buffer-size=$BUFFERSIZE XXXXX/m.counts > XXXXX/m.sorted
cp XXXXX/m.sorted $CWD/kmers/rawdata/
EOF
sed 's/XXXXX/$TMPDIR/g' ${CWD}/qsubscripts/malejelly.bashX > ${CWD}/qsubscripts/malejelly.bash
JMALEJOB=$(qsub ${CWD}/qsubscripts/malejelly.bash)
echo $JMALEJOB
JFEMALEJOB=$(qsub ${CWD}/qsubscripts/femalejelly.bash)
echo $JFEMALEJOB
while qstat $JFEMALEJOB &> /dev/null; do
sleep 10;
done;
while qstat $JMALEJOB &> /dev/null; do
sleep 10;
done;
fi
printf "======= merging kmer libraries =======\n"
join -a1 -a2 -1 1 -2 1 -o '0,1.2,2.2' -e "0" $CWD/kmers/rawdata/f.sorted $CWD/kmers/rawdata/m.sorted > $CWD/kmers/rawdata/kmers_to_merge
printf "======= removing kmers absent in male library (from seq errors or low read depth) =======\n"
awk '{if ($3>0) print}' $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
printf "======= creating unique ids for all kmers =======\n"
awk '{printf("%.1f %s\n", 1+(NR-1), $0)}' $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
awk '{print "kmer_"$0}' $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
sort -k1b,1 $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
printf "======= normalizing to library sizes =======\n"
awk -v ma="$illLIBMsize" -v fema="$illLIBFsize" -v le="$illnorm" '{print $1, $2, ($3*fema/le), ($4*ma/le)}' $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
printf "======= calculating kmer CQ and sum of kmer occurences in both libraries =======\n"
awk '{_div1= $4 ? ($3 / $4) : 0 ; print $0, _div1 }' $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
#Adding sum
awk '{print $0, ($3+$4)}' $CWD/kmers/rawdata/kmers_to_merge > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmers_to_merge
#Replace space with tabs
awk -v OFS="\t" '$1=$1' $CWD/kmers/rawdata/kmers_to_merge > $CWD/kmers/rawdata/kmer_counts
printf "======= generating final kmer_counts file =======\n"
#Add column header
awk 'BEGIN {print "kmer_id\tseq\tfemale\tmale\tCQ\tsum"} {print}' $CWD/kmers/rawdata/kmer_counts > tmpfile; mv tmpfile $CWD/kmers/rawdata/kmer_counts
printf "======= generating fasta file for next blast =======\n"
# make fasta file from kmers for blast
awk '{print ">"$1"\n"$2}' $CWD/kmers/rawdata/kmers_to_merge > $CWD/kmers/fasta/allkmers.fasta
printf "======= Done step 3=======\n"