-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_pipeline.sh
180 lines (157 loc) · 6.67 KB
/
run_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/bin/bash -x
### get script directory
BIN=$(dirname "${BASH_SOURCE[0]}")
pushd $BIN > /dev/null
BASEDIR=`pwd`
popd > /dev/null
### get input
PROT1=$1
PROT2=$2
CHAIN1=$3
CHAIN2=$4
### get prot IDs and define input format
SUFFIX=`echo $1 | awk -F . '{print $NF}'`
ID1=`basename ${PROT1%.$SUFFIX}`
ID2=`basename ${PROT2%.$SUFFIX}`
### singularity image
SINGULARITY=${BASEDIR}/AF_data/AF_environment.sif
### starting from structures
if [ $SUFFIX == "pdb" ] || [ $SUFFIX == "cif" ]; then
### extract proper chain (bakbone atoms only), rename it as A and renumber it from 1 not considering gaps
singularity exec $SINGULARITY \
python3 $BASEDIR/src/pdb_extract.py $PROT1 $CHAIN1 ./${ID1}_$CHAIN1.pdb
singularity exec $SINGULARITY \
python3 $BASEDIR/src/pdb_extract.py $PROT2 $CHAIN2 ./${ID2}_$CHAIN2.pdb
### rename chain2 as B
sed 's/ A / B /g' ${ID2}_$CHAIN2.pdb > ${ID2}_${CHAIN2}_chainrename.pdb
mv ${ID2}_${CHAIN2}_chainrename.pdb ${ID2}_$CHAIN2.pdb
### concatenate the two chains in a single structure for comparison with final model
cat ${ID1}_$CHAIN1.pdb | grep -v 'END' > ${ID1}_${CHAIN1}-${ID2}_${CHAIN2}.pdb
cat ${ID2}_$CHAIN2.pdb >> ${ID1}_${CHAIN1}-${ID2}_${CHAIN2}.pdb
MERGEDSTR=${ID1}_${CHAIN1}-${ID2}_${CHAIN2}.pdb
### get single and concatenated fastas
echo '>'${ID1}_${CHAIN1} > ${ID1}_$CHAIN1.fasta
singularity exec $SINGULARITY \
python3 $BASEDIR/src/sequence_extract.py ${ID1}_$CHAIN1.pdb A >> ${ID1}_$CHAIN1.fasta
echo '>'${ID2}_${CHAIN2} > ${ID2}_$CHAIN2.fasta
singularity exec $SINGULARITY \
python3 $BASEDIR/src/sequence_extract.py ${ID2}_$CHAIN2.pdb B >> ${ID2}_$CHAIN2.fasta
FASTA1=${ID1}_$CHAIN1.fasta
FASTA2=${ID2}_$CHAIN2.fasta
SEQ1=`tail -1 ${FASTA1}`
SEQ2=`tail -1 ${FASTA2}`
echo '>'${ID1}_${CHAIN1}-${ID2}_${CHAIN2} > ${ID1}_${CHAIN1}-${ID2}_${CHAIN2}.fasta
echo ${SEQ1}${SEQ2} >> ${ID1}_${CHAIN1}-${ID2}_${CHAIN2}.fasta
MERGED_FASTA=${ID1}_${CHAIN1}-${ID2}_${CHAIN2}.fasta
MERGED_ID=${ID1}_${CHAIN1}-${ID2}_${CHAIN2}
### starting from sequences
elif [ $SUFFIX == "fasta" ]; then
### get merged sequences
FASTA1=$PROT1
FASTA2=$PROT2
SEQ1=`grep -v '>' $PROT1 | tr -d '\n'`
SEQ2=`grep -v '>' $PROT2 | tr -d '\n'`
echo '>'${ID1}-${ID2} > ${ID1}-${ID2}.fasta
echo ${SEQ1}${SEQ2} >> ${ID1}-${ID2}.fasta
MERGED_FASTA=${ID1}-${ID2}.fasta
MERGED_ID=${ID1}-${ID2}
### starting from msa
elif [ $SUFFIX == "a3m" ]; then
head -2 $PROT1 > ${ID1}.fasta
head -2 $PROT2 > ${ID2}.fasta
FASTA1=${ID1}.fasta
FASTA2=${ID2}.fasta
SEQ1=`tail -1 $FASTA1`
SEQ2=`tail -1 $FASTA2`
echo '>'${ID1}-${ID2} > ${ID1}-${ID2}.fasta
echo ${SEQ1}${SEQ2} >> ${ID1}-${ID2}.fasta
MERGED_FASTA=${ID1}-${ID2}.fasta
MERGED_ID=${ID1}-${ID2}
fi
### run alignment if not getting it as input
if [ $SUFFIX == "pdb" ] || [ $SUFFIX == "cif" ] || [ $SUFFIX == "fasta" ]; then
singularity exec $SINGULARITY \
hhblits -i $FASTA1 -d $BASEDIR/uniclust30_2018_08/uniclust30_2018_08 -E 0.001 -all -oa3m ${FASTA1%.fasta}.a3m
A3M1=${FASTA1%.fasta}.a3m
singularity exec $SINGULARITY \
hhblits -i $FASTA2 -d $BASEDIR/uniclust30_2018_08/uniclust30_2018_08 -E 0.001 -all -oa3m ${FASTA2%.fasta}.a3m
A3M2=${FASTA2%.fasta}.a3m
elif [ $SUFFIX == "a3m" ]; then
A3M1=$PROT1
A3M2=$PROT2
fi
MGF=0.9
FUSEDMSA=${MERGED_ID}_fused.a3m
PAIREDMSA=${MERGED_ID}_paired.a3m
if [ -f $A3M1 ] && [ -f $A3M2 ]
then
if [ ! -f $PAIREDMSA ]
then
python3 ${BASEDIR}/src/oxmatch.py --a3m1 $A3M1 --a3m2 $A3M2 --max_gap_fraction $MGF --outname $PAIREDMSA
fi
if [ ! -f $FUSEDMSA ]
then
python3 ${BASEDIR}/src/fuse_msas.py --a3m1 $A3M1 --a3m2 $A3M2 --max_gap_fraction $MGF --outname $FUSEDMSA
fi
fi
### AlphaFold runtime parameters
L1=`${BASEDIR}/src/seqlen.sh $FASTA1` # Length of chain1 (where to introduce chain break)
L2=`${BASEDIR}/src/seqlen.sh $FASTA2` # Length of chain2
MSAS=${PAIREDMSA},${FUSEDMSA} # Comma separated list of msa paths
AFHOME=${BASEDIR}/src/alphafold/ # Path of alphafold directory in FoldDock
PARAM=${BASEDIR}/AF_data/ # Path to AF2 params
OUTFOLDER=./ # Path where AF2 generates its output folder structure
PRESET="full_dbs" # Configuration1 - no ensembling (full_dbs) and (reduced_dbs) or 8 model ensemblings (casp14).
MODEL_NAME="model_1" # Configuration2
MAX_RECYCLES=10 # recycles number (default=3)
# HEADER="if_plddt_av,if_plddt_std,ch1_plddt_av,ch1_plddt_std,ch2_plddt_av,ch2_plddt_std,"
# HEADER=$HEADER"plddt_av,plddt_std,num_atoms_in_interface,num_res_in_interface,Neff1,Neff2"
# if [ $SUFFIX == "pdb" ] || [ $SUFFIX == "cif" ]; then HEADER=$HEADER",DockQ"; fi
# echo $HEADER > ${MERGED_ID}_unranked.csv
for n in {1..5}; do
cp $MERGED_FASTA ${MERGED_ID}_${n}.fasta
### Run Alphafold2 to fold provided chains
singularity exec --nv --bind $BASEDIR:$BASEDIR $SINGULARITY \
python3 $AFHOME/run_alphafold.py \
--fasta_paths=${MERGED_ID}_${n}.fasta \
--output_dir=$OUTFOLDER \
--model_names=$MODEL_NAME \
--max_recycles=$MAX_RECYCLES \
--data_dir=$PARAM \
--preset=$PRESET \
--fold_only \
--msas=$MSAS \
--chain_break_list=$L1 \
rm ${MERGED_ID}_${n}.fasta
# ### Run analysis on interface pLDDT and contacts
# INTSTATS=`singularity exec --nv --bind $BASEDIR:$BASEDIR $SINGULARITY \
# python3 $BASEDIR/src/fetch_plDDT.py \
# --model ${MERGED_ID}_$n/unrelaxed_model_1.pdb \
# --metric ${MERGED_ID}_$n/result_model_1.pkl \
# --l1 $L1 \
# --l2 $L2`
#
# ### split modelled chains in 2, renumber and rename them A and B
# python3 $BASEDIR/src/split_chains.py --structure ${MERGED_ID}_$n/unrelaxed_model_1.pdb --outname ${MERGED_ID}_unranked_$n
# INTSTATS=${INTSTATS},${NEFF1},${NEFF2}
#
# ### If starting from structures, compare with (merged) structures provided as input
# if [ $SUFFIX == "pdb" ] || [ $SUFFIX == "cif" ]; then
# DOCKQ=`singularity exec --nv --bind $BASEDIR:$BASEDIR $SINGULARITY \
# $BASEDIR/src/DockQ/DockQ.py \
# -short \
# ${MERGED_ID}_unranked_${n}.pdb \
# $MERGEDSTR | \
# grep "DockQ" | awk '{print $2}'`
#
# INTSTATS=${INTSTATS},$DOCKQ
# fi
#
# echo $INTSTATS >> ${MERGED_ID}_unranked.csv
done
# ### rerank obtained structures
# if [ $SUFFIX == "pdb" ] || [ $SUFFIX == "cif" ]; then
# python3 $BASEDIR/src/rerank.py --comm ${MERGED_ID}_unranked --csv ${MERGED_ID}_unranked.csv --col 9 --out ${MERGED_ID}_ranked
# else
# python3 $BASEDIR/src/rerank.py --comm ${MERGED_ID}_unranked --csv ${MERGED_ID}_unranked.csv --col 9 --out ${MERGED_ID}_ranked
# fi