-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathPANZ_cluster_pipe.sh
684 lines (652 loc) · 21.9 KB
/
PANZ_cluster_pipe.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
#!/usr/bin/env bash
usage="
--------------------------------------------------
Clustering PANZ_individual_pipe.sh outputs
--------------------------------------------------
USAGE:
bash $0 <OPTIONS>
OPTIONS: (R:required O:optional)
-h show help and exit.
-t <num> O set threads (default 2)
-V <list> R VCF file list
-U <list> R UNANCHORED list
-S <list> R unaln sequences
-d <num> O pre distance (default 200 bp)
-D <num> O final distance (default 2000 bp)
-c <0-1> O coverage cutoff (default 0.8)
-i <0-1> O identity cutoff (default 0.9)
-w <num> O cdhit word length (default 10)
-n <num> O Min support number (default 3)
-r <num> O Min REF support number (default 1)
-k <T/F> O Keep all intermediate files if 'T'(default F)
--------------------------------------------------
Songtao Gui
"
############### Set Default OPT #############
threads=2
export VCF_list=
export UNANCHORED_list=
export unaln_seq_list=
pre_distance=200
final_distance=2000
export coverage=0.8
export identity=0.9
wordlength=10
export supp_num=3
export ref_num=1
keep=F
# --------------------
workdir=$PWD
srcdir=$(cd $(dirname $0); pwd)
cd $workdir;
# ---------------------
################# GETOPT ####################
if [ $# -eq 0 ]; then echo "$usage" >&2; exit 1;fi
while getopts t:V:U:S:d:D:c:i:w:n:r:k:h opt;
do
case $opt in
h) echo "$usage" >&2
exit
;;
t) threads=$OPTARG
;;
V) VCF_list=$OPTARG
;;
U) UNANCHORED_list=$OPTARG
;;
S) unaln_seq_list=$OPTARG
;;
d) pre_distance=$OPTARG
;;
D) final_distance=$OPTARG
;;
c) coverage=$OPTARG
;;
i) identity=$OPTARG
;;
w) wordlength=$OPTARG
;;
n) supp_num=$OPTARG
;;
r) ref_num=$OPTARG
;;
k) keep=$OPTARG
;;
:) printf "[ERROR] --> missing argument for -%s\n" "$OPTARG" >&2
echo "$usage" >&2
exit 1
;;
\?) printf "[ERROR] --> illegal option: -%s\n" "$OPTARG" >&2
echo "$usage" >&2
exit 1
;;
esac
done
shift $((OPTIND - 1))
################# GETOPT END ################
echo "
################# Pre-step ######################
generate bed files
###################################################
" >&2
if [[ -s "00_combine-left.bed.gz" && -s "00_combine-right.bed.gz" ]];then
echo -e "Skip. File exists:" >&2
else
echo ">>> get left and right bed files from VCF file to 00_bed_files ..." >&2
mkdir -p 00_bed_files
######### run loop with multi threads #########
# set threads
loop_threads=$threads
# open fd
[ -e /tmp/fd1 ] || mkfifo /tmp/fd1
exec 7<>/tmp/fd1
rm -rf /tmp/fd1
for i in $(seq 0 $loop_threads);do
echo
done >&7
cat $VCF_list | while read vcf_file
do
read -u7
{
if [ ! -s $vcf_file ];then
echo "No file: $vcf_file" >&2
exit 1
fi
# get basename
export vcf_pre=$(basename $vcf_file)
vcf_pre=${vcf_pre%%_00_FINAL_ALL.vcf}
cat $vcf_file | perl -F"\t" -lane '
$,="\t";
/^#/ && next;
($chr1,$end1,$id)=@F[0,1,2];
$start1=$end1-1;
$start1=0 if $start1 < 0;
if(/CHR2=(.*);END=(\d+);/){
$chr2=$1;
$end2=$2;
$start2=$end2-1;
$start2=0 if $start2 < 0;
print $chr1,$start1,$end1,"$ENV{vcf_pre}#$id","left";
print $chr2,$start2,$end2,"$ENV{vcf_pre}#$id","right";
}else{
die "No CHR2 or END for $id";
}
' >00_bed_files/${vcf_pre}.bed
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> get-bed: non-zero exit." >&2
rm -f 00_bed_files/${vcf_pre}.bed
exit 1
fi
echo >&7
} &
done | cat
wait
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> get-bed: non-zero exit." >&2
exit 1
fi
exec 7>&- # close fd
exec 7<&- # close fd
echo ">>> generate left and right bed: 00_combine-left.bed.gz 00_combine-right.bed.gz ..." >&2
pwd
csvtk concat -j $threads -tTH 00_bed_files/*.bed | csvtk split -G -j $threads -tTH -f 5 -o $PWD
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> csvtk split: non-zero exit." >&2
# rm -f xxx.outputs
exit 1
fi
echo "Done csvtk split" >&2
mv stdin-left.tsv.gz 00_combine-left.bed.gz &&\
mv stdin-right.tsv.gz 00_combine-right.bed.gz &&\
echo "Done. Generated bed files:" >&2
fi
ls -lh 00_combine-left.bed.gz 00_combine-right.bed.gz >&2
echo "
################### STEP1 #######################
Clustering by left-posi within $pre_distance bp
###################################################
" >&2
echo "bash $srcdir/src/pan10_bed_cls.sh 00_combine-left.bed.gz $unaln_seq_list S1 $pre_distance $coverage $identity $wordlength $threads $keep" >&2
bash $srcdir/src/pan10_bed_cls.sh 00_combine-left.bed.gz $unaln_seq_list S1 $pre_distance $coverage $identity $wordlength $threads $keep
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> step1: non-zero exit."
exit 1
fi
echo "Done. Output files:" >&2
ls | grep "^S1" >&2
# bash panz_bed_cls.sh <bed> <unaln_seq_list> <step_pre> <distance> <coverage> <identity> <wordlength> <threads> <keep>
echo "
################### step2 ######################
Clustering by right-posi within $pre_distance bp
###################################################
"
# get right cls beds
if [[ ! -s "S1_02_uniqID_uncls.id.gz" || ! -s "S1_07_clstr_refID.id.gz" ]];then
echo "No file: S1_02_uniqID_uncls.id.gz OR S1_07_clstr_refID.id.gz" >&2
exit 1
fi
echo ">>> generate bed file for step2: S2_00_combine-right.bed.gz ..."
if [ -s "S2_00_combine-right.bed.gz" ];then
echo "Skip. File exists." >&2
else
cat S1_02_uniqID_uncls.id.gz S1_07_clstr_refID.id.gz | csvtk uniq -tTH -j $threads -o S1_10_all_ID_for_S2.id.gz &&\
csvtk join -tTH -j $threads -f '4;1' 00_combine-right.bed.gz S1_10_all_ID_for_S2.id.gz -o S2_00_combine-right.bed.gz
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> generate bed: non-zero exit." >&2
rm -f S2_00_combine-right.bed.gz
exit 1
fi
echo "Done. Generated file: S2_00_combine-right.bed.gz" >&2
fi
bash $srcdir/src/pan10_bed_cls.sh S2_00_combine-right.bed.gz $unaln_seq_list S2 $pre_distance $coverage $identity $wordlength $threads $keep
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> step2: non-zero exit."
exit 1
fi
echo "Done. Output files:" >&2
ls | grep "^S2" >&2
# final cluster
echo "
################### step3 ######################
Final clustering within $final_distance
###################################################
"
# get final cls beds
if [[ ! -s "S2_02_uniqID_uncls.id.gz" || ! -s "S2_07_clstr_refID.id.gz" ]];then
echo "No file: S2_02_uniqID_uncls.id.gz OR S2_07_clstr_refID.id.gz" >&2
exit 1
fi
echo ">>> generate bed files for step3: " >&2
if [ -s "S3_00_all_left_right.bed.gz" ];then
echo "Skip. File exists." >&2
else
cat S2_02_uniqID_uncls.id.gz S2_07_clstr_refID.id.gz | csvtk uniq -tTH -j $threads -o S2_10_all_ID_for_S3.id.gz &&\
cat 00_combine-left.bed.gz 00_combine-right.bed.gz > 00_all_left_right.bed.gz &&\
csvtk join -tTH -j $threads -f '4;1' 00_all_left_right.bed.gz S2_10_all_ID_for_S3.id.gz -o S3_00_all_left_right.bed.gz
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> generate bed: non-zero exit." >&2
rm -f S3_00_all_left_right.bed.gz
exit 1
fi
echo "Done. Generated file: S3_00_all_left_right.bed.gz" >&2
fi
bash $srcdir/src/pan10_bed_cls.sh S3_00_all_left_right.bed.gz $unaln_seq_list S3 $final_distance $coverage $identity $wordlength $threads $keep
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> step3: non-zero exit."
exit 1
fi
echo "Done. Output files:" >&2
ls | grep "^S3" >&2
echo "
################ STEP4 ############################
Merge all clusters and filter
###################################################
"
# merge S*_06_collapsed_cdhit_clstr.tsv.gz
echo "[S4_STEP01]>>> merge 3 cluster results and get final cluster file ..." >&2
if [ -s S4_00_FINAL_Cluster.list.gz ];then
echo "Skip. File exists" >&2
else
############ Check if all related files exists ###########
num_related_file=1;
for related_file in S*_06_collapsed_cdhit_clstr.tsv.gz
do
if [[ ! -s "$related_file" ]]; then
printf "[ERROR] --> No file: %s \n" $related_file >&2
let num_related_file++
fi
done
if [ $num_related_file -eq 1 ];then
echo "All related files were found. Proceeding ..." >&2
else
echo "Check if you miss somthing." >&2
exit 1;
fi
############### merging cluster files #################
zcat S3_06_collapsed_cdhit_clstr.tsv.gz | cut -f 4 | perl -lane '
BEGIN{
$fs="S2_06_collapsed_cdhit_clstr.tsv.gz";
open(IN,"gzip -dc $fs |") or die("cannot open $fs:$!");
while(<IN>){
chomp;
(undef,$ref,undef,$all)=split(/\t/,$_);
die("No record for $_") unless $ref;
$hs{$ref}=$all;
}
close IN;
}
@seqid=split(/,/,$_);
for ($i=0;$i<=$#seqid;$i++) {
$seqid[$i]=$hs{$seqid[$i]} if $hs{$seqid[$i]};
}
$,=",";
print @seqid;
' | perl -lane '
BEGIN{
$fs="S1_06_collapsed_cdhit_clstr.tsv.gz";
open(IN,"gzip -dc $fs |") or die("cannot open $fs:$!");
while(<IN>){
chomp;
(undef,$ref,undef,$all)=split(/\t/,$_);
die("No record for $_") unless $ref;
$hs{$ref}=$all;
}
close IN;
}
@seqid=split(/,/,$_);
for ($i=0;$i<=$#seqid;$i++) {
$seqid[$i]=$hs{$seqid[$i]} if $hs{$seqid[$i]};
}
$,=",";
print @seqid;
' | perl -F"," -lane '
%hash={};
@uniq=grep {++$hash{$_}==1} @F;
$cls=join(",",@uniq);
$,="\t";
print $#uniq+1,$cls;
' | pigz -p $threads > S4_00_FINAL_Cluster.list.gz
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> merging cls: non-zero exit." >&2
rm -f S4_00_FINAL_Cluster.list.gz
exit 1
fi
fi
echo "Done. Generated merged cluster file: S4_00_FINAL_Cluster.list.gz"
############## Filtering merged clusters ##########
# r>0
# t > r -> PASS_WithRef
# t = r -> PASS_OnlyRef
# t < r -> error
# r=0
# t = nlowqual -> LOWQUAL
# t > nlowqual -> PASS_NoRef
# t < nlowqual -> error
# r<0 error
Filter_CLS_fun(){
cur_tsupp=$(echo "$1" | cut -f 1 )
cur_rcd=$(echo "$1" | cut -f 2 )
cur_rsupp=$(echo "$cur_rcd" | grep -o "ref" | grep -c "ref" )
nlowqual=$(echo "$cur_rcd" | grep -o "LOWQUAL" | grep -c "LOWQUAL" )
if [[ "$cur_tsupp" -ge "$supp_num" || "$cur_rsupp" -ge "$ref_num" ]];then
if [[ "$cur_rsupp" -gt 0 ]];then
# r > 0
if [[ "$cur_tsupp" -gt "$cur_rsupp" ]];then
# t > r -> PASS_WithRef
cls_type="PASS_WithRef"
elif [[ "$cur_tsupp" -eq "$cur_rsupp" ]]; then
# t = r -> PASS_OnlyRef
cls_type="PASS_OnlyRef"
else
# t < r -> error
echo -e "[ERROR] --> ref_num gt total num:\n $cur_tsupp\t$cur_rcd" >&2
exit 1
fi
elif [[ "$cur_rsupp" -eq 0 ]];then
# r = 0
if [[ "$cur_tsupp" -gt "$nlowqual" ]];then
# t > nlowqual -> PASS_NoRef
cls_type="PASS_NoRef"
elif [[ "$cur_tsupp" -eq "$nlowqual" ]]; then
# t = nlowqual -> LOWQUAL
cls_type="LOWQUAL"
else
# t < nlowqual -> error
echo -e "[ERROR] --> nlowqual gt total num:\n $cur_tsupp\t$cur_rcd" >&2
exit 1
fi
elif [[ "$cur_rsupp" -lt 0 ]]; then
# r < 0
echo -e "[ERROR] --> ref num less than zero:\n $cur_tsupp\t$cur_rcd" >&2
exit 1
fi
cur_out_line=$(echo -e "$cls_type\t$cur_tsupp\t$cur_rsupp\t$nlowqual\t$cur_rcd") &&\
echo "$cur_out_line"
fi
}
export -f Filter_CLS_fun
echo "[S4_STEP02]>>> Filtering out clusters with less than $supp_num total_supports or less than $ref_num Ref_supports ..." >&2
if [ -s S4_01_Filter_Cluster_fmt.tsv.gz ];then
echo "SKip. File exists." >&2
else
rm -f S4_01_Filter_Cluster_fmt.tsv
pigz -dc -p $threads S4_00_FINAL_Cluster.list.gz | parallel -j $threads -k --lb --bar Filter_CLS_fun | pigz -p $threads >S4_01_Filter_Cluster_fmt.tsv.gz
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> filter by parallel: non-zero exit." >&2
rm -f S4_01_Filter_Cluster_fmt.tsv.gz
exit 1
fi
fi
echo -e "Done. Filtered file:\n$(ls -lh S4_01_Filter_Cluster_fmt.tsv.gz)" >&2
echo "
[S4_STEP02]>>> Get represent sequences for each cluster ...
(Criteria: Length > BE|SE > REF|NonREF > PASS|LOWQUAL )
" >&2
# function
# input is S4_01_Filter_Cluster_fmt.tsv.gz per line
get_rep_cls_fun(){
line=$1
cls_type=$(echo "$line" | cut -f 1 )
fmt_rcd=$(
echo "$line" | cut -f 5 | sed 's/,/\n/g' | perl -lane '
$line=$_;
($sample,$type,$orient,$l_r,$id,$strand)=split(/#/,$line);
$,="\t";
if($id=~/_(\d+)-(\d+)$/){
$length=$2-$1+1;
$bs="SE";
$bs="BE" if $type=~/^BE/;
$ref=2;
$ref=1 if $type=~/ref/;
$pass=1;
$pass=2 if $type=~/LOWQUAL/;
print $length,$bs,$ref,$pass,$line;
}else{
die("Wrong scaffold id format: $id \n");
}
' )
if [[ -z "$fmt_rcd" ]];then
echo "[ERROR]-->empty fmt_rcd." >&2
#exit 1
fi
rep_rcd=$(echo "$fmt_rcd" | sort -k1,1nr -k2,2 -k3,3n -k4,4n | cut -f 5 | head -1)
if [[ -z "$rep_rcd" ]];then
echo "[ERROR]-->empty rep_rcd." >&2
#exit 1
fi
echo -e "$rep_rcd\t$line"
}
export -f get_rep_cls_fun
# run get_rep_cls function using parallel
if [ -s S4_02_AddRepresentID_Cluster.tsv.gz ];then
echo "SKip. File exists." >&2
else
pigz -dc -p $threads S4_01_Filter_Cluster_fmt.tsv.gz | parallel -j $threads -k --lb get_rep_cls_fun | pigz -p $threads >S4_02_AddRepresentID_Cluster.tsv.gz
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> get rep id by parallel: non-zero exit." >&2
rm -f S4_02_AddRepresentID_Cluster.tsv.gz
exit 1
fi
echo -e "Done. Add rep file:\n$(ls -lh S4_02_AddRepresentID_Cluster.tsv.gz)" >&2
fi
echo "[S4_STEP03]>>> Get the final represent Clustered Sequences ..." >&2
progress_check=$PWD/progress_check
if [ -s "${progress_check}/S4_03.finished" ];then
echo "Skip running." >&2
cat ${progress_check}/S4_03.finished >&2
else
echo ">>>>>> seperate represent IDs by sample to S4_03_EACH_sample_RepresentID dir ..." >&2
pigz -dc -p $threads S4_02_AddRepresentID_Cluster.tsv.gz | cut -f 1,2 |\
perl -lane '($sample,$id)=(split(/#/,$F[0]))[0,4];$,="\t";print $sample,$id,"$F[1]#$F[0]";'|\
csvtk split -j $threads -tTH -f 1 -o S4_03_EACH_sample_RepresentID
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> get split id: non-zero exit." >&2
rm -rf S4_03_EACH_sample_RepresentID
exit 1
fi
echo "Done." >&2
echo ">>>>>> Get represent sequences for each sample to S4_04_EACH_sample_RepresentSequences dir ..." >&2
mkdir -p S4_04_EACH_sample_RepresentSequences
# get sequences by CLS
######### run loop with multi threads #########
# set threads
loop_threads=$threads
# open fd
[ -e /tmp/fd1 ] || mkfifo /tmp/fd1
exec 7<>/tmp/fd1
rm -rf /tmp/fd1
for i in $(seq 0 $loop_threads);do
echo
done >&7
########### progress bar ###########
####### scripts before loop ########
export ttt=$(ls S4_03_EACH_sample_RepresentID/*.tsv | wc -l)
echo "Progress:( Total records: $ttt )
start: $(date)
0% 10 20 30 40 50 60 70 80 90 100%
|----|----|----|----|----|----|----|----|----|----|" >&2
# creat tmp files for counting (use tmp files instead of VARs for FIFO-loop)
rm -f progress_bar_kkk.tmp progress_bar_ccc.tmp
echo "start" > progress_bar_ccc.tmp
touch progress_bar_kkk.tmp
#####################################
ls S4_03_EACH_sample_RepresentID/*.tsv | while read cls_file
do
read -u7
{
tpre=$(basename $cls_file)
tpre=${tpre%%.tsv}
tpre=${tpre##stdin-}
cur_fa=$(grep -m 1 "/${tpre}.unaligned" $unaln_seq_list)
if [ ! -s "$cur_fa" ];then
echo "[ERROR] --> No file: $cur_fa" >&2
exit 1
fi
csvtk join -tTH -f "2;1" -k $cls_file <(seqkit fx2tab $cur_fa) -o S4_04_EACH_sample_RepresentSequences/${tpre}_repSeq.tsv.gz
#### Progress bar within loop ####
ccc=$(perl -e '$a=`cat progress_bar_ccc.tmp | wc -l`;print int($a*100/$ENV{ttt});')
kkk=$(cat progress_bar_kkk.tmp | wc -l)
echo "ccc" >> progress_bar_ccc.tmp
if [[ $ccc -eq $kkk ]];then
echo -e "kkk1\nkkk2" >> progress_bar_kkk.tmp
echo -n "*" >&2;
fi
###################################
echo >&7
} &
done | cat
wait
exec 7>&- # close fd
exec 7<&- # close fd
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> get rep seq tfa: non-zero exit."
exit 1
fi
echo -e "\nDone." >&2
echo ">>>>>> Merge represent sequences.tfa to S4_04_RepresentSequences_tfa.tsv.gz ..." >&2
cat S4_04_EACH_sample_RepresentSequences/*_repSeq.tsv.gz > S4_04_RepresentSequences_tfa.tsv.gz
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> merge rep seq: non-zero exit." >&2
rm -f S4_04_RepresentSequences_tfa.tsv.gz
exit 1
fi
echo ">>>>>> Get final represent sequnces.fasta to S4_04_RepresentSequences.fasta ..." >&2
csvtk cut -j $threads -tTH -f 3,4 S4_04_RepresentSequences_tfa.tsv.gz | seqkit tab2fx -j $threads -o S4_04_RepresentSequences.fasta
if [[ $? -ne 0 ]] ; then
echo "[ERROR] --> get rep seq fasta: non-zero exit." >&2
rm -f S4_04_RepresentSequences.fasta.gz
exit 1
fi
# remove intermediate files if $keep is not "T"
if [ "$keep" != "T" ];then
echo "cleaning S4_04_EACH_sample_RepresentSequences and S4_03_EACH_sample_RepresentID ... " >&2
rm -rf S4_04_EACH_sample_RepresentSequences S4_03_EACH_sample_RepresentID
echo "Done." >&2
fi
echo "Already finished S4_04_EACH_sample_RepresentSequences" >${progress_check}/S4_03.finished
echo -e "Done. results in: \n$(ls -lh S4_04_RepresentSequences* )" >&2
fi
# exit ;
echo "
###################################################
Mapping UNANCHORED seq to Clustered represents
###################################################
"
minimap2 -t $threads -c --cs S4_04_RepresentSequences.fasta S5_UNANCHORED_CLS_rep_seq.fa.gz | pigz -p $threads >S5_UNANCHORED_rep_seq_MM2_S4_Repseq_MergeRef.paf.gz
# echo ">>>> Indexing S4_04_RepresentSequences.fasta ..." >&2
# if [ -s S4_04_RepresentSequences.fasta.bwt ];then
# echo "Skip. file exists" >&2
# else
# bwa index S4_04_RepresentSequences.fasta
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> bwa index: non-zero exit." >&2
# exit 1
# fi
# echo "DONE." >&2
# fi
# echo ">>>> Get ALL UNANCHORED sequences ..." >&2
# mkdir -p S5_UNANCHORED_Seq
# # get UNANCHORED seq fun
# get_UNANCHORED_seq_fun(){
# unanchored_file=$1
# if [ ! -s $unanchored_file ];then
# echo "[ERROR]-->No file:$unanchored_file" >&2
# exit 1
# fi
# unanchored_id=$( cut -f 1 $unanchored_file )
# cur_pre=$( basename $unanchored_file )
# cur_pre=${cur_pre%%_05_*}
# cur_fa=$(grep -m 1 "/${cur_pre}.unaligned" $unaln_seq_list)
# if [ ! -s "$cur_fa" ];then
# echo "[ERROR] --> No file: $cur_fa" >&2
# exit 1
# fi
# seqkit grep -f <(echo "$unanchored_id") $cur_fa -o S5_UNANCHORED_Seq/${cur_pre}_unanchored.fa.gz
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> get unanchored fa: non-zero exit." >&2
# rm -f S5_UNANCHORED_Seq/${cur_pre}_unanchored.fa.gz
# exit 1
# fi
# # run bwa mem for each record
# if [ -s "S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq.paf.gz" ];then
# echo "Skip MEM. File exists: S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq.paf.gz" >&2
# else
# bwa mem -t 1 S4_04_RepresentSequences.fasta S5_UNANCHORED_Seq/${cur_pre}_unanchored.fa.gz 2>S5_UNANCHORED_Seq/${cur_pre}_MEM.log | paftools sam2paf - | pigz > S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq.paf.gz
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> get MEM PAF: non-zero exit." >&2
# rm -f S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq.paf.gz
# exit 1
# fi
# fi
# # filter out matched to anchored records and get only unanchored
# echo "Filter for ${cur_pre} with coverage:$coverage and identity:$identity" >&2
# zcat S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq.paf.gz |\
# perl -lane '$F[0]=~s/\:\.//g;print $F[0] if $F[10]/$F[1] >= $ENV{coverage} && $F[9]/$F[10] >= $ENV{identity}' |\
# perl -lane 'print if ++$h{$_}==1' >S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq_filtered.id
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> get filtered id : non-zero exit." >&2
# rm -f S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq_filtered.id
# exit 1
# fi
# # get filtered seq
# if [ -s "S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq_filtered.id" ];then
# seqkit grep -v -f S5_UNANCHORED_Seq/${cur_pre}_MEM_to_RepSeq_filtered.id S5_UNANCHORED_Seq/${cur_pre}_unanchored.fa.gz -o S5_UNANCHORED_Seq/${cur_pre}_unanchored_filtered_out_MEM_to_RepSeq.fa.gz
# else
# cp S5_UNANCHORED_Seq/${cur_pre}_unanchored.fa.gz S5_UNANCHORED_Seq/${cur_pre}_unanchored_filtered_out_MEM_to_RepSeq.fa.gz
# fi
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> get filtered seq: non-zero exit." >&2
# rm -f S5_UNANCHORED_Seq/${cur_pre}_unanchored_filtered_out_MEM_to_RepSeq.fa.gz
# exit 1
# fi
# }
# export -f get_UNANCHORED_seq_fun
# if [ -s "S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz" ];then
# echo "Skip. File exists: S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz" >&2
# else
# cat $UNANCHORED_list | parallel -j $threads -k --lb get_UNANCHORED_seq_fun
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> parallel get unanchored: non-zero exit." >&2
# exit 1
# fi
# echo ">>>> Merge each sample unanchored to one ..." >&2
# cat S5_UNANCHORED_Seq/*_unanchored_filtered_out_MEM_to_RepSeq.fa.gz > S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> merge unanchored fa: non-zero exit." >&2
# rm -f S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz
# exit 1
# fi
# echo "DONE. results in: S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz" >&2
# fi
# echo "
# ###################################################
# Mapping remained UNANCHORED seq to each other
# ###################################################
# "
# if [ -s "S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz.sa" ];then
# echo "Skip index.File exists" >&2
# else
# bwa index S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz
# fi
# if [ -s "S5_Filtered_UNANCHORED_MEM_to_self.paf.gz" ];then
# echo "Skip MEM. File exists" >&2
# else
# bwa mem -t $threads S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz S5_ALL_UNANCHORED_filtered_out_MEM_to_RepSeq_Seq.fa.gz | paftools sam2paf - | pigz -p $threads > S5_Filtered_UNANCHORED_MEM_to_self.paf.gz
# if [[ $? -ne 0 ]] ; then
# echo "[ERROR] --> mem to self: non-zero exit." >&2
# rm -f S5_Filtered_UNANCHORED_MEM_to_self.paf.gz
# exit 1
# fi
# echo "DONE. results in: S5_Filtered_UNANCHORED_MEM_to_self.paf.gz" >&2
# fi
echo "
###################################################
Filtering cluster results
###################################################
"
echo "
###################################################
Merge and get final NON-ref-seq
###################################################
"