-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcfixer.pl
2852 lines (2512 loc) · 110 KB
/
vcfixer.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
use strict ; use warnings;
# vcfixer # by M'Óscar
my $version = "vcfixer_1.5.pl";
my $edited = "30.VII.2020";
############################
# Use this script to clean VCF files from missing values
# It works as dataset_fixer.sh and vcf_fixer
# For options and other information check the help information with "help" or "h" or so...
# vcf_missing.pl --help
# Usage: https://github.com/M-Osky/VCFixer
###########################################################################################################################
############### Chagelog: ########################
############### ########################
############### ########################
############### Version 1.5 : 2020-07-30 ########################
############### Fixed some wrong messages when samples don't mach popmap ########################
############### Fixed sed, now it will input Quality Metadata in inputed missing ########################
############### Metadata is complex and for now no editable from command line, only in script ########################
############### Check inputed metadata below default values for parameters ########################
############### Trying to add a message when no missing are found when inputing ########################
############### ########################
############### New vcfixer, equivalent to vcf_fixer version 1.4: 2020-05-19 ########################
############### Duplicated it in order to fix bugs while vcf_fixer is running. ########################
############### As of now vcf_fixer is outdated by this beta version ########################
############### Major issue addressed: ########################
############### The inputed genotypes were just the genotype "#/#" not quality information ########################
############### Some softwares read the quality information missing and took it as missing ########################
############### Now it's adding quality information to the inputed alleles ########################
############### inputed quality is low, but should be enough to pass most software filters ########################
############### The metadata added is bellow default parameter values ########################
############### Also: trying to fix the reported populations when a popmap is parsed (389) ########################
############### BOTH THIS POINTS NEEDS WORK STILL ########################
############### ########################
############### Version 1.3 : 2020-03-12 ########################
############### Now checks if all samples from the file exist in popmap ########################
############### fixing wrong population and missing counts (for the summary) ########################
############### tried to fix some wrong population and missing counts that appeared ########################
############### in the log file. Debug 643; 649; 655; 1148, 1150; 1279; 1287; ########################
############### ########################
############### Version 1.2 : 2019-10-28 ########################
############### debugging, trying to improve log file ########################
############### ########################
############### Version 1.1 : 2019-10-13 ########################
############### added popmap option as an alternative of reading pops from sample codes ########################
############### improved performance and trying to make it faster (spoiler alet: didn't) ########################
############### ########################
############### Version 1.0 : 2019-09-30 ########################
############### Now when there is multiple alleles with shared maximum frequency ########################
############### it will pick randomly one of those as a mode. ########################
############### Deleted some resundant chuncks of code. Improved output log files ########################
############### it will print the date and time it started and finished running ########################
############### ########################
###########################################################################################################################
####################### PARAMETERS #########################
# All of them can and should be set from the command line, check the help information.
my $poplength = 2; #how many characters long is the population code at the beginning of the sample code
#my $poplength = 3; #how many characters long is the population code at the beginning of the sample code
my $emptyrate = 0.8; #ratio of missing from which samples will be considered empty
my $miss_samples = 0.3; #ratio of missing from which samples must be deleted
my $miss_loci = 0.3; #ratio of missing from which loci must be deleted
my $gralmiss = "pop"; #how should the program replace the missing values?
#my $gralmiss = "global"; #how should the program replace the missing values?
#my $gralmiss = "pop"; #how should the program replace the missing values?
#my $gralmiss = "miss"; #how should the program replace the missing values?
#my $gralmiss = "2/2"; #how should the program replace the missing values?
my $misspop = "global"; #how should the program replace when a SNP is missing in a entire population?
#my $misspop = "miss"; #how should the program replace when a SNP is missing in a entire population?
#my $misspop = "global"; #how should the program replace when a SNP is missing in a entire population?
#my $misspop = "2/2"; #how should the program replace when a SNP is missing in a entire population?
my $minpop = 8; #minnimum number of samples per population in order to keep the population.
#output file name
my $tail = "not defined"; #something to add at the end of the input name to generate the output name
#my $tail = "_something"; #something to add at the end of the input name to generate the output name
my $outname = "not defined"; #name or path for the output file
#my $outname = "podarcis"; #name or path for the output file
# This should be fixed for all VCF produced by Populationns (Stacks)
#my $inputname = "populations.snps.vcf"; # input file name, should be a string either alphanumeric or alphabetic.
my $inputname = "populations.snps.vcf"; # input file name, should be a string either alphanumeric or alphabetic.
my $infocols = 9; # check how many columns of information has the VCF file before the first sample column
#my $infocols = 9; # check how many columns of information has the VCF file before the first sample column
my $sumpath = "summary_table_vcf_fixer.txt"; #path to a file that will gather some of the details of ref_maps and vcf_fixer outputs
#my $sumpath = "summary_table.txt"; #path to a file that will gather some of the details of ref_maps and vcf_fixer outputs
#my $sumpath = "no"; #path to a file that will gather some of the details of ref_maps and vcf_fixer outputs
my $poplog = "ref_map.log"; #name of the log file from populations run
#my $poplog = "populations.log"; #name of the log file from populations run
#my $poplog = "ref_map.log"; #name of the log file from populations run
my $popmap = "No default"; # pop map to know the samples
#my $popmap = "none";
#my $popmap = "none";
my $quality = 0; # input metadata about genotype quality
#################################################################################################################
###### quality metadata to add to inputed genotypes
#my $homoref = "\t0\/0:1:1,0:12:-0.05,-0.82,-1.42";
my $homoref = "\t0/0:1:1,0:12:-0.05,-0.82,-1.42";
#my $hetegen = "\t0\/1:1:1,1:12:-1.01,-0.05,-1.01";
my $hetegen = "\t0/1:1:1,1:12:-1.01,-0.05,-1.01";
#my $homoalt = "\t1\/1:1:0,1:12:-1.42,-0.82,-0.05";
my $homoalt = "\t1/1:1:0,1:12:-1.42,-0.82,-0.05";
#################################################################################################################
my $rawname = $inputname;
$rawname =~ s/^(.*?)\.vcf/$1/;
#Help!
my %arguments = map { $_ => 1 } @ARGV;
if(exists($arguments{"help"}) || exists($arguments{"--help"}) || exists($arguments{"-help"}) || exists($arguments{"h"}) || exists($arguments{"-h"}) || exists($arguments{"--h"})) {
die "\n\n\t $version Help Information $edited\n\t--------------------------------------------------\n
This program will read a VCF file, first will delete empty or monomorphic SNP, then empty samples,
then will delete loci with high missing rate, then samples with high missing rate,
and finally will input genotypes in the missing values left \"./.\".
It is designed to work with the VCF file generated by the program \"populations\" (Stacks).
Populations can be identified from sample names or from a tab separated popmap (Stacks format: one population and one sample per row).
A log file, a list with deleted and kept SNPs, deleted and kept samples, and a new popmap will be saved saved in the output directory\n
\n\tCommand line arguments and defaults:\n
--input / --vcf name (or path) of the VCF file. Default: $inputname\n
--infocols number of locus information columns before the first sample column. Default: $infocols\n
--poplength [int] how many characters of the sample name belong to the population code? Default: $poplength
--popmap Alternatively provide a popmap to read which sample belongs to which population. $popmap\n
--empty [float] missing rate from which a sample will be considered \"empty\" and deleted. Default: $emptyrate\n
--miss_loci [float] missing rate from which a loci should be deleted. Default: $miss_loci\n
--miss_samples [float] missing rate from which a sample should be deleted. Default: $miss_loci\n
--minpop [int] minimum number of samples a population must have in order to keep it. Default: $minpop\n
--gral_miss How to replace the regular missing values? There are four options. Default: $gralmiss\n\t\t\t\t \"pop\" to replace it with the population mode* (most frequent genotype)
\t\t\t \"global\" to replace the missing with the whole dataset mode*.
\t\t\t \"miss\" to leave it as missing. \"2/2\" or any other value to input that value.\n
--pop_miss What to input if a SNP is missing in an entire population? Three options available. Default: $misspop\n\t\t\t\t \"global\" to input the global mode*, \"miss\" to keep them as missing,
\t\t\t \"2/2\", or \"5/5\", or any other value: to input a new genotype and remark its difference from the rest.\n
--noquality [flag] add this if you do not want $version to input quality/probability information in missing
Some software can not process genotypes without quality metadata, by default will be also inputed,
quality information is not editable from command line and should be handled carefully, values are:
$homoref\n\t $hetegen\n\t $homoalt\n
--summary path/name for a summary table that will gather some details of populations and vcf_fixer outputs.
if \'--summary no\' the file will not be created. Default: $sumpath\n
--poplog [optional] path or name of the populations (Stacks) log file. Default: $poplog
if \'--poplog no\' it will not look for a logfile. Only used for summary table.
If no path (only name) provided will look for file in vcf file location.\n
--out path or new name for the output file. By default will be \"input name\" + \"tail\" + \".vcf\"\n
--tail Something to the end of the input file name to generate the output name.
\t\t\t If no tail provided, will add to the file name the sample number, SNP number, and missing handling:
\t\t\t 1p: regular missing replaced with population mode
\t\t\t 1g: regular missing replaced with global mode
\t\t\t 1m: regular missing not handled - left as missing.
\t\t\t 1x: regular missing replaced with custome input
\t\t\t 0m: if not regular missing found in the file
\t\t\t 2g: when a SNP is missing in the whole population, the global mode is input
\t\t\t 2m: when missing in the whole population is left as missing
\t\t\t 2x: custome input
\t\t\t 0p: there are not SNPs entirely missing in any population\n\n
Command line call example:\n\tvcfixer.pl --input /usr/home/refmap/populations.snps.vcf --poplength 3 --gral_miss global --minpop 1 --summary no\n\n
* When two or more alleles have the highest frequency, one of them will be picked randomly for each SNP\n\n\n";
}
################ PASSING ARGUMENTS
my $popstring = "not_defined";
use Getopt::Long;
GetOptions( "input=s" => \$inputname, # --input
"vcf=s" => \$inputname, # --vcf
"infocols=s" => \$infocols, # --infocols
"empty=f" => \$emptyrate, # --empty
"miss_loci=f" => \$miss_loci, # --miss_loci
"miss_samples=f" => \$miss_samples, # --miss_samples
"tail=s" => \$tail, # --tail
"out=s" => \$outname, # --out
"summary=s" => \$sumpath, # --summary
"popmap=s" => \$popmap, # --popmap
"poplength=i" => \$poplength, # --poplength
"minpop=i" => \$minpop, # --minpop
"poplog=s" => \$poplog, # --poplog
"pop_miss=s" => \$misspop, # --pop_miss
"noquality" => \$quality, # --noquality
"gral_miss=s" => \$gralmiss ); # --gral_miss
my @populations = (); #needs to go
############### DIRECTORY PATH
use Cwd qw(cwd);
my $localdir = cwd;
my @directorypath = split('/' , $inputname);
my $pathlength = scalar @directorypath;
my $filename="nofilename";
my $subdirpath="nodirpath";
my $filepath="nofilepath";
if ($pathlength > 1) {
$filename = $directorypath[-1];
$filepath = $inputname;
pop (@directorypath);
$subdirpath = join ('/' , @directorypath);
}
elsif ($pathlength <= 1) {
$filename = $inputname;
$filepath = "$localdir" . "/" . "$inputname";
$subdirpath = $localdir;
}
#print "\nPopulations to extract: $populations[0], $populations[1], $populations[2]\n";
#same with poplog
my $logname="nofilename";
my $subdirlog="nodirpath";
my $filelog="nofilepath";
if ($poplog ne "no") {
my @logpath = split('/' , $poplog);
my $loglength = scalar @logpath;
if ($loglength > 1) {
$logname = $logpath[-1];
$filelog = $poplog;
pop (@logpath);
$subdirlog = join ('/' , @logpath);
}
elsif ($loglength <= 1) {
$logname = $poplog;
$subdirlog = $subdirpath;
$filelog = "$subdirlog" . "/" . "$logname";
}
}
print "\nReading $filename";
open my $VCFFILE, '<', $filepath or die "\nUnable to find or open $filepath: $!\n";
print "\n\nAnalizing loci information... \n";
##die "Log file: $logname, Log path: $subdirlog\nfull path log file: $filelog\n\n";
#use DateTime;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
$mon = $mon + 1;
$year = $year + 1900;
my $datetime = "$year-$mon-$mday $hour:$min:$sec";
#my $datetime = "2019-mm-dd hh:mm:ss"; # Seriously this is just temporary
my $mainparams = "--infocols => $infocols, --empty => $emptyrate, --miss_loci => $miss_loci, --miss_samples => $miss_samples\n--tail => $tail, --out => $outname, --summary => $sumpath, --poplength => $poplength\n--minpop => $minpop, --pop_miss => $misspop, --gral_miss => $gralmiss, --popmap => $popmap\n\n";
my @report=("\t$version log file", "\n", "input file name: $inputname", "Working directory: $localdir", "Executed on $datetime", " ", "Options:", "$mainparams");
my @introrows = (); #first first rows of metadata will be stored here
my @headers = (); #column headers will be stored here
my @locirows = (); #this will include all the non monomorphic loci information, one variable=one row
my $firstsample = $infocols; #first column with genotype information
my $infoloci = $infocols - 1; #last column with locus information
my $samplenum="none"; # this variable will store the number of columns with SNP data
my $sampletot="none"; # this variable will store the number of columns with sample tags
my $iniloci = 0;
my $goodloci = 0;
my $delete = 0;
my %uniquepops = ();
my $lastsample=0;
my @deletedloci = ("name\tScaffold\tposition\tID\tproblem");
my @locilist = ();
my @samplelist = ();
######## SAVE MOST COMMON GENOTYPE
while (<$VCFFILE>) {
chomp; #clean "end of line" symbols
next if /^$/; #skip if blank
next if /^\s*$/; #skip if only empty spaces
my $line = $_; #save line
my %better =();
$line =~ s/\s+$//; #clean white tails in lines
my @wholeline= split("\t", $line); #split columns as different elements of an array
my $numcolumn = scalar @wholeline;
$lastsample = $numcolumn -1;
#if ($iniloci < 20) { print "\n$wholeline[2]: "; }
if ($wholeline[0]=~ /^##.*?/) {
push (@introrows, $line); #save the first rows with metadata (they all start with "##")
#print "Saving metadata $wholeline[0]\n";
}
elsif ($wholeline[0]=~ /^#.*?/) {
@headers = @wholeline;
foreach my $samplecols ($firstsample..$lastsample) {
$sampletot = $numcolumn - $firstsample;
my $popcode = substr ($wholeline[$samplecols], 0, $poplength);
my $samplecode = $wholeline[$samplecols];
push(@samplelist, $samplecode);
$uniquepops{$popcode} = 1;
}
}
else {
$iniloci++;
my $missingrate = 0;
my @SNP = @wholeline[$firstsample..$lastsample]; #save the information about the loci
my @genotype = @SNP; #duplicate
my @lociid = split (":", $wholeline[2]);
my $scaffold = $wholeline[0];
$scaffold =~ s/^Scaffold([0-9]*?)$/$1/;
my $lociname = "$scaffold" . "_$lociid[0]";
$samplenum = scalar @SNP;
#print "\n$iniloci) sample tags: $sampletot -> genotypes at $lociname: $samplenum\n\n";
if ($samplenum != $sampletot) {die "ERROR in VCF file! Discrepancy with number of samples\n\n$sampletot sample tags found: @samplelist\n\n$samplenum genotypes found for locus $lociname\n@SNP\n\nProgram will exit now. You should check your input file $inputname.\n\n";}
my $last = $samplenum - 1;
foreach my $count (0..$last) {
my @splitted = split(':', $genotype[$count]); #save only the genotype
$genotype[$count] = $splitted[0];
}
#if ($iniloci < 20) {print "\n@genotype\n"; }
foreach my $rowinfo (@genotype) {
my $onegenotype = $rowinfo;
#if ($onegenotype eq "./.") { next; }
if (exists $better{$onegenotype}) { $better{$onegenotype} = $better{$onegenotype} +1; }
else { $better{$onegenotype} = 1; }
}
my $missingnum = 0;
if (exists $better{'./.'}) { $missingnum = $better{'./.'} ; }
$missingrate = $missingnum / $samplenum;
if (exists $better{'./.'}) { delete $better{'./.'} ; }
#my $variability = %better; # Till now this worked, but is now giving back a fraction
my $variability = scalar keys %better;
#print "\nvariability = $variability\n";
#die "\n\nDebugging\n\n";
if($variability==1) {
$delete++;
#if ($iniloci < 20) { print "\n$iniloci) $lociname variability $variability". ":\n@genotype\nMonomorphic!\n"; }
my $lociinfo = "$lociname\t$scaffold\t$wholeline[1]\t$wholeline[2]\tmonomorphic";
push(@deletedloci, $lociinfo);
}
elsif ($missingrate >= $emptyrate) {
$delete++;
#if ($iniloci < 20) { print "\n$iniloci) $lociname variability $variability". ":\n@genotype\nToo mutch missing ($missingrate)\n"; }
my $lociinfo = "$lociname\t$scaffold\t$wholeline[1]\t$wholeline[2]\tmissing rate $missingrate";
push(@deletedloci, $lociinfo);
}
else {
push (@locirows, $line);
#my $iniloci = "$lociname\t$scaffold\t$wholeline[1]\t$wholeline[2]";
#push(@locilist, $lociinfo);
$goodloci++;
}
#$iniloci++;
}
}
close $VCFFILE;
my $sampnum = $lastsample - $firstsample + 1;
my %popref = ();
if ($popmap ne "No default") {
%uniquepops = ();
open my $MAP, '<', $popmap or die "\nUnable to find or open $popmap: $!\n";
while (<$MAP>) {
my $line = $_;
my $row = $line;
$row =~ s/\s+$//; #clean white tails in lines
$row =~ s/^\s+//; #clean white spaces at the beginning
my @pair = split('\t', $row);
$popref{$pair[0]} = $pair[1];
}
foreach my $key (keys %popref) {
my $popcode = $popref{$key};
$uniquepops{$popcode} = 1;
}
}
my $popnum = scalar keys %uniquepops; #added "scalar keys" just in case
my @poplist = keys %uniquepops; #list of populations
if ($delete > 0) {print "Monomorphic or \"empty\" loci were deleted.\n$delete deleted from a total of $iniloci; $goodloci loci kept.\n" }
elsif ($delete == 0) { print "No monomorphic or empty loci found. "; }
if ($goodloci == 0) { die "\n\nWARNING: There are no loci left!\nCheck that the dataset and program options are corect.\nCheck the program help information if needed: $version -h\n\n"; }
print "Now looking for \"empty\" samples... \n";
my $popindex = $popnum - 1;
#my @mostfreq = values %refgenotyped;
#print "\nGlobal averages: @mostfreq\n";
my $q=0;
#my $popmoda = "undefined";
my $fixedline = "undefined";
my @oldloci = @locirows;
my $snpnumber = scalar @locirows;
my $lastindex = $snpnumber -1;
my $pophandler = "nd";
my $gralhandler = "nd";
my @firstcols = ();
my @firstheaders = @headers[0..$infoloci];
#empty samples
#get info per sample
my $rownow = 1;
my %persampleind = ();
foreach my $row (@locirows) {
#print "\n\n$row\n";
my @wholeline= split("\t", $row);
my @keepinfo = @wholeline[0..$infoloci];
my $infoline = join("\t", @keepinfo);
push (@firstcols, $infoline);
#save the information of each sample in a different position of a hash
#this is equivalent to translocate
foreach my $samp ($firstsample..$lastsample) {
if ($rownow == 1) {
my $samplename = $headers[$samp];
$persampleind{$samp} = "$samplename\t$wholeline[$samp]";
#print "\n $samp) $persampleind{$samp}\n";
}
elsif ($rownow <= $snpnumber) { $persampleind{$samp} = "$persampleind{$samp}\t$wholeline[$samp]"; }
}
$rownow++;
}
# now check the missingrate
my @deletedsamples = ();
my %goodsamples1 =();
my @samplecols1 = ();
my @samplenames = ();
#print "\none full: $persampleind{10}\n\n";
foreach my $samp ($firstsample..$lastsample) {
my $locinum = 0;
my $sampleline = $persampleind{$samp};
my @wholeline= split("\t", $sampleline);
my $sampleid = $wholeline[0];
shift @wholeline;
#print "ALL: @wholeline\n";
my %eachsamp = ();
$eachsamp{'./.'} = 0;
foreach my $genot (@wholeline) {
#print "$genot ";
my @splitted = split(':', $genot); #save only the genotype
my $onlygen = $splitted[0];
if (exists $eachsamp{$onlygen}) { $eachsamp{$onlygen} = $eachsamp{$onlygen} +1; }
else { $eachsamp{$onlygen} = 1; }
$locinum++;
}
#print "\n\n";
my $misscount = $eachsamp{'./.'};
my $missrate = $misscount / $locinum;
my $message = "$sampleid\tmissing rate $missrate";
if ($missrate >= $emptyrate) { push (@deletedsamples, $message); }
elsif ($missrate < $emptyrate) {
$goodsamples1{$samp} = $sampleline;
push (@samplecols1, $samp); #keep the unique sample col index
push (@samplenames, $sampleid);
}
}
@headers = (@firstheaders, @samplenames); #keep headers only for the samples that we have now
my $emptycount = scalar @deletedsamples;
if ($emptycount > 0) { print "\n$emptycount samples were considered \"empty\" and deleted (missing rate above or equal to $emptyrate).\n"; }
elsif ($emptycount == 0) { print "No samples deleted! " }
print "Now checking missing rate per loci... \n";
#untranspose (sort by locus)
my @goodloci1 = (); # all loci rows
my $indxmax = $snpnumber - 1;
foreach my $snp (0..$indxmax) {
# get ready the loci info from each loci
my $locusinf = $firstcols[$snp];
#loop through info by sample
foreach my $smp (@samplecols1) {
my $sampleline = $goodsamples1{$smp}; #get one sample line each time
my @wholeline = split("\t", $sampleline);
shift @wholeline; #delete the sample name
$locusinf = "$locusinf\t$wholeline[$snp]"; #add to the list of sample genotypes for the given locus
}
push (@goodloci1, $locusinf); #save all the info from the loci
}
#now check missing rate per loci and delete if needed
my $locinum1 = scalar @goodloci1;
my @goodloci2 =();
my $toomiss=0;
my $maxind = 0;
my @savedlist = ();
foreach my $lociline (@goodloci1) {
my @wholeline= split("\t", $lociline);
# get loci name
#print "\n\n$lociline\n\n";
my @lociid = split (":", $wholeline[2]);
my $scaffold = $wholeline[0];
$scaffold =~ s/^Scaffold([0-9]*?)$/$1/;
my $lociname = "$scaffold" . "_$lociid[0]";
# get number of loci
my $maxcol = scalar @wholeline;
$maxind = $maxcol - 1;
#get only sample info
my @genotypes = @wholeline[$firstsample..$maxind];
my $samplecount = scalar @genotypes;
my %eachlocus = ();
$eachlocus{'./.'} = 0;
foreach my $genot (@genotypes) {
my @splitted = split(':', $genot); #save only the genotype
my $onlygen = $splitted[0];
if (exists $eachlocus{$onlygen}) { $eachlocus{$onlygen} = $eachlocus{$onlygen} +1; }
else { $eachlocus{$onlygen} = 1; }
}
my $misscount = $eachlocus{'./.'};
my $missrate = $misscount / $samplecount;
if ($missrate >= $miss_loci) {
my $lociinfo = "$lociname\t$scaffold\t$wholeline[1]\t$wholeline[2]\tmissing rate $missrate";
push(@deletedloci, $lociinfo);
$toomiss++;
}
elsif ($missrate < $miss_loci) {
push (@goodloci2, $lociline);
push (@savedlist, $lociname);
}
}
my $locinum2 = scalar @goodloci2;
if ($toomiss > 0) { print "$toomiss loci deleted with missing rate above $miss_loci\n"; }
elsif ($toomiss == 0) { print "No loci deleted! " }
#print "\n@savedlist\n";
if ($locinum2 ==0) {die "\n\nNO LOCI LEFT!!\n$version is done (and quite sad)\nMay be try different settings\n\n"; }
print "\nChecking missing rate per sample... \n";
#missing val per sample
#get info per sample
#my $newlastsample = $
@firstcols=();
$rownow = 1;
%persampleind = ();
foreach my $row (@goodloci2) {
my @wholeline= split("\t", $row);
#print "\n\n$row\n\n";
my @keepinfo = @wholeline[0..$infoloci];
my $infoline = join("\t", @keepinfo);
#print "\n\n>> $infoline\n\n";
push (@firstcols, $infoline);
#save the information of each sample in a different position of a hash
#this is equivalent to translocate
foreach my $samp ($firstsample..$maxind) {
if ($rownow == 1) {
my $samplename = $headers[$samp];
$persampleind{$samp} = "$samplename\t$wholeline[$samp]";
#print "\n $samp) $persampleind{$samp}\n";
}
elsif ($rownow <= $locinum2) { $persampleind{$samp} = "$persampleind{$samp}\t$wholeline[$samp]"; }
}
$rownow++;
}
#print "\n\n$persampleind{10}\n\n";
#foreach (@firstcols) {
# print "\n\n > $_\n\n";
#}
#check missing
my %goodsamples2 =();
my @samplecols2 = ();
@samplenames =();
my $delsam=0;
foreach my $samp ($firstsample..$maxind) {
my $locinum = 0;
my $sampleline = $persampleind{$samp};
my @wholeline= split("\t", $sampleline);
my $sampleid = $wholeline[0];
shift @wholeline;
my %eachsamp = ();
$eachsamp{'./.'} = 0;
foreach my $genot (@wholeline) {
my @splitted = split(':', $genot); #save only the genotype
my $onlygen = $splitted[0];
if (exists $eachsamp{$onlygen}) { $eachsamp{$onlygen} = $eachsamp{$onlygen} +1; }
else { $eachsamp{$onlygen} = 1; }
$locinum++;
}
my $misscount = $eachsamp{'./.'};
my $missrate = $misscount / $locinum;
if ($missrate >= $miss_samples) {
my $message = "$sampleid\tmissing rate $missrate";
push (@deletedsamples,$message);
$delsam++;
}
elsif ($missrate < $emptyrate) {
$goodsamples2{$samp} = $sampleline;
#print "\n\n$sampleline\n\n";
push (@samplecols2, $samp); #keep the unique sample col index
push (@samplenames, $sampleid);
}
}
if ($delsam > 0) { print "$delsam sample(s) deleted with missing rate above $miss_samples.\n"; }
elsif ($delsam == 0) { print "No samples deleted! " }
print "Now checking populations... \n";
#trim populations
my @delepops = ();
my @keepops = ();
my %popcount = ();
my @deletelist = ();
my $checksamp = scalar @samplecols2;
if ($checksamp == 0) { die "\n\nNO SAMPLES LEFT!!\n$version is done (and quite sad)\nMay be try different settings\n\n"; }
if ($popmap eq "No default") {
foreach my $samp (@samplecols2) {
my $sampline = $goodsamples2{$samp};
my $poppart = substr ($sampline, 0, $poplength);
if (exists $popcount{$poppart}) { $popcount{$poppart} = $popcount{$poppart} + 1; }
else { $popcount{$poppart} = 1 }
#print "$popcount{$poppart}\n";
}
}
else {
foreach my $samp (@samplecols2) {
my $sampline = $goodsamples2{$samp};
my @letsdothisagain = split ("\t", $sampline);
if (exists $popref{$letsdothisagain[0]}) {
my $gotpop = $popref{$letsdothisagain[0]};
#print "sample $letsdothisagain[0] -> pop $gotpop.\n";
if (exists $popcount{$gotpop}) { $popcount{$gotpop} = $popcount{$gotpop} + 1; }
else { $popcount{$gotpop} = 1; }
}
else { push (@report, "WARNING!: Sample $letsdothisagain[0] does not exist in popmap and will not be analysed\n"); }
}
}
my $test = 0;
push (@report, "$popnum populations found: @poplist");
foreach my $popul (@poplist) {
if (exists $popcount{$popul}) {
if ($popcount{$popul} < $minpop) { push (@delepops, $popul); }
elsif ($popcount{$popul} >= $minpop) { push(@keepops, $popul); }
}
else { push (@report, "Population $popul from popmap not found in the vcf file"); $test++; }
}
#if ($test > 0) { push (@report, ""); }
push (@report, "");
$test = 0;
my $smallpops = scalar @delepops;
my @samplecols3 = ();
@samplenames = ();
if ($smallpops > 0) {
foreach my $shitpop (@delepops) {
foreach my $samp (@samplecols2) {
my $sampline = $goodsamples2{$samp};
my @wholeline = split("\t", $sampline);
my $sampleid = $wholeline[0];
my $poppart = substr ($sampline, 0, $poplength);
if ($popmap ne "No default") { $poppart = $popref{$sampleid}; }
my $message = "$sampleid\tbelongs to $poppart with less than $minpop samples per population";
if ($poppart eq $shitpop) { push(@deletedsamples, $message); }
else {
push (@samplecols3, $sampline);
push (@samplenames, $sampleid);
#print "\n\n >> $sampline\n\n";
}
}
}
if ($smallpops == 1) { print "Population @delepops had less than $minpop samples and was deleted.\n" }
elsif ($smallpops > 1) { print "Populations @delepops, had less than $minpop samples and were deleted.\n" }
}
elsif ($smallpops == 0) {
print "All populations have enough samples.\n";
foreach my $samp (@samplecols2) {
my $sampline = $goodsamples2{$samp};
my @wholeline = split("\t", $sampline);
my $sampleid = $wholeline[0];
push (@samplecols3, $sampline);
push (@samplenames, $sampleid);
}
}
@headers = (@firstheaders, @samplenames); #keep headers only for the samples that we have now
my $checkpops = scalar @keepops;
if ($checkpops == 0) { die "\n\nNO POPULATIONS LEFT!!\n$version is done (and quite sad)\nMay be try different settings\n\n"; }
print "\nAnalizing and inputing values on remaining missing... \n";
#translocate back to per-loci dataset
my @goodloci3 = (); # all loci rows
$indxmax = $locinum2 - 1;
my $finalsamp = scalar @samplecols3;
my $lastcol = $finalsamp - 1;
foreach my $snp (0..$indxmax) {
# get ready the loci info from each loci
my $locusinf = $firstcols[$snp];
#print "\n\n$locusinf\n\n";
#loop through info by sample
foreach my $smp (0..$lastcol) {
my $sampleline = $samplecols3[$smp]; #get one sample line each time
#print "\n\n > $sampleline\n\n";
my @wholeline = split("\t", $sampleline);
shift @wholeline; #delete the sample name
$locusinf = "$locusinf\t$wholeline[$snp]"; #add to the list of sample genotypes for the given locus
}
push (@goodloci3, $locusinf); #save all the info from the loci
#print "\n\n$locusinf\n\n";
}
#####
#rename everything to their old names
my $locileft = scalar @goodloci3;
$lastindex = $locileft - 1;
my @checkline = split("\t", $goodloci3[0]);
my $colnum = scalar @checkline;
$lastsample = $colnum - 1;
@poplist = @keepops;
my $popleft = scalar @keepops;
$popindex = $popleft - 1;
@locirows = @goodloci3;
my $snpcount = 0;
##########################################
########### input missing ###############
##########################################
$goodloci = 0;
my %refgenotyped = ();
my $warns = 0;
# replace missing
my $gralcount = 0;
my $popwisecount = 0;
if ($gralmiss eq "pop" && $misspop eq "global") { # pop - glob
$gralhandler = "_1p";
#compute global mode per SNP
foreach my $linesnp (@locirows) {
#my %freqgenot = ();
my %freqgenot = ('./.' => 0, '0/0' => 0, '0/1' => 0, '1/1' => 0);
my $bienmoda = "none";
my @wholeline = split ("\t", $linesnp);
#print "\n\n$linesnp\n\n";
my @SNP = @wholeline[$firstsample..$lastsample]; #save the information about the loci
my @genotype = @SNP;
my @lociid = split (":", $wholeline[2]);
my $scaffold = $wholeline[0];
$scaffold =~ s/^Scaffold([0-9]*?)$/$1/;
my $lociname = "$scaffold" . "_$lociid[0]";
$samplenum = scalar @SNP;
my $last = $samplenum - 1;
foreach my $count (0..$last) {
my @splitted = split(':', $genotype[$count]); #save only the genotype
$genotype[$count] = $splitted[0];
}
foreach my $onegenotype (@genotype) {
$freqgenot{$onegenotype} = $freqgenot{$onegenotype} +1;
#if (exists $freqgenot{$onegenotype}) { $freqgenot{$onegenotype} = $freqgenot{$onegenotype} +1; }
#else { $freqgenot{$onegenotype} = 1; }
}
delete $freqgenot{'./.'};
my $refal = $freqgenot{'0/0'};
my $hetal = $freqgenot{'0/1'};
my $altal = $freqgenot{'1/1'};
if ($refal == $hetal && $refal == $altal && $hetal == $altal) {
my @highest = ('0/0', '0/1', '1/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$bienmoda = $highest[$choosen];
}
if ($refal < $hetal && $refal < $altal && $hetal == $altal) {
my @highest = ('0/1', '1/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$bienmoda = $highest[$choosen];
}
if ($refal == $hetal && $refal > $altal && $hetal > $altal) {
my @highest = ('0/0', '0/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$bienmoda = $highest[$choosen];
}
if ($refal > $hetal && $refal == $altal && $hetal > $altal) {
my @highest = ('0/0', '1/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$bienmoda = $highest[$choosen];
}
else { for my $bt (sort {$freqgenot{$a} <=> $freqgenot{$b} || $a cmp $b } (keys %freqgenot) ) { $bienmoda = $bt; } }
$refgenotyped{$goodloci} = "$bienmoda";
$goodloci++;
}
#now check the missing population-wise
#gets samples per population
foreach my $pop (0..$popindex) { #each population
$test = 0;
my $moretest = 0;
my $numgral = 0;
my $reportmisspop = 0;
my $popwise = 0;
my $modanomiss = "undefined";
my $population = $poplist[$pop];
my @sampleindex = ();
my $printpop = "no need to input anything";
my $shitlocus = 0;
foreach my $samp ($firstsample..$lastsample) { #each column with samples
my $poppart = "atlantis";
if ($popmap eq "No default") { $poppart = substr ($headers[$samp], 0, $poplength); } #extract the population code from the sample name
elsif ($popmap ne "No default") { $poppart = $popref{$headers[$samp]}; } #use popmap
if ($population eq $poppart) { push (@sampleindex, $samp); } #save the positions of the samples that belong to the same population
}
#if ($snpcount < 7) { print "@sampleindex\n"; }
my $numindex = scalar @sampleindex;
my $indexofindex = $numindex - 1;
# save only alleles from all the chunck of loci data
foreach my $fila (0..$lastindex) {
#my %popgenot=();
my %popgenot=('./.' => 0, '0/0' => 0, '0/1' => 0, '1/1' => 0);
my @allgenot = ();
my $lineagain = $locirows[$fila]; # each row
my @wholerow = split("\t", $lineagain); # split by column
#print "\n\n@wholerow\n@sampleindex ($population)\n\n";
my $num=0;
foreach my $id (0..$indexofindex) {
my $sampleposition = $sampleindex[$id]; #each sample from that line (SNP) from that population
my @holdallele = split (':', $wholerow[$sampleposition]); #split the alleles from the rest of the info
$allgenot[$num] = $holdallele[0]; #save the allele information
#if ($snpcount < 7) { print "$allgenot[$num] "; }
$num++;
}
# save the genotypes and sort them by frequency of appearance
my $numgenot = scalar @allgenot;
my $genotindex = $numgenot - 1;
#if ($snpcount < 7) { print "\n@allgenot\n"; }
foreach my $genind (0..$genotindex) {
my $onegenotype = $allgenot[$genind];
#if ($snpcount < 7) { print "$onegenotype "; }
#if (exists $popgenot{$onegenotype}) { $popgenot{$onegenotype} = $popgenot{$onegenotype} +1; }
$popgenot{$onegenotype} = $popgenot{$onegenotype} +1;
#else { $popgenot{$onegenotype} = 1; }
#sort the hash acording to the values, times that each genotype appears
#for $q (sort {$popgenot{$a} <=> $popgenot{$b} || $a cmp $b } (keys %popgenot) ) { $popmoda = $q; }
}
#sort the hash acording to the values, times that each genotype appears
#my $divers = %popgenot;
my $missnum = $popgenot{'./.'};
delete $popgenot{'./.'};
my $refal = $popgenot{'0/0'};
my $hetal = $popgenot{'0/1'};
my $altal = $popgenot{'1/1'};
if ($refal == $hetal && $refal == $altal && $hetal == $altal) {
my @highest = ('0/0', '0/1', '1/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$modanomiss = $highest[$choosen];
}
if ($refal < $hetal && $refal < $altal && $hetal == $altal) {
my @highest = ('0/1', '1/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$modanomiss = $highest[$choosen];
}
if ($refal == $hetal && $refal > $altal && $hetal > $altal) {
my @highest = ('0/0', '0/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$modanomiss = $highest[$choosen];
}
if ($refal > $hetal && $refal == $altal && $hetal > $altal) {
my @highest = ('0/0', '1/1');
my $sameal = scalar @highest;
my $choosen = int(rand($sameal));
$modanomiss = $highest[$choosen];
}
else { for my $nm (sort {$popgenot{$a} <=> $popgenot{$b} || $a cmp $b } (keys %popgenot) ) { $modanomiss = $nm; } }
my $highestal = $popgenot{$modanomiss};
#Replace the missing
if($missnum >= $numgenot) {
# Replace missing values when missing in an entire population.
$pophandler = "2g";
my $inputmode = $refgenotyped{$snpcount};
foreach my $id2 (0..$indexofindex) {
my $sampleposition = $sampleindex[$id2];
#my $checkrow = $wholerow[$sampleposition];
#my @matches = ($checkrow =~ /\.\/\./g);
#my $getmatch = @matches;
#$gralcount = $gralcount + $getmatch;
$wholerow[$sampleposition]=~ s/\.\/\./$inputmode/g;
}
$popwisecount++;
$popwise++;
$printpop = "global mode was input";
$shitlocus = "$wholerow[0]" . ", $wholerow[1] ($wholerow[2])";
if ($test == 0) { push (@report, "\nThere are SNPs missing in $population ($printpop)"); }
push (@report, "$shitlocus missing in all samples");
$test++;
}
elsif ($missnum >= $highestal) {
#$pophandler = "2g";
foreach my $id2 (0..$indexofindex) {
my $sampleposition = $sampleindex[$id2];
my $checkrow = $wholerow[$sampleposition];
my @matches = ($checkrow =~ /\.\/\./g);
my $getmatch = @matches;
$gralcount = $gralcount + $getmatch;
$reportmisspop = $reportmisspop + $getmatch;
$wholerow[$sampleposition]=~ s/\.\/\./$modanomiss/g;
}
if ($warns == 0) {
print "One or more SNPs have the same or higher frequency of missing (\"./.\") than the most frequent genotype. Check Log file.\n";
$warns++;