-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAFA_main_functions
2008 lines (1650 loc) · 126 KB
/
AFA_main_functions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#AF_main_functions.R
###############################################
#source functions:
############CURATE APC##################
curate_apc <- function(APC_raw, cross_GBIF = FALSE, cross_WFO = FALSE) {
#select only the columns I want to work with
APC_raw <- subset(APC_raw, select = c('canonicalName', 'scientificName', 'acceptedNameUsage', 'scientificNameAuthorship', 'taxonRank', 'taxonomicStatus', 'taxonDistribution', 'taxonID'))
#change blanks for NA just in case
APC_raw <- replace(APC_raw, APC_raw=='', NA)
#change APC names for the ones I want
names(APC_raw)[1] = "APC_all_canonical"
names(APC_raw)[2] = "APC_all_scientific"
names(APC_raw)[3] = "APC_accepted"
names(APC_raw)[4] = "authorship"
names(APC_raw)[5] = "rank"
names(APC_raw)[6] = "taxa_status"
names(APC_raw)[8] = "taxaID"
#Subset to get rid of supra-orders that I don't want to include
APC_final_no_suprarank <- subset(APC_raw, rank!="Regnum")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Division")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Classis")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Subclassis")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Ordo")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Subordo")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Series")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Sectio")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Subsectio")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Subseries")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Familia")
APC_final_no_suprarank <- subset(APC_final_no_suprarank, rank!="Genus")
names(APC_final_no_suprarank)
#reorder so NA in distribution are last and can be deleted later when looking for duplicates
APC_reordered <- APC_final_no_suprarank[order(APC_final_no_suprarank[,c(7)],na.last=T),]
#remove duplicates of accepted name usage that have NA in distribution
APC_to_split <- APC_reordered[!duplicated(APC_reordered[,1]),]
#reorder again by accepted name
APC_to_split_reordered <- APC_to_split[order(APC_to_split[,c(1)]),]
#remove records with NA in taxon distribution
APC_to_rename <- APC_to_split_reordered[!is.na(APC_to_split_reordered$taxonDistribution), ]
#change names to what I want
names(APC_to_rename)
names(APC_to_rename)[3] = "APC_name"
names(APC_to_rename)[1] = "APC_canonical_name"
names(APC_to_rename)[4] = "Author"
#####SPLIT DISTRIBUTION INFORMATION BY STATE AND MAIN TERRITORY ON APC#####
#remove column I don't need anymore
APC_split_distribution <- select(APC_to_rename, -APC_all_scientific)
#dummy data frame with rows as species and a column for distribution listing states (commas separated) with naturalised status in parenthesis where not native
DFX <- APC_split_distribution
#list all states included
states <- c("SA", "WA", "NSW", "Qld", "ACT", "NT", "Vic", "Tas", "AR", "CoI", "CaI", "ChI", "LHI", "NI", "HI", "CSI", "MDI")
#create empty columns to receive codes for each state
DFX[ ,states] <- NA
#iterate by species record
for(i in 1:nrow(DFX)) {
#split the distribution character string up by state
vec <- strsplit(as.character(DFX$taxonDistribution)[i], ",")[[1]]
#iterate by state
for(j in states) {
#locate current state
record <- vec[grep(j, vec)]
#assign as native or extract introduced code from parentheses
if(length(record) != 0) {
if(length(grep("(", record, fixed=T)) == 0) {
status <- "native"
} else {
status <- gsub("[\\(\\)]", "", regmatches(record, gregexpr("\\(.*?\\)", record))[1])
}
#change codes to preferred ones if needed
if(status == "native and naturalised") {status <- "native colonising"}
if(status == "native and naturalised and uncertain origin") {status <- "native colonising"}
if(status == "native and doubtfully naturalised") {status <- "native potentially colonising"}
if(status == "native and uncertain origin") {status <- "native potentially colonising"}
#paste codes back to relevant state column
DFX[i,j] <- status
} # close if
} # close for j
} #close for i
#name dataset APC_raw_master_dataset as it is the main dataset in which everything else is going to be based
APC_raw_master_dataset <- DFX
#remove taxonDistribution column as this info has already been integrated in the way we want
APC_raw_master_dataset <- select(APC_raw_master_dataset, - taxonDistribution)
#For simplicity, call it APC as a short to facilitate the script hereafter
APC <- APC_raw_master_dataset
if(cross_GBIF){
APC <- cross_GBIF(dataset = APC)
}
if(cross_WFO){
APC <- cross_WFO(dataset = APC)
}
return(APC)
} #end curate_apc function
##################################################################################
###########CURATE STATE AND MAIN TERRITORY PLANT CENSUSES###########
#########################################
####AUSTRALIAN CAPITAL TERRITORY########
curate_act <- function(ACT, cross_GBIF = FALSE, cross_WFO = FALSE) {
#create new columns for territory (i.e. state for simplicity) canonical name and scientific name using the functions created
ACT$State_canonical_name <- apply(ACT[, c("GENUS", "SPECIES", "INFRASPECIES")], 1, canonical_name_for_ACT)
#Change × by x to make all datasets homogenised
ACT$State_canonical_name <- gsub('×', "x ", ACT$State_canonical_name, fixed=TRUE)
ACT$State_canonical_name <- gsub(' ', " ", ACT$State_canonical_name, fixed=TRUE)
#remove brackets and symbols from ORIGIN column
ACT$ORIGIN <- gsub('[', "", ACT$ORIGIN, fixed=TRUE)
ACT$ORIGIN <- gsub(']', "", ACT$ORIGIN, fixed=TRUE)
ACT$ORIGIN <- gsub('/', "", ACT$ORIGIN, fixed=TRUE)
#assign native when NA in ORIGIN column
ACT$ORIGIN[is.na(ACT$ORIGIN)] <- "native"
names(ACT)
#substitute codes for NATURALISED.STATUS with the ones proposed in AusAFW
ACT$NATURALISED.STATUS <- str_replace(ACT$NATURALISED.STATUS, "Doubtfully", "doubtfully naturalised")
ACT$NATURALISED.STATUS <- str_replace(ACT$NATURALISED.STATUS, "Formerly", "formerly naturalised")
#create a column for alien status by combining the columns ORIGIN and NATURALISED.STATUS
ACT$State_Alien_Status <- ifelse(is.na(ACT$NATURALISED.STATUS), ACT$ORIGIN, ACT$NATURALISED.STATUS)
#substitute codes for introduction with the ones proposed in AusAFW
ACT$State_Alien_Status <- str_replace(ACT$State_Alien_Status, "Exotic Aust", "introduced")
ACT$State_Alien_Status <- str_replace(ACT$State_Alien_Status, "Exotic EA", "introduced")
ACT$State_Alien_Status <- str_replace(ACT$State_Alien_Status, "Indigenousintroduced", "native colonising")
ACT$State_Alien_Status <- str_replace(ACT$State_Alien_Status, "Uncertain", "uncertain origin")
#Select the two variables of interest. For the ACT it was not possible to create State_name because authorship is not provided in the census
ACT <- subset(ACT, select = c('State_canonical_name', 'State_Alien_Status'))
#order by species name
ACT <- ACT[order(ACT$State_canonical_name), ]
#remove records for which species name is NA
ACT <- subset(ACT, !(is.na(State_canonical_name)))
#remove duplicates based on state name
ACT <- ACT[!duplicated(ACT$State_canonical_name),]
#ACT plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
ACT <- cross_GBIF(dataset = ACT)
}
if(cross_WFO){
ACT <- cross_WFO(dataset = ACT)
}
return(ACT)
} #end curate_act function
################################
###NEW SOUTH WALES########
curate_nsw <- function(NSW, cross_GBIF = FALSE, cross_WFO = FALSE) {
#Create a column for state name as mixing scientific name with current accepted names in the column accepted.name
NSW$State_name <- ifelse(is.na(NSW$accepted.name), NSW$scientific.name, NSW$accepted.name)
#Canonical name is complicated, so we create it in steps. We do trials of names for different taxonomic levels
#Species level:
#paste and collapse the genus and species from state name
NSW$trialspp <- vapply(strsplit(NSW$State_name, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#replace NA NA by blank
NSW$trialspp <- str_replace_all(NSW$trialspp, "NA NA", "")
#replace blank by NA
NSW$trialspp[NSW$trialspp==""] <- NA
#phrase species
#identify pattern and extract what is after
NSW$trialphrasesp <- str_extract(NSW$State_name, "[ ]sp[.]..*")
#remove symbols
NSW$trialphrasesp <- gsub("\\s*\\([^\\)]+\\)","",as.character(NSW$trialphrasesp))
#paste and collapse the second and third, to remove both, the space at the beginning and the author at the end
NSW$trialphrasesp <- vapply(strsplit(NSW$trialphrasesp, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
NSW$trialphrasesp <- str_replace_all(NSW$trialphrasesp, "NA NA", "")
#replace blank by NA
NSW$trialphrasesp[NSW$trialphrasesp==""] <- NA
#Hybrids:
#identify pattern and extract what is after
NSW$trialhybrid <- str_extract(NSW$State_name, "[ ]x[ ]..*")
#paste and collapse the second and third, to remove both, the space at the beginning and the author at the end
NSW$trialhybrid2 <- vapply(strsplit(NSW$trialhybrid, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
NSW$trialhybrid2 <- str_replace_all(NSW$trialhybrid2, "NA NA", "")
#replace blank by NA
NSW$trialhybrid2[NSW$trialhybrid2==""] <- NA
#Infraspecies level:
#Subsp.:
#First, extract the records that have "subsp" in their names
NSW$trialsubsp <- str_extract(NSW$State_name, "subsp\\..*")
#paste and collapse the first two words (i.e. subsp. + subsp. epithet)
NSW$subsp <- vapply(strsplit(NSW$trialsubsp, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#replace NA NA by blank
NSW$subsp <- str_replace_all(NSW$subsp, "NA NA", "")
#replace blank by NA
NSW$subsp[NSW$subsp==""] <- NA
#Var.:
#First, extract the records that have "var" in their names
NSW$trialvar <- str_extract(NSW$State_name, "var\\..*")
#paste and collapse the first two words (i.e. var. + var. epithet)
NSW$var <- vapply(strsplit(NSW$trialvar, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#replace NA NA by blank
NSW$var <- str_replace_all(NSW$var, "NA NA", "")
#replace blank by NA
NSW$var[NSW$var==""] <- NA
#Forma:
#First, extract the records that have space followed by f in their names
NSW$trialforma <- str_extract(NSW$State_name, "[ ]f\\..*")
#paste and collapse the second and third word, to remove both, the space at the beginning and the author at the end
NSW$forma <- vapply(strsplit(NSW$trialforma, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
NSW$forma <- str_replace_all(NSW$forma, "NA NA", "")
#replace blank by NA
NSW$forma[NSW$forma==""] <- NA
#Extract only genus from State name once accepted names have been merged
NSW$genus_new <- vapply(strsplit(NSW$State_name, " "), \(x) paste0(x[1]), character(1L))
#column with phrase species names
NSW$phrase <- apply(NSW[, c("genus_new", "trialphrasesp")], 1, phrasespNSW)
#column for infraspecies ranks and epithets
# your new merged column. Start with subsp.
NSW$infrasp = NSW$subsp
# merge with var
NSW$infrasp[!is.na(NSW$var)] = NSW$var[!is.na(NSW$var)]
# merge with forma
NSW$infrasp[!is.na(NSW$forma)] = NSW$forma[!is.na(NSW$forma)]
#create a column as a step 1 for canonical name applying the previous function
NSW$trial_canonical_name <- apply(NSW[, c("trialspp", "infrasp")], 1, canonical_name_NSW1)
#create a column as a step 2 for canonical name applying the previous function
NSW$trial_canonical_name2 <- apply(NSW[, c("trial_canonical_name", "trialhybrid2")], 1, canonical_name_NSW2)
#column for State_canonical_name
# your new merged column. Start with trial_canonical_name2
NSW$State_canonical_name = NSW$trial_canonical_name2 # your new merged column. Start with x
#merge with phrase species
NSW$State_canonical_name[!is.na(NSW$phrase)] = NSW$phrase[!is.na(NSW$phrase)] # merge with y
#replace duplicated xx with just one (consequence of the process of split the names)
NSW$State_canonical_name <- str_replace_all(NSW$State_canonical_name, "xx", "x")
#substitute codes for introduction with the ones proposed in AusAFW
NSW$natural.introduced <- gsub('N/I', "native colonising", NSW$natural.introduced, fixed=TRUE)
rep_strNSW = c('N'='native','I'='introduced')
#create a column for alien status
NSW$State_Alien_Status <- str_replace_all(NSW$natural.introduced, rep_strNSW)
#reorder so NA in alien status are last and can be deleted later when looking for duplicates
NSW <- NSW[order(NSW$State_Alien_Status),]
#remove duplicates of state_name that have NA alien status
NSW <- NSW[!duplicated(NSW$State_name),]
#select columns of interest
NSW <- subset(NSW, select = c('State_name','State_canonical_name', 'State_Alien_Status'))
#order by state name
NSW <- NSW[order(NSW$State_name),]
#remove records for which state name is NA
NSW <- subset(NSW, !(is.na(State_name)))
#NSW plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
NSW <- cross_GBIF(dataset = NSW)
}
if(cross_WFO){
NSW <- cross_WFO(dataset = NSW)
}
return(NSW)
} #end curate_nsw function
######NORTHERN TERRITORY#######
curate_nt <- function(NT, cross_GBIF = FALSE, cross_WFO = FALSE) {
#create new columns for territory (i.e. state for simplicity) canonical name and scientific name using the functions created
NT$State_canonical_name <- apply(NT[, c("Genus", "Species", "InfraSpecies.Rank", "Infra.Species.Name")], 1, canonical_name_from_scratch)
NT$State_name <- apply(NT[, c("Genus", "Species", "Species.Author.Name", "InfraSpecies.Rank", "Infra.Species.Name", "Infra.Species.Author.Name")], 1, state_name_from_scratch)
#remove records for which state name is NA
NT <- subset(NT, !(is.na(State_name)))
#remove brackets from Introduced.Status column
NT$Introduced.Status <- gsub('(', "", NT$Introduced.Status, fixed=TRUE)
NT$Introduced.Status <- gsub(')', "", NT$Introduced.Status, fixed=TRUE)
#substitute codes for introduction with the ones proposed in AusAFW
rep_strNT = c('Native to NT'='native','Formerly introduced to NT extinct'='formerly introduced','Introduced to NT'='introduced', 'Status uncertain in NT' = 'uncertain origin')
#create a column for alien status
NT$State_Alien_Status <- str_replace_all(NT$Introduced.Status, rep_strNT)
#select the three variables of interest (i.e. scientific name, canonical name and introduction status)
NT <- subset(NT, select = c('State_name','State_canonical_name', 'State_Alien_Status'))
#remove duplicates based on state name
NT <- NT[!duplicated(NT$State_name),]
#order by species name
NT <- NT[order(NT$State_name), ]
#NT plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
NT <- cross_GBIF(dataset = NT)
}
if(cross_WFO){
NT <- cross_WFO(dataset = NT)
}
return(NT)
} #end curate_nt function
######QUEENSLAND#########
curate_qld <- function(QLD, cross_GBIF = FALSE, cross_WFO = FALSE) {
#Subset to select only kingfom plantae, and from it angiosperms, gymnosperms, and pteridophytes
QLD <- QLD[QLD$Kingdom == "Plantae",]
QLD <- QLD[QLD$Group_Name == "Angiosperms"|QLD$Group_Name == "Gymnosperms"|QLD$Group_Name == "Pteridophytes",]
#rename columns
QLD <- rename(QLD, State_name = Botanical_Name)
QLD <- rename(QLD, State_canonical_name = Taxon_Name)
#remove records for which state name is NA
QLD <- subset(QLD, !(is.na(State_name)))
#substitute codes for introduction with the ones proposed in AusAFW
rep_strQLD = c('Native to QLD'='native', 'Native and naturalised in QLD'='native colonising','Formerly naturalised in QLD'='formerly naturalised', 'Doubtfully naturalised in QLD'='doubtfully naturalised', 'Naturalised in QLD'='naturalised')
#create a column for alien status
QLD$State_Alien_Status <- str_replace_all(QLD$Naturalisation_Status, rep_strQLD)
#select the three variables of interest (i.e. scientific name, canonical name and introduction status)
QLD <- subset(QLD, select = c('State_name','State_canonical_name', 'State_Alien_Status'))
#remove duplicates based on state name
QLD <- QLD[!duplicated(QLD$State_name),]
#order by species name
QLD <- QLD[order(QLD$State_canonical_name), ]
#QLD plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
QLD <- cross_GBIF(dataset = QLD)
}
if(cross_WFO){
QLD <- cross_WFO(dataset = QLD)
}
return(QLD)
} #end curate_qld function
######SOUTH AUSTRALIA#########
curate_sa <- function(SA, cross_GBIF = FALSE, cross_WFO = FALSE) {
#rename columns of interest
SA <- rename(SA, State_canonical_name = SCIENTIFIC.NAME)
SA <- rename(SA, State_name = SPECIES.with.AUTHOR)
SA <- subset(SA, SP!="sp.")
SA <- subset(SA, SP!="spp.")
#add space after bracket for species name
SA$State_name <- str_replace(SA$State_name, "[)]", ") ")
#replace infraspecific rank in canonical name and in state name
SA$State_canonical_name <- str_replace(SA$State_canonical_name, "ssp.", "subsp.")
SA$State_name <- str_replace(SA$State_name, "ssp.", "subsp.")
#Change X by x for hybrids to make all datasets homogenised
SA$State_canonical_name <- gsub(' X ', " x ", SA$State_canonical_name, fixed=TRUE)
SA$State_name <- gsub(' X ', " x ", SA$State_name, fixed=TRUE)
#substitute codes for introduction with the ones proposed in AusAFW
#assign introduced when *
SA$INTRODUCED <- str_replace(SA$INTRODUCED, "[*]", "introduced")
#assign native when blank
SA$INTRODUCED[is.na(SA$INTRODUCED)] <- "native"
#detect pattern 'xtinct' because sometimes is in capital letters and sometimes it's not
SA$extinct_true <- str_detect(SA$NPW.ACT.STATUS.COMMENT, "xtinct", negate = FALSE)
#assign NA if pattern not detected and presumed extinct if pattern detected
SA$extinct_true[SA$extinct_true=="FALSE"] <- NA
SA$extinct_true[SA$extinct_true=="TRUE"] <- "presumed extinct"
#create column with alien status. It is not necessary to modify presumed extinct as there are no species of introduced origin flagged as presumed extinct
SA$State_Alien_Status <- ifelse(is.na(SA$extinct_true), SA$INTRODUCED, SA$extinct_true)
#select the three variables of interest (i.e. scientific name, canonical name and introduction status)
SA <- subset(SA, select = c('State_name','State_canonical_name', 'State_Alien_Status'))
#remove duplicates based on state name
SA <- SA[!duplicated(SA$State_name),]
#order by species name
SA <- SA[order(SA$State_name), ]
#SA plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
SA <- cross_GBIF(dataset = SA)
}
if(cross_WFO){
SA <- cross_WFO(dataset = SA)
}
return(SA)
} #end curate_sa function
#####TASMANIA########
curate_tas <- function(TAS, cross_GBIF = FALSE, cross_WFO = FALSE) {
#remove equal symbol and space from ACCEPTED column and create a new column called Accepted_name
TAS$Accepted_name <- gsub('= ','',TAS$ACCEPTED)
#Create column for state name at species level
TAS$State_name <- apply(TAS[, c("Accepted_name", "FULLNAME")], 1, state_name)
#As for NSW, canonical name for Tasmania is complicated, so we create it in steps. We do trials of names for different taxonomic levels
#extract the genus from the state name after merging with accepted names
TAS$genus_new <- vapply(strsplit(TAS$State_name, " "), \(x) paste0(x[1]), character(1L))
#Species level:
#paste and collapse the genus and species from state name
TAS$trialspp <- vapply(strsplit(TAS$State_name, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#phrase species
#identify pattern and extract what is after
TAS$trialphrasesp <- str_extract(TAS$State_name, "[ ]sp[.]..*")
#remove symbols
TAS$trialphrasesp <- gsub("\\s*\\([^\\)]+\\)","",as.character(TAS$trialphrasesp))
#paste and collapse the second and third, to remove both, the space at the beginning and the author at the end
TAS$trialphrasesp <- vapply(strsplit(TAS$trialphrasesp, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
TAS$trialphrasesp <- str_replace_all(TAS$trialphrasesp, "NA NA", "")
#replace blank by NA
TAS$trialphrasesp[TAS$trialphrasesp==""] <- NA
#Hybrids:
#identify pattern and extract what is after
TAS$trialhybrid <- str_extract(TAS$State_name, "[ ]x[ ]..*")
#paste and collapse the second and third, to remove both, the space at the beginning and the author at the end
TAS$trialhybrid2 <- vapply(strsplit(TAS$trialhybrid, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
TAS$trialhybrid2 <- str_replace_all(TAS$trialhybrid2, "NA NA", "")
#replace blank by NA
TAS$trialhybrid2[TAS$trialhybrid2==""] <- NA
#Infraspecies level:
#Subsp.:
#First, extract the records that have "subsp" in their names
TAS$trialsubsp <- str_extract(TAS$State_name, "subsp\\..*")
#paste and collapse the first two words (i.e. subsp. + subsp. epithet)
TAS$subsp <- vapply(strsplit(TAS$trialsubsp, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#replace NA NA by blank
TAS$subsp <- str_replace_all(TAS$subsp, "NA NA", "")
#replace blank by NA
TAS$subsp[TAS$subsp==""] <- NA
#Var.:
#First, extract the records that have "var" in their names
TAS$trialvar <- str_extract(TAS$State_name, "var\\..*")
#paste and collapse the first two words (i.e. var. + var. epithet)
TAS$var <- vapply(strsplit(TAS$trialvar, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#replace NA NA by blank
TAS$var <- str_replace_all(TAS$var, "NA NA", "")
#replace blank by NA
TAS$var[TAS$var==""] <- NA
#Forma:
#First, extract the records that have space followed by f in their names
TAS$trialforma <- str_extract(TAS$State_name, "[ ]f\\..*")
#paste and collapse the second and third word, to remove both, the space at the beginning and the author at the end
TAS$forma <- vapply(strsplit(TAS$trialforma, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
TAS$forma <- str_replace_all(TAS$forma, "NA NA", "")
#replace blank by NA
TAS$forma[TAS$forma==""] <- NA
#column for infraspecies ranks and epithets
# your new merged column. Start with subsp.
TAS$infrasp = TAS$subsp
# merge with var
TAS$infrasp[!is.na(TAS$var)] = TAS$var[!is.na(TAS$var)]
# merge with forma
TAS$infrasp[!is.na(TAS$forma)] = TAS$forma[!is.na(TAS$forma)] # merge with y
#create a column as a step 1 for canonical name applying the previous function
TAS$trial_canonical_name <- apply(TAS[, c("trialspp", "infrasp")], 1, canonical_name_TAS1)
#create a column as a step 2 for canonical name applying the previous function
TAS$trial_canonical_name2 <- apply(TAS[, c("trial_canonical_name", "trialhybrid2")], 1, canonical_name_TAS2)
#column for State_canonical_name
# your new merged column. Start with trial_canonical_name2
TAS$State_canonical_name = TAS$trial_canonical_name2 # your new merged column. Start with x
#merge with phrase species
TAS$State_canonical_name[!is.na(TAS$phrase)] = TAS$phrase[!is.na(TAS$phrase)] # merge with y
#replace duplicated xx with just one (consequence of the process of split the names)
TAS$State_canonical_name <- str_replace_all(TAS$State_canonical_name, "xx", "x")
#replace two records that are nothosubsp. and escaped the curation process
TAS$State_canonical_name <- apply(TAS[, c("State_name", "State_canonical_name")], 1, escape_curation_tasmania)
#in column extinct, remove ? symbols
TAS$EXTINCT <- gsub('?', "", TAS$EXTINCT, fixed=TRUE)
#in column extinct, replace x by presumed extinct
TAS$EXTINCT <- gsub('x','presumed extinct',TAS$EXTINCT)
#substitute codes for introduction with the ones proposed in AusAFW
rep_strTAS = c('[?]'='doubtfully ','i'='introduced')
#create a column for alien status
TAS$Introduction <- str_replace_all(TAS$INTRO, rep_strTAS)
#replace blanks by native
TAS$Introduction[is.na(TAS$Introduction)] = "native"
#create column with extinct_status
TAS$extinct_status <- apply(TAS[, c("EXTINCT", "Introduction")], 1, TAS_extinct_status)
#create column for alien status by combining introduction status, origin, and occurrence. If introduction status is NA, keep record from recently created column extinct_status
TAS$State_Alien_Status <- ifelse(is.na(TAS$extinct_status), TAS$Introduction, TAS$extinct_status)
#select columns of interest
TAS <- subset(TAS, select = c("State_name", "State_canonical_name", "State_Alien_Status"))
#order by state name
TAS <- TAS[order(TAS$State_name),]
#remove records for which state name is NA
TAS <- subset(TAS, !(is.na(State_name)))
#remove duplicates based on state name
TAS <- TAS[!duplicated(TAS$State_name),]
#TAS plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
TAS <- cross_GBIF(dataset = TAS)
}
if(cross_WFO){
TAS <- cross_WFO(dataset = TAS)
}
return(TAS)
} #end curate_tas function
####VICTORIA######
curate_vic <- function(VIC, cross_GBIF = FALSE, cross_WFO = FALSE) {
#subset census to remove supra-rank orders
VIC <- subset(VIC, taxon_rank!="family")
VIC <- subset(VIC, taxon_rank!="phylum")
VIC <- subset(VIC, taxon_rank!="class")
VIC <- subset(VIC, taxon_rank!="order")
VIC <- subset(VIC, taxon_rank!="superorder")
VIC <- subset(VIC, taxon_rank!="genus")
VIC <- subset(VIC, taxon_rank!="section")
#create a column for canonical name and fix names by merging species names with accepted names that have changed
VIC$State_canonical_name <- ifelse(is.na(VIC$accepted_name_usage), VIC$scientific_name, VIC$accepted_name_usage)
#create a column for authorship and fix authors by merging those from accepted names that have changed
VIC$authorship <- ifelse(is.na(VIC$accepted_name_usage_authorship), VIC$scientific_name_authorship, VIC$accepted_name_usage_authorship)
#create a column for state name (scientific name), by combining canonical name and author
VIC$State_name <- paste(VIC$State_canonical_name, VIC$authorship, sep =" ")
#remove duplicate records of canonical name
VIC <- VIC[!duplicated(VIC$State_canonical_name),]
#Change × by x to make all datasets homogenised
VIC$State_name <- gsub('×', "x ", VIC$State_name, fixed=TRUE)
VIC$State_canonical_name <- gsub('×', "x ", VIC$State_canonical_name, fixed=TRUE)
VIC$State_name <- gsub(' ', " ", VIC$State_name, fixed=TRUE)
VIC$State_canonical_name <- gsub(' ', " ", VIC$State_canonical_name, fixed=TRUE)
#remove records appearing as excluded for occurrence status
VIC <- subset(VIC, occurrence_status!="excluded")
#substitute codes for occurrence status with the ones proposed in AusAFW
rep_strVIC1 = c('present'='','extinct'='extinct')
VIC$occurrence_status <- str_replace_all(VIC$occurrence_status, rep_strVIC1)
#substitute codes for DC concepts - establishment means with the ones proposed in AusAFW
rep_strVIC2 = c('native'='native','introduced'='introduced', 'uncertain'='uncertain origin')
VIC$origin <- str_replace_all(VIC$establishment_means, rep_strVIC2)
#substitute codes for DC concepts - degree of establishment with the ones proposed in AusAFW
rep_strVIC3 = c('casual'='','established'='naturalised','reproducing'='naturalised')
VIC$introduction_status <- str_replace_all(VIC$degree_of_establishment, rep_strVIC3)
VIC$introduction_status[VIC$introduction_status==""]<- NA
#substitute codes for has_introduced_occurrences to identify native colonising
rep_strVIC4 = c('1'='native colonising')
VIC$has_introduced_occurrences <- str_replace_all(VIC$has_introduced_occurrences, rep_strVIC4)
#create column with extinct_status
VIC$extinct_status <- apply(VIC[, c("occurrence_status", "origin")], 1, VIC_extinct_status)
#create column for alien status by combining introduction status, origin, and occurrence. If introduction status is NA, keep record from recently created column extinct_status
VIC$State_Alien_Status <- ifelse(is.na(VIC$introduction_status), VIC$extinct_status, VIC$introduction_status)
VIC$State_Alien_Status <- ifelse(!is.na(VIC$has_introduced_occurrences), VIC$has_introduced_occurrences, VIC$State_Alien_Status)
#keeps the one we are interested in. We also keep here the DC concepts for establishment means and degree of establishment< as we will need them later.
VIC <- subset(VIC, select = c("State_name", "State_canonical_name", "State_Alien_Status", "establishment_means", "degree_of_establishment"))
#remove duplicates based on state name
VIC <- VIC[!duplicated(VIC$State_name),]
#VIC plant census is now standardised and ready to be cross-referenced with AusAFW
VIC$State_name <- sub(VIC$State_name, pattern = "[ NA]*$", replacement = "")
if(cross_GBIF){
VIC <- cross_GBIF(dataset = VIC)
}
if(cross_WFO){
VIC <- cross_WFO(dataset = VIC)
}
return(VIC)
} #end curate_vic function
###WESTERN AUSTRALIA#####
curate_wa <- function(WA, cross_GBIF = FALSE, cross_WFO = FALSE) {
#create a column for state name (scientific name), by combining canonical name and author
WA$State_name <- paste(WA$SPECIES_NAME, WA$AUTHOR, sep =" ")
#rename canonical name and introduction status
WA <- rename(WA, State_canonical_name = SPECIES_NAME)
WA <- rename(WA, State_Alien_Status = NATURALISED_STATUS)
#select the three variables of interest (i.e. scientific name, canonical name and introduction status)
WA <- subset(WA, select = c("State_name", "State_canonical_name", "State_Alien_Status"))
#substitute codes for introduction with the ones proposed in AusAFW
rep_strWA = c('N'='native','M'='native colonising','A'='naturalised')
#create a column for alien status
WA$State_Alien_Status <- str_replace_all(WA$State_Alien_Status, rep_strWA)
#order by species name
WA <- WA[order(WA$State_name), ]
#remove records for which species name is NA
WA <- subset(WA, !(is.na(State_name)))
#remove duplicates based on state name
WA <- WA[!duplicated(WA$State_name),]
#WA plant census is now standardised and ready to be cross-referenced with AusAFW
if(cross_GBIF){
WA <- cross_GBIF(dataset = WA)
}
if(cross_WFO){
WA <- cross_WFO(dataset = WA)
}
return(WA)
} #end curate_wa function
#######CURATE THE GRIIS DATASET######
curate_griis <- function(GRIIS, invasive_griis,distribution_griis) {
#combine datasets
#create matching code based on id between GRIIS and invasive_griis
m1_griis <- match(GRIIS$id, invasive_griis$id)
#add invasive status to the GRIIS dataset
GRIIS$Invasive_status <- invasive_griis$isInvasive[m1_griis]
#add habitat to the GRIIS dataset
GRIIS$Invasive_habitat <- invasive_griis$habitat[m1_griis]
#create matching code based on id between GRIIS and distribution_griis
m2_griis <- match(GRIIS$id, distribution_griis$id)
#add country code to the GRIIS dataset
GRIIS$countryCode <- distribution_griis$countryCode[m2_griis]
#add occurrence status to the GRIIS dataset
GRIIS$occurrenceStatus <- distribution_griis$occurrenceStatus[m2_griis]
#add establishment means to the GRIIS dataset
GRIIS$establishmentMeans <- distribution_griis$establishmentMeans[m2_griis]
#add event date
GRIIS$eventDate <- distribution_griis$eventDate[m2_griis]
#now we have it all combined into one dataset
#subset dataset to only contain plants
GRIIS_plantae <- subset(GRIIS, kingdom="Plantae")
#combined with current accepted names
GRIIS_plantae$GRIIS_name <- ifelse(!is.na(GRIIS_plantae$acceptedNameUsage), GRIIS_plantae$scientificName, GRIIS_plantae$acceptedNameUsage)
#remove duplicates based on species names
GRIIS_plantae <- GRIIS_plantae[!duplicated(GRIIS_plantae$GRIIS_name),]
#convert codes for introduction status according to the ones proposed in AusAFW. Notice that on the Australian GRIIS cryptogenic had been used as a synonym for uncertain, that's why we converted it to blank
rep_strGRIIS= c('Alien'='introduced', 'Cryptogenic'='','Uncertain'='uncertain origin')
#create column origin and replace establishment means with codes proposed
GRIIS_plantae$Origin <- str_replace_all(GRIIS_plantae$establishmentMeans, rep_strGRIIS)
#delete symbol
GRIIS_plantae$Origin <- gsub('|', "", GRIIS_plantae$Origin, fixed=TRUE)
#for invasive status, change 'Null' by blank
GRIIS_plantae$Invasive_status <- gsub('Null', "", GRIIS_plantae$Invasive_status, fixed=TRUE)
#for invasive status, substitute blank by NA
GRIIS_plantae$Invasive_status[GRIIS_plantae$Invasive_status==""] <- NA
#create column for GRIIS alien status by combining origin and invasive status
GRIIS_plantae$GRIIS_Alien_Status = GRIIS_plantae$Origin # your new merged column. Start with x
GRIIS_plantae$GRIIS_Alien_Status[!is.na(GRIIS_plantae$Invasive_status)] = GRIIS_plantae$Invasive_status[!is.na(GRIIS_plantae$Invasive_status)] # merge with y
#change the term invasive to harmful invasive, as on the GRIIS, invasiveness was based on negative impact
GRIIS_plantae$GRIIS_Alien_Status[GRIIS_plantae$GRIIS_Alien_Status=="Invasive"] <- "harmful invasive"
#Curate the GRIIS database to create canonical name
#Species level:
#paste and collapse the genus and species from species name
GRIIS_plantae$trialspp <- vapply(strsplit(GRIIS_plantae$GRIIS_name, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#substitute NA NA by blank
GRIIS_plantae$trialspp <- str_replace_all(GRIIS_plantae$trialspp, "NA NA", "")
#substitute blank by NA
GRIIS_plantae$trialspp[GRIIS_plantae$trialspp==""] <- NA
#Hybrids:
#identify pattern and extract what is after
GRIIS_plantae$trialhybrid <- str_extract(GRIIS_plantae$GRIIS_name, "[ ]x[ ]..*")
#paste and collapse the second and third, to remove both, the space at the beginning and the author at the end
GRIIS_plantae$trialhybrid2 <- vapply(strsplit(GRIIS_plantae$trialhybrid, " "), \(x) paste0(x[2:3], collapse = " "), character(1L))
#replace NA NA by blank
GRIIS_plantae$trialhybrid2 <- str_replace_all(GRIIS_plantae$trialhybrid2, "NA NA", "")
#replace blank by NA
GRIIS_plantae$trialhybrid2[GRIIS_plantae$trialhybrid2==""] <- NA
#no need to do phrase species because there are not any on the GRIIS
#Infraspecies level:
#Subsp.:
#First, extract the records that have "subsp" in their names
GRIIS_plantae$trialsubsp <- str_extract(GRIIS_plantae$GRIIS_name, "subsp\\..*")
#paste and collapse the first and second words, to remove the author at the end
GRIIS_plantae$subsp <- vapply(strsplit(GRIIS_plantae$trialsubsp, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#substitute NA NA by blank
GRIIS_plantae$subsp <- str_replace_all(GRIIS_plantae$subsp, "NA NA", "")
#substitute blank by NA
GRIIS_plantae$subsp[GRIIS_plantae$subsp==""] <- NA
#Var.:
#extract the records that have "var" in their names
GRIIS_plantae$trialvar <- str_extract(GRIIS_plantae$GRIIS_name, "var\\..*")
#paste and collapse the first and second words, to remove the author at the end
GRIIS_plantae$var <- vapply(strsplit(GRIIS_plantae$trialvar, " "), \(x) paste0(x[1:2], collapse = " "), character(1L))
#substitute NA NA by blank
GRIIS_plantae$var <- str_replace_all(GRIIS_plantae$var, "NA NA", "")
#substitute blank by NA
GRIIS_plantae$var[GRIIS_plantae$var==""] <- NA
#no need to do forma because there are not any on the GRIIS
#column for infraspecies ranks and epithets
# your new merged column. Start with subsp.
GRIIS_plantae$infrasp = GRIIS_plantae$subsp # your new merged column. Start with x
# merge with var
GRIIS_plantae$infrasp[!is.na(GRIIS_plantae$var)] = GRIIS_plantae$var[!is.na(GRIIS_plantae$var)] # merge with y
#create a column as a step 1 for canonical name applying the previous function
GRIIS_plantae$trial_canonical_name1 <- apply(GRIIS_plantae[, c("trialspp", "infrasp")], 1, canonical_name_GRIIS1)
#create a column as a step 2 for canonical name applying the previous function
GRIIS_plantae$trial_canonical_name2 <- apply(GRIIS_plantae[, c("trial_canonical_name1", "trialhybrid2")], 1, canonical_name_GRIIS2)
#add canonical name to the GRIIS dataset
#replace duplicated xx with just one (consequence of the process of split the names)
GRIIS_plantae$GRIIS_canonical_name <- str_replace_all(GRIIS_plantae$trial_canonical_name2, "xx", "x")
return(GRIIS_plantae)
} #end curate_griis function
#####MATCH STATUSES BETWEEN STATE CENSUSES AND THE APC#########
############################################
match_act <- function(APC_raw, APC_processed, ACT_processed) {
#we convert the original APC (the one that includes not only accepted names by also synonyms, excluded, misapplied, etc) to lower cases. We do did so that phrase species can also get a match because in some sources they use capitals but not in others
APC_raw$APC_all_canonical_lower <- tolower(APC_raw$canonicalName)
APC_raw$APC_all_scientific_lower <- tolower(APC_raw$scientificName)
APC_raw$APC_accepted_lower <- tolower(APC_raw$acceptedNameUsage)
#Do the same process with the subset of accepted taxa on the APC to prioritise matching with these ones
APC_accepted <- subset(APC_raw, taxonomicStatus == "accepted")
APC_accepted$APC_all_canonical_lower <- tolower(APC_accepted$canonicalName)
APC_accepted$APC_all_scientific_lower <- tolower(APC_accepted$scientificName)
APC_accepted$APC_accepted_lower <- tolower(APC_accepted$acceptedNameUsage)
#convert species name on state censuses to lower case as well
ACT_processed$ACT_sp_name_lower <- tolower(ACT_processed$State_canonical_name)
#1. match with original APC
#match with original APC based on canonical name (because phrase names sometimes match this one instead, depending on herbaria formats)
m_ACT <- match(ACT_processed$ACT_sp_name_lower,APC_raw$APC_all_canonical_lower)
ACT_processed$APC_name1 <- APC_raw$acceptedNameUsage[m_ACT] #extract matched name
ACT_processed$taxa_status1 <- APC_raw$taxonomicStatus[m_ACT] #extract matched taxa status
#match with original APC based on scientific name
m_ACT2 <- match(ACT_processed$ACT_sp_name_lower,APC_raw$APC_all_scientific_lower)
ACT_processed$APC_name2 <- APC_raw$acceptedNameUsage[m_ACT2] #extract matched name
ACT_processed$taxa_status2 <- APC_raw$taxonomicStatus[m_ACT2] #extract matched taxa status
#match with original APC based on accepted name (because if it already matches an accepted name, this has to be prioritise)
m_ACT3 <- match(ACT_processed$ACT_sp_name_lower,APC_raw$APC_accepted_lower)
ACT_processed$APC_name3 <- APC_raw$acceptedNameUsage[m_ACT3] #extract matched name
ACT_processed$taxa_status3 <- APC_raw$taxonomicStatus[m_ACT3] #extract matched taxa status
#2. Match with accepted subset of APC
#match with subset of accepted taxa on APC based on canonical name
m_ACT4 <- match(ACT_processed$ACT_sp_name_lower,APC_accepted$APC_all_canonical_lower)
ACT_processed$APC_name4 <- APC_accepted$acceptedNameUsage[m_ACT4] #extract name
ACT_processed$taxa_status4 <- APC_accepted$taxonomicStatus[m_ACT4] #extract taxa status
#match with subset of accepted taxa on APC based on scientific name
m_ACT5 <- match(ACT_processed$ACT_sp_name_lower,APC_accepted$APC_all_scientific_lower)
ACT_processed$APC_name5 <- APC_accepted$acceptedNameUsage[m_ACT5] #extract name
ACT_processed$taxa_status5 <- APC_accepted$taxonomicStatus[m_ACT5] #extract taxa status
#match with subset of accepted taxa on APC based on accepted name
m_ACT6 <- match(ACT_processed$ACT_sp_name_lower,APC_accepted$APC_accepted_lower)
ACT_processed$APC_name6 <- APC_accepted$acceptedNameUsage[m_ACT6] #extract name
ACT_processed$taxa_status6 <- APC_accepted$taxonomicStatus[m_ACT6] #extract taxa status
#3. Combine both
#create new column of APC_name based on the original dataset by merging with all
ACT_processed$APC_name7 = ACT_processed$APC_name1 # merge first with canonical name
ACT_processed$APC_name7[!is.na(ACT_processed$APC_name2)] = ACT_processed$APC_name2[!is.na(ACT_processed$APC_name2)] # now merge first with scientific name to prioritise these matches over the first ones
ACT_processed$APC_name7[!is.na(ACT_processed$APC_name3)] = ACT_processed$APC_name3[!is.na(ACT_processed$APC_name3)] # finally merge with accepted name to prioritise these matches over all the former ones
#create new column of APC_name based on the accepted dataset by merging with all
ACT_processed$APC_name8 = ACT_processed$APC_name4 # your new merged column. Start with x
ACT_processed$APC_name8[!is.na(ACT_processed$APC_name5)] = ACT_processed$APC_name5[!is.na(ACT_processed$APC_name5)] # merge with y
ACT_processed$APC_name8[!is.na(ACT_processed$APC_name6)] = ACT_processed$APC_name6[!is.na(ACT_processed$APC_name6)] # merge with y
#Combine both columns resulting of matching process with original and accepted APC databases into a final APC_name column
ACT_processed$APC_name <- ACT_processed$APC_name7 #merge with x
ACT_processed$APC_name[!is.na(ACT_processed$APC_name8)] = ACT_processed$APC_name8[!is.na(ACT_processed$APC_name8)] # merge with y
#create a new column of taxa_status based on the original dataset by merging with all
ACT_processed$taxa_status7 = ACT_processed$taxa_status1 # merge first based on canonical name
ACT_processed$taxa_status7[!is.na(ACT_processed$taxa_status2)] = ACT_processed$taxa_status2[!is.na(ACT_processed$taxa_status2)] # now merge first based on scientific name to prioritise these matches over the first ones
ACT_processed$taxa_status7[!is.na(ACT_processed$taxa_status3)] = ACT_processed$taxa_status3[!is.na(ACT_processed$taxa_status3)] # finally merge based on accepted name to prioritise these matches over the two former ones
#create a new column of taxa_status based on the accepted dataset by merging with all
ACT_processed$taxa_status8 = ACT_processed$taxa_status4 # merge first based on canonical name
ACT_processed$taxa_status8[!is.na(ACT_processed$taxa_status5)] = ACT_processed$taxa_status5[!is.na(ACT_processed$taxa_status5)] # now merge first based on scientific name to prioritise these matches over the first ones
ACT_processed$taxa_status8[!is.na(ACT_processed$taxa_status6)] = ACT_processed$taxa_status6[!is.na(ACT_processed$taxa_status6)] # finally merge based on accepted name to prioritise these matches over the two former ones
#Combine both columns resulting of matching process with original and accepted APC databases into a final taxa_status column
ACT_processed$taxa_status <- ACT_processed$taxa_status7 #merge with x
ACT_processed$taxa_status[!is.na(ACT_processed$taxa_status8)] = ACT_processed$taxa_status8[!is.na(ACT_processed$taxa_status8)] # merge with y
#4. Populate other fields from APC
#match code between the final APC name we have created and the APC name of our master dataset (only including accepted taxa)
m_ACTaccepted <- match(ACT_processed$APC_name,APC_processed$APC_name)
#populate canonical name from APC
ACT_processed$APC_canonical_name <- APC_processed$APC_canonical_name[m_ACTaccepted]
#populate alien status from distribution in ACT column
ACT_processed$APC_Alien_Status <- APC_processed$ACT[m_ACTaccepted]
#populate authorship from ACT
ACT_processed$APC_Authorship <- APC_processed$Author[m_ACTaccepted]
#select columns of interest
ACT_processed <- subset(ACT_processed, select = c("State_canonical_name", "State_Alien_Status", "APC_name", "APC_canonical_name", "APC_Alien_Status", "APC_Authorship", "taxa_status"))
#status comparison between state censuses and APC
#ACT#
#add status comparison
ACT_processed$Status_comparison <- apply(ACT_processed[, c("State_Alien_Status", "APC_Alien_Status", "APC_name", "taxa_status")], 1, compare_status)
#add most conservative status
ACT_processed$Unified_status <- apply(ACT_processed[, c("State_Alien_Status", "APC_Alien_Status", "Status_comparison")], 1, state_conservative_status)
#fix pro parte species that are impossible to assign to one species
pro_parte <- c("Apium australe", "Astrotricha sp. 1", "Carissa spinarum", "Cleome microaustralica", "Cleome viscosa", "Daviesia benthamii subsp. humilis",
"Epacris heteronema var. planifolia", "Epilobium glabellum", "Euphrasia brownii", "Gentiana montana", "Geranium dissectum var. australe",
"Glycine clandestina var. sericea", "Helichrysum rutidolepis", "Hibbertia astrotricha", "Hibbertia billardierei", "Hibbertia densiflora",
"Hibbertia stricta var. stricta", "Hibbertia stricta var. hirtiflora", "Hibiscus trionum", "Isoetes lacustris", "Lepidosperma laterale var. angustum",
"Lepilaena preissii", "Monotoca scoparia var. submutica", "Myriophyllum elatinoides", "Myriophyllum propinquum", "Olearia sp. 2",
"Oreomyrrhis andicola", "Pleurandra ovata", "Pleurandra sericea", "Poa australis", "Potamogeton natans", "Roepera latifolia", "Sauropus trachyspermus",
"Senecio lautus subsp. maritimus", "Senecio pinnatifolius var. 2", "Stipa variabilis", "Styphelia virgata", "Taraxacum officinale",
"Thelymitra porphyrosticta", "Trachymene australis", "Tritonia squalida", "Vittadinia australis", "Xanthium strumarium", "Zygophyllum billardierei",
"Zygophyllum prismatothecum")
pro_parte <- as.data.frame(pro_parte)
colnames(pro_parte)[1]<- "State_canonical_name"
pro_parte_modify_ACT <- ACT_processed$State_canonical_name %in% pro_parte$State_canonical_name
ACT_processed$APC_name[pro_parte_modify_ACT]<- NA
ACT_processed$APC_canonical_name[pro_parte_modify_ACT]<- NA
ACT_processed$APC_Authorship[pro_parte_modify_ACT]<- NA
ACT_processed$APC_Alien_Status[pro_parte_modify_ACT]<- NA
ACT_processed$Status_comparison[pro_parte_modify_ACT]<- "It corresponds to several accepted names on the APC due to have been pro parte misapplied, therefore it is impossible to assign one"
ACT_processed$Unified_status[pro_parte_modify_ACT]<- ACT_processed$State_Alien_Status[pro_parte_modify_ACT]
ACT_processed$taxa_status[pro_parte_modify_ACT]<- "pro parte misapplied"
#create df with conservative status to add establishment means and degree of establishment. We'll have to unnest it after
ACTUnified_status_df <- as.data.frame(ACT_processed[, c("Unified_status")])
#add establishment_means
ACT_processed$establishment_means <- apply(ACTUnified_status_df, 1, state_establishment_means)
#add degree_of_establishment
ACT_processed$degree_of_establishment <- apply(ACTUnified_status_df, 1, state_degree_of_establishment)
#unnest both variables
ACT_processedb <- unnest(ACT_processed, establishment_means, keep_empty = TRUE)
ACT_processedc <- unnest(ACT_processedb, degree_of_establishment, keep_empty = TRUE)
#convert to df
ACT_compared <- as.data.frame(ACT_processedc)
#save dataset for supplementary material
return(ACT_compared)
} #end of match_ACT function
############################################
match_nsw <- function(APC_raw, APC_processed, NSW_processed) {
#we convert the original APC (the one that includes not only accepted names by also synonyms, excluded, misapplied, etc) to lower cases. We do did so that phrase species can also get a match because in some sources they use capitals but not in others
APC_raw$APC_all_canonical_lower <- tolower(APC_raw$canonicalName)
APC_raw$APC_all_scientific_lower <- tolower(APC_raw$scientificName)
APC_raw$APC_accepted_lower <- tolower(APC_raw$acceptedNameUsage)
#Do the same process with the subset of accepted taxa on the APC to prioritise matching with these ones
APC_accepted <- subset(APC_raw, taxonomicStatus == "accepted")
APC_accepted$APC_all_canonical_lower <- tolower(APC_accepted$APC_all_canonical)
APC_accepted$APC_all_scientific_lower <- tolower(APC_accepted$APC_all_scientific)
APC_accepted$APC_accepted_lower <- tolower(APC_accepted$APC_accepted)
#convert species name on state censuses to lower case as well
NSW_processed$NSW_sp_name_lower <- tolower(NSW_processed$State_canonical_name)
#1. match with original APC
#match with original APC based on canonical name (because phrase names sometimes match this one instead, depending on herbaria formats)
m_NSW <- match(NSW_processed$NSW_sp_name_lower,APC_raw$APC_all_canonical_lower)
NSW_processed$APC_name1 <- APC_raw$acceptedNameUsage[m_NSW]
NSW_processed$taxa_status1 <- APC_raw$taxonomicStatus[m_NSW]
#match with original APC based on scientific name
m_NSW2 <- match(NSW_processed$NSW_sp_name_lower,APC_raw$APC_all_scientific_lower)
NSW_processed$APC_name2 <- APC_raw$acceptedNameUsage[m_NSW2]
NSW_processed$taxa_status2 <- APC_raw$taxonomicStatus[m_NSW2]
#match with original APC based on accepted name (because if it already matches an accepted name, this has to be prioritise)
m_NSW3 <- match(NSW_processed$NSW_sp_name_lower,APC_raw$APC_accepted_lower)
NSW_processed$APC_name3 <- APC_raw$acceptedNameUsage[m_NSW3]
NSW_processed$taxa_status3 <- APC_raw$taxonomicStatus[m_NSW3]
#2. Match with accepted subset of APC
#match with subset of accepted taxa on APC based on canonical name
m_NSW4 <- match(NSW_processed$NSW_sp_name_lower,APC_accepted$APC_all_canonical_lower)
NSW_processed$APC_name4 <- APC_accepted$acceptedNameUsage[m_NSW4]
NSW_processed$taxa_status4 <- APC_accepted$taxonomicStatus[m_NSW4]
#match with subset of accepted taxa on APC based on scientific name
m_NSW5 <- match(NSW_processed$NSW_sp_name_lower,APC_accepted$APC_all_scientific_lower)
NSW_processed$APC_name5 <- APC_accepted$acceptedNameUsage[m_NSW5]
NSW_processed$taxa_status5 <- APC_accepted$taxonomicStatus[m_NSW5]
#match with subset of accepted taxa on APC based on accepted name
m_NSW6 <- match(NSW_processed$NSW_sp_name_lower,APC_accepted$APC_accepted_lower)
NSW_processed$APC_name6 <- APC_accepted$acceptedNameUsage[m_NSW6]
NSW_processed$taxa_status6 <- APC_accepted$taxonomicStatus[m_NSW6]
#3. Combine both
#create new column of APC_name based on the original dataset by merging with all
NSW_processed$APC_name7 = NSW_processed$APC_name1 # your new merged column. Start with x
NSW_processed$APC_name7[!is.na(NSW_processed$APC_name2)] = NSW_processed$APC_name2[!is.na(NSW_processed$APC_name2)] # merge with y
NSW_processed$APC_name7[!is.na(NSW_processed$APC_name3)] = NSW_processed$APC_name3[!is.na(NSW_processed$APC_name3)] # merge with y
#create new column of APC_name based on the accepted dataset by merging with all
NSW_processed$APC_name8 = NSW_processed$APC_name4 # your new merged column. Start with x
NSW_processed$APC_name8[!is.na(NSW_processed$APC_name5)] = NSW_processed$APC_name5[!is.na(NSW_processed$APC_name5)] # merge with y
NSW_processed$APC_name8[!is.na(NSW_processed$APC_name6)] = NSW_processed$APC_name6[!is.na(NSW_processed$APC_name6)] # merge with y
#Combine both columns resulting of matching process with original and accepted APC databases into a final APC_name column
NSW_processed$APC_name <- NSW_processed$APC_name7
NSW_processed$APC_name[!is.na(NSW_processed$APC_name8)] = NSW_processed$APC_name8[!is.na(NSW_processed$APC_name8)] # merge with y
#create a new column of taxa_status based on the original dataset by merging with all
NSW_processed$taxa_status7 = NSW_processed$taxa_status1 # your new merged column. Start with x
NSW_processed$taxa_status7[!is.na(NSW_processed$taxa_status2)] = NSW_processed$taxa_status2[!is.na(NSW_processed$taxa_status2)] # merge with y
NSW_processed$taxa_status7[!is.na(NSW_processed$taxa_status3)] = NSW_processed$taxa_status3[!is.na(NSW_processed$taxa_status3)] # merge with y
#create a new column of taxa_status based on the accepted dataset by merging with all
NSW_processed$taxa_status8 = NSW_processed$taxa_status4 # your new merged column. Start with x
NSW_processed$taxa_status8[!is.na(NSW_processed$taxa_status5)] = NSW_processed$taxa_status5[!is.na(NSW_processed$taxa_status5)] # merge with y
NSW_processed$taxa_status8[!is.na(NSW_processed$taxa_status6)] = NSW_processed$taxa_status6[!is.na(NSW_processed$taxa_status6)] # merge with y
#Combine both columns resulting of matching process with original and accepted APC databases into a final taxa_status column
NSW_processed$taxa_status <- NSW_processed$taxa_status7
NSW_processed$taxa_status[!is.na(NSW_processed$taxa_status8)] = NSW_processed$taxa_status8[!is.na(NSW_processed$taxa_status8)] # merge with y
#4. Populate other fields from APC
#match code between the final APC name we have created and the APC name of our master dataset (only including accepted taxa)
m_NSWaccepted <- match(NSW_processed$APC_name,APC_processed$APC_name)
#populate canonical name from APC
NSW_processed$APC_canonical_name <- APC_processed$APC_canonical_name[m_NSWaccepted]
#populate alien status from distribution in ACT column
NSW_processed$APC_Alien_Status <- APC_processed$NSW[m_NSWaccepted]