closes #6. fungi csvs working

stevenpbachman · May 31, 2024 · 93bd920 · 93bd920
1 parent 45244fb
commit 93bd920
Show file tree

Hide file tree

Showing 22 changed files with 232 additions and 171 deletions.
diff --git a/R/get_name_keys.R b/R/get_name_keys.R
@@ -3,7 +3,6 @@
 #'
 #' @param df character) Data frame with taxon names
 #' @param name_col (character) Column for taxon names. Include taxonomic authority for better matching
-#' @param tax_status (character) Default `any` provides all taxonomic, otherwise 'accepted'
 #' @param match (character) Controls the output of the search. Use `single` to
 #' force a single match result that has the highest confidence or `any` to return
 #' all possible matches sorted in order of confidence
@@ -12,24 +11,23 @@
 #'
 #' @return Returns a data frame with accepted GBIF and POWO identifiers
 #' @export
-#' @details Designed for batch processing. Default setting (tax_status = "accepted" and match = "single")
-#' returns an accepted name with the best single match against GBIF and POWO name backbones.
-#' Final list may return fewer names if there are discrepancies e.g. accepted
+#' @details Designed for batch processing. Default setting (match = "single", kingdom = "plantae")
+#' returns an accepted name with the best single match against GBIF and POWO name backbones (GBIF
+#' only for kingdom = "fungi"). Final list may return fewer names if there are discrepancies e.g. accepted
 #'  in GBIF, but not in POWO. Output data frame includes GBIF 'usageKey' that can be used with
 #'   [`get_gbif_occs()`] to get occurrences from GBIF, and 'wcvp_ipni_id' that can be used with [`powo_range()`]
-#' to get native ranges. To see a wider range of plausible matches adjust 'tax_status' and 'match' to 'any'.
+#' to get native ranges. To see a wider range of plausible matches adjust 'match' to 'any'.
 
 # add option to determine which sources you want to search e.g. WCVP for plants, or IF for fungi
-get_name_keys <- function(df, name_column, tax_status = "any", match = "single", kingdom = "plantae") {
+get_name_keys <- function(df, name_column, match = "single", kingdom = "plantae") {
   # search terms
   search_names <- as.vector(unlist(df[, name_column]))
 
   # get the GBIF keys
   gbif_names_out <-
     purrr::map_dfr(search_names,
                    name_search_gbif,
-                   match = match,
-                   gbif_tax_stat = tax_status)
+                   match = match)
   colnames(gbif_names_out) <- paste0("GBIF", "_", colnames(gbif_names_out))
 
   keys_df <- gbif_names_out
@@ -39,8 +37,7 @@ get_name_keys <- function(df, name_column, tax_status = "any", match = "single",
   # get the POWO keys
     powo_names_out <-
       name_search_powo(df = df,
-                       name_column = name_column,
-                       powo_tax_stat = tax_status)
+                       name_column = name_column)
 
   # bind them together - need to fix when WCVP returns multiple matches
   keys_df <-

diff --git a/R/get_native_range.R b/R/get_native_range.R
@@ -2,9 +2,9 @@
 #' Get native ranges for taxa
 #'
 #' @param keys (data frame) Contain identifier for the taxon e.g. derived from [`get_name_keys()`]
-#' @param keys (name_col) Column name for the identifier
 #'
-#' @return (data frame) A list of places where a taxon occurs.
+#' @return (data frame) A list of 'botanical countries' (World Geogrpahic Scheme for Recording Plant
+#' Distributions) where a taxon occurs.
 #' @export
 #'
 #' @details Currently one option to get native ranges from (Plants of the
@@ -14,7 +14,6 @@
 get_native_range <- function(keys) { #,name_col
 
   # get the search ids
-  #search_ids <- as.vector(unlist(keys[, name_col]))
   search_ids <- as.vector(unlist(keys[, "wcvp_ipni_id"]))
 
   # run the powo range function through map_dfr

diff --git a/R/get_occs_range.R b/R/get_occs_range.R
@@ -2,22 +2,26 @@
 #'
 #' @param sis_points (dataframe) SIS points file
 #'
-#' @return Occurrence dataframe with ISO two-digit codes added
+#' @return Occurrence dataframe with ISO two-digit codes added and internal id
 
 get_occs_range <- function(sis_points) {
 
-  sf_use_s2(FALSE)
+  sf::sf_use_s2(FALSE)
 
   # get unique lat longs
   sis_points_sf <- sf::st_as_sf(sis_points, coords = c("dec_long","dec_lat"))
 
   # prepare the tdwg polygon data
-  st_crs(LCr::tdwg_level3) <- st_crs(tdwg_level3)
-  st_crs(sis_points_sf) <- st_crs(tdwg_level3)
+  sf::st_crs(tdwg_level3) <- sf::st_crs(tdwg_level3)
+  sf::st_crs(sis_points_sf) <- sf::st_crs(tdwg_level3)
 
   # do the intersect
   out <- sf::st_join(sis_points_sf, tdwg_level3)
 
+  # tidy up the table with just the required fields
+  out <- out %>% dplyr::select("LEVEL3_COD", "sci_name", "internal_taxon_id") %>%
+    sf::st_drop_geometry() %>% unique() %>% tidyr::drop_na() %>% dplyr::arrange(sci_name)
+
   return(out)
 
 }
diff --git a/R/make_LC_points.R b/R/make_LC_points.R
@@ -7,7 +7,7 @@
 #' @param institution (character) Name of institution or affiliation
 #' @param range_check (boolean) TRUE if you want to carry out occurrence cleaning with native range filter
 #'
-#' @return (list) Includes the GBIF citation, and "points" the cleaned SIS compatible point file
+#' @return (list) Includes the GBIF "citation", and "points" the cleaned SIS compatible point file
 #' @export
 #' @details Designed for batch processing using keys derived from [`get_name_keys()`]
 
@@ -52,7 +52,6 @@ make_LC_points <-
       res_list <- list("citation" = gbif_ref, "points" = final_points)
     }
 
-    #res_list <- list("citation" = gbif_ref, "points" = final_points)
     return(res_list)
 
   }
diff --git a/R/make_sis_csvs.R b/R/make_sis_csvs.R
@@ -1,64 +1,108 @@
 
+
 #' Generate all SIS connect csv files
 #'
-#' @param keys (data frame) Derived from [`get_name_keys()`] function. Must include at least GBIF_usageKey to obtain GBIF occurrences
+#' @param unique_id (character) Unique identifier - default is the GBIF usage key
 #' @param first_name (character) First name of assessor
 #' @param second_name (character) Second name of assessor
 #' @param email (character) Email of assessor
 #' @param institution (character) Name of institution or affiliation
 #' @param gbif_ref (data frame) A GBIF download citation according to IUCN format.
+#' @param powo_ref (data frame) A citation for use of POWO according to IUCN format.
 #' @param native_ranges (data frame) Native ranges derived from [`get_native_range()`]
+#' @param family (character) Field containing the family
+#' @param genus (character) Field containing the genus
+#' @param species (character) Field containing the specific epithet
+#' @param taxonomicAuthority (character) Field containing the taxonomic authority
 #' @param kingdom (character) Default is 'plantae', but can also be 'fungi'
 #'
 #' @return Returns an SIS compliant zip file
 #' @export
 
 make_sis_csvs <-
-  function(unique_id, wcvp_ipni_id, first_name, second_name, email,
-           institution, gbif_ref = NULL, native_ranges,
-           family, genus, species, taxonomicAuthority,
-           kingdom = "plantae", powo_ref = FALSE) {
-
+  function(unique_id,
+           wcvp_ipni_id,
+           first_name,
+           second_name,
+           email,
+           institution,
+           gbif_ref = NULL,
+           powo_ref = FALSE,
+           native_ranges = NULL,
+           family,
+           genus,
+           species,
+           taxonomicAuthority,
+           kingdom = "plantae")
+  {
     if (kingdom == "plantae") {
-
       # get most of the csvs here
-      countries <- sis_countries(unique_id, wcvp_ipni_id, native_ranges)
+      if (!is.null(native_ranges)) {
+      countries <- sis_countries(native_ranges)
+      }
       allfields <- sis_allfields(unique_id)
       assessments <- sis_assessments(unique_id)
       plantspecific <- sis_plantspecific(unique_id, kingdom)
       habitats <- sis_habitats(unique_id)
       credits <- sis_credits(unique_id, first_name, second_name, email, affiliation = institution)
       taxonomy <- sis_taxonomy(unique_id, family, genus, species, taxonomicAuthority)
 
-      # need to embed map into the function, but these are a bit awkward - try again later
-      references <- purrr::map_dfr(unique_id, sis_references, powo_ref = powo_ref, gbif_ref)
+      # need to embed map into the function, but refs a bit awkward - try again later
+      references <- purrr::map_dfr(unique_id, sis_references, powo_ref = powo_ref, gbif_ref = gbif_ref)
 
-      return(
-        list(allfields = allfields, assessments = assessments, plantspecific = plantspecific,
-          habitats = habitats, taxonomy = taxonomy, credits = credits, references = references,
-          countries = countries
+      # list of default results - these should always be generated
+      results <-
+        list(
+          allfields = allfields,
+          assessments = assessments,
+          plantspecific = plantspecific,
+          habitats = habitats,
+          credits = credits,
+          taxonomy = taxonomy,
+          references = references
         )
-      )
+
+      # countries df depends on native ranges, so only add countries if exists
+      if (exists("countries")) {
+        results$countries <- countries
+      }
+
+      return(results)
     }
 
     if (kingdom == "fungi") {
-
       # get most of the csvs here
-      #countries <- sis_countries(unique_id, wcvp_ipni_id, native_ranges)
+      if (!is.null(native_ranges)) {
+        countries <- sis_countries(native_ranges)
+      }
       allfields <- sis_allfields(unique_id)
       assessments <- sis_assessments(unique_id)
       plantspecific <- sis_plantspecific(unique_id, kingdom)
       habitats <- sis_habitats(unique_id)
       credits <- sis_credits(unique_id, first_name, second_name, email, affiliation = institution)
-      taxonomy <- sis_taxonomy(unique_id, family,genus, species, taxonomicAuthority)
+      taxonomy <- sis_taxonomy(unique_id, family, genus, species, taxonomicAuthority)
 
-      # need to embed map into the function, but these are a bit awkward - try again later
+      # need to embed map into the function, but refs a bit awkward - try again later
       references <- purrr::map_dfr(unique_id, sis_references, gbif_ref, powo_ref)
 
-      return(
-        list(allfields = allfields, assessments = assessments, plantspecific = plantspecific,
-             habitats = habitats, credits = credits, taxonomy = taxonomy, references = references
+      # list of default results - these should always be generated
+      results <-
+        list(
+          allfields = allfields,
+          assessments = assessments,
+          plantspecific = plantspecific,
+          habitats = habitats,
+          credits = credits,
+          taxonomy = taxonomy,
+          references = references
         )
-      )
-  }
+
+      # countries df depends on native ranges, so only add countries if exists
+      if (exists("countries")) {
+        results$countries <- countries
+      }
+
+      return(results)
+
+    }
   }
diff --git a/R/make_sis_occs.R b/R/make_sis_occs.R
@@ -25,6 +25,7 @@ make_sis_occs <- function(occs_clean, first_name = "", second_name = "", institu
                               basisOfRecord,
                               elevation,
                               catalogNumber,
+                              speciesKey,
                               gbifID
   )
 
@@ -42,6 +43,7 @@ make_sis_occs <- function(occs_clean, first_name = "", second_name = "", institu
   )
 
   sis_points <- dplyr::mutate(sis_points,
+                              internal_taxon_id = speciesKey,
                               source = paste0("https://www.gbif.org/occurrence/", gbifID),
                               yrcompiled = format(Sys.Date(), "%Y"),
                               citation = institution,
@@ -56,15 +58,20 @@ make_sis_occs <- function(occs_clean, first_name = "", second_name = "", institu
                                 basisofrec,
                                 "FOSSIL_SPECIMEN" = "FossilSpecimen",
                                 "HUMAN_OBSERVATION" = "HumanObservation",
-                                "LITERATURE" = "",
                                 "LIVING_SPECIMEN" = "LivingSpecimen",
                                 "MACHINE_OBSERVATION" = "MachineObservation",
-                                "OBSERVATION" = "",
                                 "PRESERVED_SPECIMEN" = "PreservedSpecimen",
-                                "UNKNOWN" = "Unknown"
+                                "UNKNOWN" = "Unknown",
+                                "OCCURRENCE" = "HumanObservation",
+                                "LITERATURE" = "",
+                                "OBSERVATION" = "HumanObservation",
+                                "MATERIAL_ENTITY" = "",
+                                "MATERIAL_SAMPLE" = "PreservedSpecimen",
+                                "MATERIAL_CITATION" = "PreservedSpecimen"
                               )
   )
-  sis_points <- dplyr::select(sis_points, -gbifID)
+
+  sis_points <- dplyr::select(sis_points, -speciesKey,-gbifID)
 
   return(sis_points)
 }
diff --git a/R/name_search_gbif.R b/R/name_search_gbif.R
@@ -10,8 +10,6 @@
 #' @param match (character) Controls the output of the search. Use `single` to
 #' force a single match result that has the highest confidence or `any` to return
 #' all possible matches sorted in order of confidence
-#' @param gbif_tax_stat (character) Default `any` provides all taxonomic
-#' matches, otherwise `accepted` returns only accepted names according to GBIF
 #'
 #' @return Returns a data frame with initial search term and matching name(s)
 #' @export
@@ -21,8 +19,8 @@
 
 name_search_gbif = function(name,
                             species_rank = TRUE,
-                            match = "single",
-                            gbif_tax_stat = "any") {
+                            match = "single") {
+                            #gbif_tax_stat = "any") { #removed as was not working well
   # set up default results table
   default_tbl = gbif_name_tbl_(name)
 
@@ -65,14 +63,14 @@ name_search_gbif = function(name,
       results = dplyr::arrange(results, dplyr::desc(confidence))
     }
 
-    # option to filter on GBIF accepted species only
-    if (gbif_tax_stat == "any") {
-      results = results
-    } else {
-      if (gbif_tax_stat == "accepted") {
-        results = dplyr::filter(results, status == "ACCEPTED")
-      }
-    }
+    # # option to filter on GBIF accepted species only
+    # if (gbif_tax_stat == "any") {
+    #   results = results
+    # } else {
+    #   if (gbif_tax_stat == "accepted") {
+    #     results = dplyr::filter(results, status == "ACCEPTED")
+    #   }
+    # }
 
     # option to filter on maximum confidence from GBIF search - one option only "single"
     # or allow list of options "any"

diff --git a/R/name_search_powo.R b/R/name_search_powo.R
@@ -5,23 +5,21 @@
 #'
 #' @param df (data frame) Taxon name(s)
 #' @param name_column (string) Column that contains the name(s)
-#' @param powo_tax_stat (character) Default `any` provides all taxonomic
-#' matches, otherwise `accepted` returns only accepted names according to POWO
 #'
 #' @return Returns a data frame with initial search term and matching name(s)
 #' @export
 
 
-name_search_powo <- function(df, name_column, powo_tax_stat = "any"){
+name_search_powo <- function(df, name_column){
   #name_df <- data.frame(name = name)
   results <- rWCVP::wcvp_match_names(names_df = df, name_col = name_column)
 
-  # allow user to filter on accepted name only
-  if (powo_tax_stat == "any"){
-    results = results
-  } else {
-    if (powo_tax_stat == "accepted") {
-      results = dplyr::filter(results, wcvp_status == "Accepted")
-    }}
+  # # allow user to filter on accepted name only
+  # if (powo_tax_stat == "any"){
+  #   results = results
+  # } else {
+  #   if (powo_tax_stat == "accepted") {
+  #     results = dplyr::filter(results, wcvp_status == "Accepted")
+  #   }}
   return(results)
 }