Merge pull request #41 from nhsbsa-data-analytics/non-db-calc-match-a…

…ddresses Non db calc match addresses
nhsbsa-data-analytics · May 15, 2024 · f3396e8 · f3396e8
2 parents bb052c8 + 094dfad
commit f3396e8
Show file tree

Hide file tree

Showing 5 changed files with 286 additions and 59 deletions.
diff --git a/R/calc_match_addresses_df.R b/R/calc_match_addresses_df.R
@@ -0,0 +1,206 @@
+#' Match two sets of addresses, using dataframes rather than lazy_tables
+#'
+#' Match a distinct dataframe of primary addresses to a distinct dataframe of
+#' lookup addresses.
+#'
+#' Returns a dataframe of exact matches (postcode and single line address) and
+#' non exact matches (postcode and fuzzy single line address). For non exact
+#' matches it will retain draws.
+#'
+#' Non-matching records will also be returned in the output
+#'
+#' @param primary_df Dataframe of distinct primary addresses
+#' @param primary_postcode_col Column containing the primary postcode
+#' @param primary_address_col Column containing the primary single line address
+#' @param lookup_df Dataframe of distinct lookup addresses
+#' @param lookup_postcode_col Column containing the lookup postcode
+#' @param lookup_address_col Column containing the lookup single line address
+#'
+#' @examples
+#' @export
+calc_match_addresses_df <- function(
+    primary_df,
+    primary_postcode_col,
+    primary_address_col,
+    lookup_df,
+    lookup_postcode_col,
+    lookup_address_col
+) {
+  # Disable group by override groups notice
+  options(dplyr.summarise.inform = FALSE)
+
+  # Check if there are zero shared postcodes
+  row_count = lookup_df %>%
+    dplyr::rename("{primary_postcode_col}" := .data[[lookup_postcode_col]]) %>%
+    dplyr::inner_join(
+      primary_df,
+      by = primary_postcode_col,
+      relationship = "many-to-many"
+    ) %>%
+    nrow(.)
+
+  # Break if zero shared postcodes
+  if(row_count == 0){
+    message("There are no shared postcodes between the datasets so no matching was possible")
+    return(NULL)
+  }
+
+  # Catch error when supplied data is too big
+  out = tryCatch(
+    {
+      # Generate primary df ID
+      primary_df = primary_df %>%
+        dplyr::mutate(ID = dplyr::row_number())
+
+      # Generate lookup df ID
+      lookup_df = lookup_df %>%
+        dplyr::mutate(ID_LOOKUP = dplyr::row_number()) %>%
+        dplyr::rename("{primary_postcode_col}" := .data[[lookup_postcode_col]])
+
+      # Find exact matches
+      exact_match_df = dplyr::inner_join(
+        x = primary_df %>%
+          dplyr::mutate(JOIN_ADDRESS = .data[[primary_address_col]]),
+        y = lookup_df %>%
+          dplyr::mutate(JOIN_ADDRESS = .data[[lookup_address_col]]),
+        by = c(primary_postcode_col, "JOIN_ADDRESS"),
+        suffix = c("", "_LOOKUP")
+        ) %>%
+        dplyr::select(ID, ID_LOOKUP) %>%
+        dplyr::mutate(
+          SCORE = 1,
+          MATCH_TYPE = "EXACT"
+        )
+
+      # Identify then tokenise non exact matches
+      non_exact_match_df <- primary_df %>%
+        dplyr::anti_join(
+          y = exact_match_df,
+          na_matches = "na",
+          by = "ID"
+        ) %>%
+        tidytext::unnest_tokens(
+          output = "TOKEN",
+          input = primary_address_col,
+          to_lower = FALSE,
+          drop = FALSE,
+          token = stringr::str_split,
+          pattern = "\\s"
+          ) %>%
+        dplyr::group_by(ID) %>%
+        dplyr::mutate(
+          TOKEN_WEIGHT = dplyr::if_else(grepl("[0-9]", TOKEN) == TRUE, 4, 1),
+          TOKEN_NUMBER = row_number(),
+          # Add the theoretical max score for each non exact match address
+          MAX_SCORE = sum(TOKEN_WEIGHT, na.rm = TRUE)
+        ) %>%
+        dplyr::ungroup()
+
+      # Tokenise lookup addresses
+      lookup_tokens_df <- lookup_df %>%
+        tidytext::unnest_tokens(
+          output = "TOKEN",
+          input = lookup_address_col,
+          to_lower = FALSE,
+          drop = FALSE,
+          token = stringr::str_split,
+          pattern = "\\s"
+          ) %>%
+        # Only need 1 instance of token per lookup ID
+        dplyr::distinct() %>%
+        dplyr::mutate(
+          TOKEN_WEIGHT = dplyr::if_else(grepl("[0-9]", TOKEN) == TRUE, 4, 1)
+          )
+
+      # Score remaining matches
+      non_exact_match_df = dplyr::full_join(
+        x = non_exact_match_df,
+        y = lookup_tokens_df,
+        by = c(primary_postcode_col, "TOKEN_WEIGHT"),
+        suffix = c("", "_LOOKUP"),
+        relationship = "many-to-many"
+        ) %>%
+        # Score remaining token pairs
+        dplyr::mutate(
+          SCORE = dplyr::case_when(
+            # Exact matches
+            TOKEN == TOKEN_LOOKUP ~ 1,
+            (TOKEN != TOKEN_LOOKUP) & (TOKEN_WEIGHT == 4) ~ 0,
+            TOKEN != TOKEN_LOOKUP ~ stringdist::stringsim(
+              a = TOKEN,
+              b = TOKEN_LOOKUP,
+              method = "jw",
+              p = 0.1
+              )
+            )
+        ) %>%
+        # Remove tokens with score less than 0.8 then multiple by weight
+        dplyr::filter(SCORE > 0.8) %>%
+        dplyr::mutate(SCORE = SCORE * TOKEN_WEIGHT) %>%
+        # Max score per token
+        dplyr::group_by(ID, ID_LOOKUP, MAX_SCORE, TOKEN_NUMBER) %>%
+        dplyr::summarise(SCORE = max(SCORE, na.rm = TRUE)) %>%
+        dplyr::ungroup() %>%
+        # Sum scores per ID pair & generate score out of maximum score
+        dplyr::group_by(ID, ID_LOOKUP, MAX_SCORE) %>%
+        dplyr::summarise(SCORE = sum(SCORE, na.rm = TRUE)) %>%
+        dplyr::mutate(SCORE = SCORE / MAX_SCORE) %>%
+        dplyr::ungroup() %>%
+        dplyr::select(-MAX_SCORE) %>%
+        # Slice top score with ties per primary df ID
+        dplyr::group_by(ID) %>%
+        dplyr::slice_max(order_by = SCORE, with_ties = TRUE) %>%
+        dplyr::ungroup() %>%
+        dplyr::mutate(MATCH_TYPE = "NON-EXACT")
+
+      # Identify records not exact or non-exact matched
+      no_match_df = primary_df %>%
+        dplyr::anti_join(
+          exact_match_df %>% select(ID) %>% distinct(),
+          by = "ID"
+        ) %>%
+        dplyr::anti_join(
+          non_exact_match_df %>% select(ID) %>% distinct(),
+          by = "ID"
+        ) %>%
+        dplyr::transmute(
+          ID,
+          ID_LOOKUP = NA,
+          SCORE = 0,
+          MATCH_TYPE = "NONE"
+        )
+
+      # Stack the exact and non exact matches together and output
+      output = dplyr::bind_rows(exact_match_df, non_exact_match_df, no_match_df) %>%
+        dplyr::left_join(
+          primary_df,
+          by = "ID"
+        ) %>%
+        dplyr::left_join(
+          lookup_df,
+          by = "ID_LOOKUP",
+          suffix = c("", "_LOOKUP")
+        ) %>%
+        dplyr::group_by(ID) %>%
+        dplyr::mutate(MATCH_COUNT = n_distinct(ID_LOOKUP, na.rm = TRUE)) %>%
+        dplyr::ungroup() %>%
+        dplyr::select(-c(ID, ID_LOOKUP)); gc()
+
+      # Return output
+      return(output)
+  },
+  error = function(cond){
+    gc()
+    items = c(
+    "The size of the datasets attempting to be matched are too big and the function has failed.",
+    "Try looping through groups of postcodes, such as several thousand at a time.",
+    "Also ensure each matching df only contains distinct address-postcode records, to streamline the process.",
+    "Although RAM dependent, the function has the ability to match up to tens of thousands of records on each side.",
+    "For large-scale matching tasks, please use the database version of this function: addressMatchR::calc_match_addresses()."
+    )
+    message(paste(" ", items, sep = "\n"))
+    return(NULL)
+    }
+  )
+  return(out)
+}
diff --git a/R/tidy_postcode.R b/R/tidy_postcode.R
@@ -1,4 +1,4 @@
-#' Tidy a postcode
+#' Tidy a lazy_table postcode
 #'
 #' Tidy a postcode so it is ready for joining.
 #'
@@ -13,9 +13,7 @@ tidy_postcode <- function(df, col) {
   # Tide the postcode column
   df %>%
     dplyr::mutate(
-
-      {{ col }} := REGEXP_REPLACE(toupper({{ col }}), # Uppercase
-                                  "[^A-Z0-9]", "")    # Remove anything not a character or digit
-
+      # Remove anything not a character or digit
+      {{ col }} := REGEXP_REPLACE(toupper({{ col }}), "[^A-Z0-9]", "")
     )
 }
diff --git a/R/tidy_postcode_df.R b/R/tidy_postcode_df.R
@@ -0,0 +1,19 @@
+#' Tidy a datarame postcode
+#'
+#' Tidy a postcode so it is ready for joining.
+#'
+#' @param df dataFrame
+#' @param col Postcode column
+#'
+#' @examples
+#'
+#' @export
+tidy_postcode_df <- function(df, col) {
+
+  # Tide the postcode column
+  df %>%
+    dplyr::mutate(
+      # Remove anything not a character or digit
+      {{ col }} := gsub("[^A-Z0-9]", "", toupper({{ col }}))
+    )
+}
diff --git a/R/tidy_single_line_address.R b/R/tidy_single_line_address.R
@@ -1,8 +1,8 @@
-#' Tidy a single line address
+#' Tidy a lazy_table single line address
 #'
 #' Tidy a single line address ready for tokenising.
 #'
-#' @param df Either a database table or a local data frame
+#' @param df Must be  lazy_table (i.e. not a data frame)
 #' @param col Single line address column
 #' @param remove_postcode If to remove the postcode. Default is FALSE
 #'
@@ -13,59 +13,27 @@ tidy_single_line_address <- function(df, col, remove_postcode = FALSE) {
 
   # Remove postcode from single line address if necessary (e.g. after last ",")
   if (remove_postcode) {
-
-    # Check if lazy table
-    if(inherits(df, c("tbl_dbi", "tbl_lazy"))){
-
-      # Process as lazy table
-      df <- df %>%
-        dplyr::mutate({{ col }} := REGEXP_REPLACE({{ col }}, "[,][^,]+$", ""))
-    }else{
-
-      # Else process as df
-      df <- df %>%
-        dplyr::mutate({{ col }} := gsub("[,][^,]+$", "", {{ col }}))
-    }
+    # Process as lazy table
+    df <- df %>%
+      dplyr::mutate({{ col }} := REGEXP_REPLACE({{ col }}, "[,][^,]+$", ""))
   }
 
-  # Check if object is a lazy table
-  if(inherits(df, c("tbl_dbi", "tbl_lazy"))){
-
-    # Process as a lazy frame
-    df %>%
-      dplyr::mutate(
-        {{ col }} := trimws(REPLACE(REGEXP_REPLACE(REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(toupper({{ col }}),             # Uppercase
-                                                                                                                "[,.();:#'']", " "),      # replace special characters with a single space
-                                                                                                                "(\\d)(\\D)", "\\1 \\2"), # add a space between any digit followed by a non-digit (e.g. 1A becomes 1 A)
-                                                                                                                "(\\D)(\\d)", "\\1 \\2"), # add a space between any non-digit followed by a digit (e.g. A1 becomes A 1)
-                                                                                                                "&", " AND "),            # replace the ampersand character with the string "and"
-                                                                                                                "( ){2,}", " "),          # replace any multiple spaces with a single space
-                                                                                                                " - ", "-")               # remove any spaces around a hyphen
-        ),
-
-        # Only remove spaces around hyphen if surrounded by numbers
-        {{ col }} := dplyr::case_when(
-          REGEXP_INSTR({{ col }}, "[0-9] - [0-9]") > 0L ~ REPLACE(" - ", "-", {{ col }}),
-          TRUE ~ {{ col }}
-        )
-      )
-  }else{
-
-    #Process as a data frame
-    df %>%
-      dplyr::mutate(
-        # Address cleaning
-        {{ col }} := toupper({{ col }}),
-        {{ col }} := gsub(" & ", " AND ", {{ col }}),
-        {{ col }} := gsub("(\\D)(\\d)", "\\1 \\2", {{ col }}),
-        {{ col }} := gsub("(\\d)(\\D)", "\\1 \\2", {{ col }}),
-        {{ col }} := gsub("[,.();:#''\"]", " ", {{ col }}),
-        {{ col }} := stringr::str_squish({{ col }}),
-        {{ col }} := ifelse(
-          grepl("[0-9] - [0-9]", {{ col }}) == TRUE,
-          gsub(" - ", "-", {{ col }}),
-          {{ col }}
-        )
+  # Process as a lazy frame
+  df %>%
+    dplyr::mutate(
+      {{ col }} := trimws(REPLACE(REGEXP_REPLACE(REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(toupper({{ col }}),             # Uppercase
+                                                                                                              "[,.();:#'']", " "),      # replace special characters with a single space
+                                                                                                              "(\\d)(\\D)", "\\1 \\2"), # add a space between any digit followed by a non-digit (e.g. 1A becomes 1 A)
+                                                                                                              "(\\D)(\\d)", "\\1 \\2"), # add a space between any non-digit followed by a digit (e.g. A1 becomes A 1)
+                                                                                                              "&", " AND "),            # replace the ampersand character with the string "and"
+                                                                                                              "( ){2,}", " "),          # replace any multiple spaces with a single space
+                                                                                                              " - ", "-")               # remove any spaces around a hyphen
+      ),
+
+      # Only remove spaces around hyphen if surrounded by numbers
+      {{ col }} := dplyr::case_when(
+        REGEXP_INSTR({{ col }}, "[0-9] - [0-9]") > 0L ~ REPLACE(" - ", "-", {{ col }}),
+        TRUE ~ {{ col }}
       )
-  }
+    )
 }
diff --git a/R/tidy_single_line_address_df.R b/R/tidy_single_line_address_df.R
@@ -0,0 +1,36 @@
+#' Tidy a dataframe single line address
+#'
+#' Tidy a single line address ready for tokenising.
+#'
+#' @param df Must be a dataframe (i.e. not a lazy_frame)
+#' @param col Single line address column
+#' @param remove_postcode If to remove the postcode. Default is FALSE
+#'
+#' @examples
+#' @export
+#'
+tidy_single_line_address_df <- function(df, col, remove_postcode = FALSE) {
+
+  # Remove postcode from single line address if necessary (e.g. after last ",")
+  if (remove_postcode) {
+    # Else process as df
+    df <- df %>% dplyr::mutate({{ col }} := gsub("[,][^,]+$", "", {{ col }}))
+    }
+
+  # Process df
+  df %>%
+    dplyr::mutate(
+      # Address cleaning
+      {{ col }} := toupper({{ col }}),
+      {{ col }} := gsub(" & ", " AND ", {{ col }}),
+      {{ col }} := gsub("(\\D)(\\d)", "\\1 \\2", {{ col }}),
+      {{ col }} := gsub("(\\d)(\\D)", "\\1 \\2", {{ col }}),
+      {{ col }} := gsub("[,.();:#''\"]", " ", {{ col }}),
+      {{ col }} := stringr::str_squish({{ col }}),
+      {{ col }} := ifelse(
+        grepl("[0-9] - [0-9]", {{ col }}) == TRUE,
+        gsub(" - ", "-", {{ col }}),
+        {{ col }}
+      )
+    )
+}