Skip to content

Commit

Permalink
Merge pull request #41 from nhsbsa-data-analytics/non-db-calc-match-a…
Browse files Browse the repository at this point in the history
…ddresses

Non db calc match addresses
  • Loading branch information
AdnanShroufi authored May 15, 2024
2 parents bb052c8 + 094dfad commit f3396e8
Show file tree
Hide file tree
Showing 5 changed files with 286 additions and 59 deletions.
206 changes: 206 additions & 0 deletions R/calc_match_addresses_df.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#' Match two sets of addresses, using dataframes rather than lazy_tables
#'
#' Match a distinct dataframe of primary addresses to a distinct dataframe of
#' lookup addresses.
#'
#' Returns a dataframe of exact matches (postcode and single line address) and
#' non exact matches (postcode and fuzzy single line address). For non exact
#' matches it will retain draws.
#'
#' Non-matching records will also be returned in the output
#'
#' @param primary_df Dataframe of distinct primary addresses
#' @param primary_postcode_col Column containing the primary postcode
#' @param primary_address_col Column containing the primary single line address
#' @param lookup_df Dataframe of distinct lookup addresses
#' @param lookup_postcode_col Column containing the lookup postcode
#' @param lookup_address_col Column containing the lookup single line address
#'
#' @examples
#' @export
calc_match_addresses_df <- function(
primary_df,
primary_postcode_col,
primary_address_col,
lookup_df,
lookup_postcode_col,
lookup_address_col
) {
# Disable group by override groups notice
options(dplyr.summarise.inform = FALSE)

# Check if there are zero shared postcodes
row_count = lookup_df %>%
dplyr::rename("{primary_postcode_col}" := .data[[lookup_postcode_col]]) %>%
dplyr::inner_join(
primary_df,
by = primary_postcode_col,
relationship = "many-to-many"
) %>%
nrow(.)

# Break if zero shared postcodes
if(row_count == 0){
message("There are no shared postcodes between the datasets so no matching was possible")
return(NULL)
}

# Catch error when supplied data is too big
out = tryCatch(
{
# Generate primary df ID
primary_df = primary_df %>%
dplyr::mutate(ID = dplyr::row_number())

# Generate lookup df ID
lookup_df = lookup_df %>%
dplyr::mutate(ID_LOOKUP = dplyr::row_number()) %>%
dplyr::rename("{primary_postcode_col}" := .data[[lookup_postcode_col]])

# Find exact matches
exact_match_df = dplyr::inner_join(
x = primary_df %>%
dplyr::mutate(JOIN_ADDRESS = .data[[primary_address_col]]),
y = lookup_df %>%
dplyr::mutate(JOIN_ADDRESS = .data[[lookup_address_col]]),
by = c(primary_postcode_col, "JOIN_ADDRESS"),
suffix = c("", "_LOOKUP")
) %>%
dplyr::select(ID, ID_LOOKUP) %>%
dplyr::mutate(
SCORE = 1,
MATCH_TYPE = "EXACT"
)

# Identify then tokenise non exact matches
non_exact_match_df <- primary_df %>%
dplyr::anti_join(
y = exact_match_df,
na_matches = "na",
by = "ID"
) %>%
tidytext::unnest_tokens(
output = "TOKEN",
input = primary_address_col,
to_lower = FALSE,
drop = FALSE,
token = stringr::str_split,
pattern = "\\s"
) %>%
dplyr::group_by(ID) %>%
dplyr::mutate(
TOKEN_WEIGHT = dplyr::if_else(grepl("[0-9]", TOKEN) == TRUE, 4, 1),
TOKEN_NUMBER = row_number(),
# Add the theoretical max score for each non exact match address
MAX_SCORE = sum(TOKEN_WEIGHT, na.rm = TRUE)
) %>%
dplyr::ungroup()

# Tokenise lookup addresses
lookup_tokens_df <- lookup_df %>%
tidytext::unnest_tokens(
output = "TOKEN",
input = lookup_address_col,
to_lower = FALSE,
drop = FALSE,
token = stringr::str_split,
pattern = "\\s"
) %>%
# Only need 1 instance of token per lookup ID
dplyr::distinct() %>%
dplyr::mutate(
TOKEN_WEIGHT = dplyr::if_else(grepl("[0-9]", TOKEN) == TRUE, 4, 1)
)

# Score remaining matches
non_exact_match_df = dplyr::full_join(
x = non_exact_match_df,
y = lookup_tokens_df,
by = c(primary_postcode_col, "TOKEN_WEIGHT"),
suffix = c("", "_LOOKUP"),
relationship = "many-to-many"
) %>%
# Score remaining token pairs
dplyr::mutate(
SCORE = dplyr::case_when(
# Exact matches
TOKEN == TOKEN_LOOKUP ~ 1,
(TOKEN != TOKEN_LOOKUP) & (TOKEN_WEIGHT == 4) ~ 0,
TOKEN != TOKEN_LOOKUP ~ stringdist::stringsim(
a = TOKEN,
b = TOKEN_LOOKUP,
method = "jw",
p = 0.1
)
)
) %>%
# Remove tokens with score less than 0.8 then multiple by weight
dplyr::filter(SCORE > 0.8) %>%
dplyr::mutate(SCORE = SCORE * TOKEN_WEIGHT) %>%
# Max score per token
dplyr::group_by(ID, ID_LOOKUP, MAX_SCORE, TOKEN_NUMBER) %>%
dplyr::summarise(SCORE = max(SCORE, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
# Sum scores per ID pair & generate score out of maximum score
dplyr::group_by(ID, ID_LOOKUP, MAX_SCORE) %>%
dplyr::summarise(SCORE = sum(SCORE, na.rm = TRUE)) %>%
dplyr::mutate(SCORE = SCORE / MAX_SCORE) %>%
dplyr::ungroup() %>%
dplyr::select(-MAX_SCORE) %>%
# Slice top score with ties per primary df ID
dplyr::group_by(ID) %>%
dplyr::slice_max(order_by = SCORE, with_ties = TRUE) %>%
dplyr::ungroup() %>%
dplyr::mutate(MATCH_TYPE = "NON-EXACT")

# Identify records not exact or non-exact matched
no_match_df = primary_df %>%
dplyr::anti_join(
exact_match_df %>% select(ID) %>% distinct(),
by = "ID"
) %>%
dplyr::anti_join(
non_exact_match_df %>% select(ID) %>% distinct(),
by = "ID"
) %>%
dplyr::transmute(
ID,
ID_LOOKUP = NA,
SCORE = 0,
MATCH_TYPE = "NONE"
)

# Stack the exact and non exact matches together and output
output = dplyr::bind_rows(exact_match_df, non_exact_match_df, no_match_df) %>%
dplyr::left_join(
primary_df,
by = "ID"
) %>%
dplyr::left_join(
lookup_df,
by = "ID_LOOKUP",
suffix = c("", "_LOOKUP")
) %>%
dplyr::group_by(ID) %>%
dplyr::mutate(MATCH_COUNT = n_distinct(ID_LOOKUP, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::select(-c(ID, ID_LOOKUP)); gc()

# Return output
return(output)
},
error = function(cond){
gc()
items = c(
"The size of the datasets attempting to be matched are too big and the function has failed.",
"Try looping through groups of postcodes, such as several thousand at a time.",
"Also ensure each matching df only contains distinct address-postcode records, to streamline the process.",
"Although RAM dependent, the function has the ability to match up to tens of thousands of records on each side.",
"For large-scale matching tasks, please use the database version of this function: addressMatchR::calc_match_addresses()."
)
message(paste(" ", items, sep = "\n"))
return(NULL)
}
)
return(out)
}
8 changes: 3 additions & 5 deletions R/tidy_postcode.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' Tidy a postcode
#' Tidy a lazy_table postcode
#'
#' Tidy a postcode so it is ready for joining.
#'
Expand All @@ -13,9 +13,7 @@ tidy_postcode <- function(df, col) {
# Tide the postcode column
df %>%
dplyr::mutate(

{{ col }} := REGEXP_REPLACE(toupper({{ col }}), # Uppercase
"[^A-Z0-9]", "") # Remove anything not a character or digit

# Remove anything not a character or digit
{{ col }} := REGEXP_REPLACE(toupper({{ col }}), "[^A-Z0-9]", "")
)
}
19 changes: 19 additions & 0 deletions R/tidy_postcode_df.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' Tidy a datarame postcode
#'
#' Tidy a postcode so it is ready for joining.
#'
#' @param df dataFrame
#' @param col Postcode column
#'
#' @examples
#'
#' @export
tidy_postcode_df <- function(df, col) {

# Tide the postcode column
df %>%
dplyr::mutate(
# Remove anything not a character or digit
{{ col }} := gsub("[^A-Z0-9]", "", toupper({{ col }}))
)
}
76 changes: 22 additions & 54 deletions R/tidy_single_line_address.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Tidy a single line address
#' Tidy a lazy_table single line address
#'
#' Tidy a single line address ready for tokenising.
#'
#' @param df Either a database table or a local data frame
#' @param df Must be lazy_table (i.e. not a data frame)
#' @param col Single line address column
#' @param remove_postcode If to remove the postcode. Default is FALSE
#'
Expand All @@ -13,59 +13,27 @@ tidy_single_line_address <- function(df, col, remove_postcode = FALSE) {

# Remove postcode from single line address if necessary (e.g. after last ",")
if (remove_postcode) {

# Check if lazy table
if(inherits(df, c("tbl_dbi", "tbl_lazy"))){

# Process as lazy table
df <- df %>%
dplyr::mutate({{ col }} := REGEXP_REPLACE({{ col }}, "[,][^,]+$", ""))
}else{

# Else process as df
df <- df %>%
dplyr::mutate({{ col }} := gsub("[,][^,]+$", "", {{ col }}))
}
# Process as lazy table
df <- df %>%
dplyr::mutate({{ col }} := REGEXP_REPLACE({{ col }}, "[,][^,]+$", ""))
}

# Check if object is a lazy table
if(inherits(df, c("tbl_dbi", "tbl_lazy"))){

# Process as a lazy frame
df %>%
dplyr::mutate(
{{ col }} := trimws(REPLACE(REGEXP_REPLACE(REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(toupper({{ col }}), # Uppercase
"[,.();:#'']", " "), # replace special characters with a single space
"(\\d)(\\D)", "\\1 \\2"), # add a space between any digit followed by a non-digit (e.g. 1A becomes 1 A)
"(\\D)(\\d)", "\\1 \\2"), # add a space between any non-digit followed by a digit (e.g. A1 becomes A 1)
"&", " AND "), # replace the ampersand character with the string "and"
"( ){2,}", " "), # replace any multiple spaces with a single space
" - ", "-") # remove any spaces around a hyphen
),

# Only remove spaces around hyphen if surrounded by numbers
{{ col }} := dplyr::case_when(
REGEXP_INSTR({{ col }}, "[0-9] - [0-9]") > 0L ~ REPLACE(" - ", "-", {{ col }}),
TRUE ~ {{ col }}
)
)
}else{

#Process as a data frame
df %>%
dplyr::mutate(
# Address cleaning
{{ col }} := toupper({{ col }}),
{{ col }} := gsub(" & ", " AND ", {{ col }}),
{{ col }} := gsub("(\\D)(\\d)", "\\1 \\2", {{ col }}),
{{ col }} := gsub("(\\d)(\\D)", "\\1 \\2", {{ col }}),
{{ col }} := gsub("[,.();:#''\"]", " ", {{ col }}),
{{ col }} := stringr::str_squish({{ col }}),
{{ col }} := ifelse(
grepl("[0-9] - [0-9]", {{ col }}) == TRUE,
gsub(" - ", "-", {{ col }}),
{{ col }}
)
# Process as a lazy frame
df %>%
dplyr::mutate(
{{ col }} := trimws(REPLACE(REGEXP_REPLACE(REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(toupper({{ col }}), # Uppercase
"[,.();:#'']", " "), # replace special characters with a single space
"(\\d)(\\D)", "\\1 \\2"), # add a space between any digit followed by a non-digit (e.g. 1A becomes 1 A)
"(\\D)(\\d)", "\\1 \\2"), # add a space between any non-digit followed by a digit (e.g. A1 becomes A 1)
"&", " AND "), # replace the ampersand character with the string "and"
"( ){2,}", " "), # replace any multiple spaces with a single space
" - ", "-") # remove any spaces around a hyphen
),

# Only remove spaces around hyphen if surrounded by numbers
{{ col }} := dplyr::case_when(
REGEXP_INSTR({{ col }}, "[0-9] - [0-9]") > 0L ~ REPLACE(" - ", "-", {{ col }}),
TRUE ~ {{ col }}
)
}
)
}
36 changes: 36 additions & 0 deletions R/tidy_single_line_address_df.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#' Tidy a dataframe single line address
#'
#' Tidy a single line address ready for tokenising.
#'
#' @param df Must be a dataframe (i.e. not a lazy_frame)
#' @param col Single line address column
#' @param remove_postcode If to remove the postcode. Default is FALSE
#'
#' @examples
#' @export
#'
tidy_single_line_address_df <- function(df, col, remove_postcode = FALSE) {

# Remove postcode from single line address if necessary (e.g. after last ",")
if (remove_postcode) {
# Else process as df
df <- df %>% dplyr::mutate({{ col }} := gsub("[,][^,]+$", "", {{ col }}))
}

# Process df
df %>%
dplyr::mutate(
# Address cleaning
{{ col }} := toupper({{ col }}),
{{ col }} := gsub(" & ", " AND ", {{ col }}),
{{ col }} := gsub("(\\D)(\\d)", "\\1 \\2", {{ col }}),
{{ col }} := gsub("(\\d)(\\D)", "\\1 \\2", {{ col }}),
{{ col }} := gsub("[,.();:#''\"]", " ", {{ col }}),
{{ col }} := stringr::str_squish({{ col }}),
{{ col }} := ifelse(
grepl("[0-9] - [0-9]", {{ col }}) == TRUE,
gsub(" - ", "-", {{ col }}),
{{ col }}
)
)
}

0 comments on commit f3396e8

Please sign in to comment.