Merge pull request #41 from nhsbsa-data-analytics/non-db-calc-match-addresses

Non db calc match addresses
Showing 5 changed files with 286 additions and 59 deletions.
@@ -0,0 +1,206 @@
#' Match two sets of addresses, using dataframes rather than lazy tables
#'
#' Match a distinct dataframe of primary addresses to a distinct dataframe of
#' lookup addresses.
#'
#' Returns a dataframe of exact matches (postcode and single line address) and
#' non-exact matches (postcode and fuzzy single line address). For non-exact
#' matches, tied top scores (draws) are retained.
#'
#' Non-matching records are also returned in the output.
#'
#' @param primary_df Dataframe of distinct primary addresses
#' @param primary_postcode_col Column containing the primary postcode
#' @param primary_address_col Column containing the primary single line address
#' @param lookup_df Dataframe of distinct lookup addresses
#' @param lookup_postcode_col Column containing the lookup postcode
#' @param lookup_address_col Column containing the lookup single line address
#'
#' @examples
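#' # A minimal, illustrative sketch using made-up column names and toy data:
#' \dontrun{
#' primary_df <- data.frame(
#'   POSTCODE = c("AB12CD", "AB12CD"),
#'   ADDRESS = c("1 HIGH STREET", "2 HIGH STREET")
#' )
#' lookup_df <- data.frame(
#'   POSTCODE = c("AB12CD", "AB12CD"),
#'   ADDRESS = c("1 HIGH STREET", "2 HIGH STRETE")
#' )
#' calc_match_addresses_df(
#'   primary_df, "POSTCODE", "ADDRESS",
#'   lookup_df, "POSTCODE", "ADDRESS"
#' )
#' }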
#' @export
calc_match_addresses_df <- function(
  primary_df,
  primary_postcode_col,
  primary_address_col,
  lookup_df,
  lookup_postcode_col,
  lookup_address_col
) {
  # Disable the dplyr::summarise() regrouping message
  options(dplyr.summarise.inform = FALSE)

  # Check if there are zero shared postcodes
  row_count <- lookup_df %>%
    dplyr::rename("{primary_postcode_col}" := .data[[lookup_postcode_col]]) %>%
    dplyr::inner_join(
      primary_df,
      by = primary_postcode_col,
      relationship = "many-to-many"
    ) %>%
    nrow()

  # Stop early if there are zero shared postcodes
  if (row_count == 0) {
    message("There are no shared postcodes between the datasets so no matching was possible")
    return(NULL)
  }

  # Catch the error thrown when the supplied data is too big
  out <- tryCatch(
    {
      # Generate primary df ID
      primary_df <- primary_df %>%
        dplyr::mutate(ID = dplyr::row_number())

      # Generate lookup df ID
      lookup_df <- lookup_df %>%
        dplyr::mutate(ID_LOOKUP = dplyr::row_number()) %>%
        dplyr::rename("{primary_postcode_col}" := .data[[lookup_postcode_col]])

      # Find exact matches
      exact_match_df <- dplyr::inner_join(
        x = primary_df %>%
          dplyr::mutate(JOIN_ADDRESS = .data[[primary_address_col]]),
        y = lookup_df %>%
          dplyr::mutate(JOIN_ADDRESS = .data[[lookup_address_col]]),
        by = c(primary_postcode_col, "JOIN_ADDRESS"),
        suffix = c("", "_LOOKUP")
      ) %>%
        dplyr::select(ID, ID_LOOKUP) %>%
        dplyr::mutate(
          SCORE = 1,
          MATCH_TYPE = "EXACT"
        )

      # Identify then tokenise non-exact matches
      non_exact_match_df <- primary_df %>%
        dplyr::anti_join(
          y = exact_match_df,
          na_matches = "na",
          by = "ID"
        ) %>%
        tidytext::unnest_tokens(
          output = "TOKEN",
          input = primary_address_col,
          to_lower = FALSE,
          drop = FALSE,
          token = stringr::str_split,
          pattern = "\\s"
        ) %>%
        dplyr::group_by(ID) %>%
        dplyr::mutate(
          TOKEN_WEIGHT = dplyr::if_else(grepl("[0-9]", TOKEN), 4, 1),
          TOKEN_NUMBER = dplyr::row_number(),
          # Add the theoretical max score for each non-exact match address
          MAX_SCORE = sum(TOKEN_WEIGHT, na.rm = TRUE)
        ) %>%
        dplyr::ungroup()

      # Tokenise lookup addresses
      lookup_tokens_df <- lookup_df %>%
        tidytext::unnest_tokens(
          output = "TOKEN",
          input = lookup_address_col,
          to_lower = FALSE,
          drop = FALSE,
          token = stringr::str_split,
          pattern = "\\s"
        ) %>%
        # Only need one instance of each token per lookup ID
        dplyr::distinct() %>%
        dplyr::mutate(
          TOKEN_WEIGHT = dplyr::if_else(grepl("[0-9]", TOKEN), 4, 1)
        )

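      # Token scoring (illustrative): a primary token and a lookup token are
      # only paired within the same postcode and the same TOKEN_WEIGHT.
      # Identical tokens score 1; differing numeric-bearing tokens (weight 4)
      # score 0; all other pairs are scored with Jaro-Winkler similarity, e.g.
      # "STREET" vs "STRETE" scores roughly 0.97, clearing the 0.8 cut-off.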
      # Score remaining matches
      non_exact_match_df <- dplyr::full_join(
        x = non_exact_match_df,
        y = lookup_tokens_df,
        by = c(primary_postcode_col, "TOKEN_WEIGHT"),
        suffix = c("", "_LOOKUP"),
        relationship = "many-to-many"
      ) %>%
        # Score remaining token pairs
        dplyr::mutate(
          SCORE = dplyr::case_when(
            # Exact token matches
            TOKEN == TOKEN_LOOKUP ~ 1,
            # Differing numeric tokens score zero
            (TOKEN != TOKEN_LOOKUP) & (TOKEN_WEIGHT == 4) ~ 0,
            # Otherwise score with Jaro-Winkler similarity
            TOKEN != TOKEN_LOOKUP ~ stringdist::stringsim(
              a = TOKEN,
              b = TOKEN_LOOKUP,
              method = "jw",
              p = 0.1
            )
          )
        ) %>%
        # Remove tokens scoring 0.8 or less, then multiply by token weight
        dplyr::filter(SCORE > 0.8) %>%
        dplyr::mutate(SCORE = SCORE * TOKEN_WEIGHT) %>%
        # Max score per token
        dplyr::group_by(ID, ID_LOOKUP, MAX_SCORE, TOKEN_NUMBER) %>%
        dplyr::summarise(SCORE = max(SCORE, na.rm = TRUE)) %>%
        dplyr::ungroup() %>%
        # Sum scores per ID pair & generate score out of maximum score
        dplyr::group_by(ID, ID_LOOKUP, MAX_SCORE) %>%
        dplyr::summarise(SCORE = sum(SCORE, na.rm = TRUE)) %>%
        dplyr::mutate(SCORE = SCORE / MAX_SCORE) %>%
        dplyr::ungroup() %>%
        dplyr::select(-MAX_SCORE) %>%
        # Slice top score with ties per primary df ID
        dplyr::group_by(ID) %>%
        dplyr::slice_max(order_by = SCORE, with_ties = TRUE) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(MATCH_TYPE = "NON-EXACT")

      # Identify records that were neither exact nor non-exact matched
      no_match_df <- primary_df %>%
        dplyr::anti_join(
          exact_match_df %>% dplyr::select(ID) %>% dplyr::distinct(),
          by = "ID"
        ) %>%
        dplyr::anti_join(
          non_exact_match_df %>% dplyr::select(ID) %>% dplyr::distinct(),
          by = "ID"
        ) %>%
        dplyr::transmute(
          ID,
          ID_LOOKUP = NA,
          SCORE = 0,
          MATCH_TYPE = "NONE"
        )

      # Stack the exact, non-exact and unmatched records together and output
      output <- dplyr::bind_rows(exact_match_df, non_exact_match_df, no_match_df) %>%
        dplyr::left_join(
          primary_df,
          by = "ID"
        ) %>%
        dplyr::left_join(
          lookup_df,
          by = "ID_LOOKUP",
          suffix = c("", "_LOOKUP")
        ) %>%
        dplyr::group_by(ID) %>%
        dplyr::mutate(MATCH_COUNT = dplyr::n_distinct(ID_LOOKUP, na.rm = TRUE)) %>%
        dplyr::ungroup() %>%
        dplyr::select(-c(ID, ID_LOOKUP))

      gc()

      # Return output
      return(output)
    },
    error = function(cond) {
      gc()
      items <- c(
        "The datasets being matched are too big and the function has failed.",
        "Try looping through groups of postcodes, such as several thousand at a time.",
        "Also ensure each matching df only contains distinct address-postcode records, to streamline the process.",
        "Although RAM dependent, the function can match up to tens of thousands of records on each side.",
        "For large-scale matching tasks, please use the database version of this function: addressMatchR::calc_match_addresses()."
      )
      message(paste(" ", items, sep = "\n"))
      return(NULL)
    }
  )
  return(out)
}
@@ -0,0 +1,19 @@
#' Tidy a dataframe postcode
#'
#' Tidy a postcode column so it is ready for joining.
#'
#' @param df Dataframe
#' @param col Postcode column
#'
#' @examples
#'
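#' # A minimal, illustrative sketch using a made-up postcode column:
#' \dontrun{
#' tidy_postcode_df(
#'   df = data.frame(POSTCODE = c("ab1 2cd", "se1- 9na")),
#'   col = POSTCODE
#' )
#' }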
#' @export
tidy_postcode_df <- function(df, col) {

  # Tidy the postcode column
  df %>%
    dplyr::mutate(
      # Remove anything that is not a letter or digit
      {{ col }} := gsub("[^A-Z0-9]", "", toupper({{ col }}))
    )
}
@@ -0,0 +1,36 @@
#' Tidy a dataframe single line address
#'
#' Tidy a single line address ready for tokenising.
#'
#' @param df Must be a dataframe (i.e. not a lazy table)
#' @param col Single line address column
#' @param remove_postcode Whether to remove the postcode. Default is FALSE
#'
#' @examples
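#' # A minimal, illustrative sketch using a made-up address column:
#' \dontrun{
#' tidy_single_line_address_df(
#'   df = data.frame(ADDRESS = "Flat 1, 2-4 High Street & Co, AB1 2CD"),
#'   col = ADDRESS,
#'   remove_postcode = TRUE
#' )
#' }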
#' @export
#'
tidy_single_line_address_df <- function(df, col, remove_postcode = FALSE) {

  # Remove the postcode from the single line address if required (assumed to
  # follow the final ",")
  if (remove_postcode) {
    df <- df %>% dplyr::mutate({{ col }} := gsub("[,][^,]+$", "", {{ col }}))
  }

  # Process df
  df %>%
    dplyr::mutate(
      # Address cleaning
      {{ col }} := toupper({{ col }}),
      {{ col }} := gsub(" & ", " AND ", {{ col }}),
      # Insert a space between digits and adjacent non-digits
      {{ col }} := gsub("(\\D)(\\d)", "\\1 \\2", {{ col }}),
      {{ col }} := gsub("(\\d)(\\D)", "\\1 \\2", {{ col }}),
      # Replace punctuation with spaces, then collapse repeated whitespace
      {{ col }} := gsub("[,.();:#''\"]", " ", {{ col }}),
      {{ col }} := stringr::str_squish({{ col }}),
      # Rejoin split number ranges, e.g. "2 - 4" becomes "2-4"
      {{ col }} := ifelse(
        grepl("[0-9] - [0-9]", {{ col }}),
        gsub(" - ", "-", {{ col }}),
        {{ col }}
      )
    )
}
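Taken together, the three helpers form a simple workflow: tidy the postcodes and single line addresses on both sides, then match the tidied dataframes. A minimal sketch, assuming hypothetical column names and toy data, and that these functions are available (for example via devtools::load_all() or after installing the package):

library(magrittr) # for %>%

primary <- data.frame(
  POSTCODE = c("AB1 2CD", "AB1 2CD"),
  ADDRESS = c("1 High Street", "The Old Mill, Mill Lane")
)

lookup <- data.frame(
  POSTCODE = c("ab12cd", "ab12cd"),
  ADDRESS = c("1 HIGH STREET", "OLD MILL, MILL LANE")
)

# Tidy both sides so exact joins and tokenisation behave consistently
primary <- primary %>%
  tidy_postcode_df(col = POSTCODE) %>%
  tidy_single_line_address_df(col = ADDRESS)

lookup <- lookup %>%
  tidy_postcode_df(col = POSTCODE) %>%
  tidy_single_line_address_df(col = ADDRESS)

# Match the tidied addresses; exact, non-exact and unmatched records are returned
matches <- calc_match_addresses_df(
  primary_df = primary,
  primary_postcode_col = "POSTCODE",
  primary_address_col = "ADDRESS",
  lookup_df = lookup,
  lookup_postcode_col = "POSTCODE",
  lookup_address_col = "ADDRESS"
)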