Skip to content

Commit

Permalink
additions straight to main (used elsewhere already)
Browse files Browse the repository at this point in the history
  • Loading branch information
AdnanShroufi committed May 15, 2024
1 parent f3396e8 commit 823b849
Show file tree
Hide file tree
Showing 3 changed files with 243 additions and 7 deletions.
234 changes: 234 additions & 0 deletions R/calc_addressbase_plus_single_line_address_df.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@

#' Calculate AddressBase Plus DPA single line address.
#'
#' @param df local df with individual address base plus fields
#' @param include_postcode Whether or not to include postcode. Default is FALSE.
#'
#' @examples
#' @export
# Calculate AddressBase Plus DPA single line address
calc_addressbase_plus_dpa_single_line_address <- function(
df,
include_postcode = FALSE
) {
df <- df %>%
dplyr::mutate(
DPA_SINGLE_LINE_ADDRESS = paste0(
ifelse(
test = !is.na(DEPARTMENT_NAME),
yes = paste0(DEPARTMENT_NAME, ", "),
no = ""
),
ifelse(
test = !is.na(RM_ORGANISATION_NAME),
yes = paste0(RM_ORGANISATION_NAME, ", "),
no = ""
),
ifelse(
test = !is.na(SUB_BUILDING_NAME),
yes = paste0(SUB_BUILDING_NAME, ", "),
no = ""
),
ifelse(
test = !is.na(BUILDING_NAME),
yes = paste0(BUILDING_NAME, ", "),
no = ""
),
ifelse(
test = !is.na(BUILDING_NUMBER),
yes = paste0(BUILDING_NUMBER, " "),
no = ""
),
ifelse(
test = !is.na(PO_BOX_NUMBER),
yes = paste0("PO BOX ", PO_BOX_NUMBER, ", "),
no = ""
),
ifelse(
test = !is.na(DEP_THOROUGHFARE),
yes = paste0(DEP_THOROUGHFARE, ", "),
no = ""
),
ifelse(
test = !is.na(THOROUGHFARE),
yes = paste0(THOROUGHFARE, ", "),
no = ""
),
ifelse(
test = !is.na(DOU_DEP_LOCALITY),
yes = paste0(DOU_DEP_LOCALITY, ", "),
no = ""
),
ifelse(
test = !is.na(DEP_LOCALITY),
yes = paste0(DEP_LOCALITY, ", "),
no = ""
),
ifelse(
test = !is.na(POST_TOWN),
yes = paste0(POST_TOWN, ", "),
no = ""
)
)
)

# Add the postcode if necessary
if (include_postcode) {

df <- df %>%
dplyr::mutate(
DPA_SINGLE_LINE_ADDRESS = paste0(
DPA_SINGLE_LINE_ADDRESS,
ifelse(
test = !is.na(POSTCODE),
yes = paste0(POSTCODE),
no = ""
)
)
)
}
df
}


#' Calculate AddressBase Plus GEO single line address.
#'
#' @param df local df with individual address base plus fields
#' @param include_postcode Whether or not to include postcode. Default is FALSE.
#'
#' @examples
#' @export
# Calculate AddressBase Plus GEO single line address
calc_addressbase_plus_geo_single_line_address <- function(
df,
include_postcode=FALSE
) {
df <- df %>%
dplyr::mutate(
GEO_SINGLE_LINE_ADDRESS = paste0(
ifelse(
test = !is.na(LA_ORGANISATION),
yes = paste0(LA_ORGANISATION, ", "),
no = ""
),
ifelse(
test = !is.na(SAO_TEXT),
yes = paste0(SAO_TEXT, ", "),
no = ""
),
ifelse(
test = !is.na(SAO_START_NUMBER) &
is.na(SAO_START_SUFFIX) &
is.na(SAO_END_NUMBER),
yes = paste0(SAO_START_NUMBER, ", "),
no = ifelse(
test = is.na(SAO_START_NUMBER),
yes = "",
no = as.character(SAO_START_NUMBER)
)
),
ifelse(
test = !is.na(SAO_START_SUFFIX) & is.na(SAO_END_NUMBER),
yes = paste0(SAO_START_SUFFIX, ", "),
no = ifelse(
test = !is.na(SAO_START_SUFFIX) & !is.na(SAO_END_NUMBER),
yes = SAO_START_SUFFIX,
no = ""
)
),
ifelse(
test = !is.na(SAO_END_SUFFIX) & !is.na(SAO_END_NUMBER),
yes = "-",
no = ifelse(
test = !is.na(SAO_START_NUMBER) & !is.na(SAO_END_NUMBER),
yes = "-",
no = ""
)
),
ifelse(
test = !is.na(SAO_END_NUMBER) & is.na(SAO_END_SUFFIX),
yes = paste0(SAO_END_NUMBER, ", "),
no = ifelse(
test = is.na(SAO_END_NUMBER),
yes = "",
no = as.character(SAO_END_NUMBER)
)
),
ifelse(
test = !is.na(PAO_TEXT),
yes = paste0(PAO_TEXT, ", "),
no = ""
),
ifelse(
test = !is.na(PAO_START_NUMBER) &
is.na(PAO_START_SUFFIX) &
is.na(PAO_END_NUMBER),
yes = paste0(PAO_START_NUMBER, ", "),
no = ifelse(
test = is.na(PAO_START_NUMBER),
yes = "",
no = as.character(PAO_START_NUMBER)
)
),
ifelse(
test = !is.na(PAO_START_SUFFIX) & is.na(PAO_END_NUMBER),
yes = paste0(PAO_START_SUFFIX, ", "),
no = ifelse(
test = !is.na(PAO_START_SUFFIX) & !is.na(PAO_END_NUMBER),
yes = PAO_START_SUFFIX,
no = ""
)
),
ifelse(
test = !is.na(PAO_END_SUFFIX) & !is.na(PAO_END_NUMBER),
yes = "-",
no = ifelse(
test = !is.na(PAO_START_NUMBER) & !is.na(PAO_END_NUMBER),
yes = "-",
no = ""
)
),
ifelse(
test = !is.na(PAO_END_NUMBER) & is.na(PAO_END_SUFFIX),
yes = paste0(PAO_END_NUMBER, ", "),
no = ifelse(
test = is.na(PAO_END_NUMBER),
yes = "",
no = as.character(PAO_END_NUMBER)
)
),
ifelse(
test = !is.na(STREET_DESCRIPTION),
yes = paste0(STREET_DESCRIPTION, ", "),
no = ""
),
ifelse(
test = !is.na(LOCALITY),
yes = paste0(LOCALITY, ", "),
no = ""
),
ifelse(
test = !is.na(TOWN_NAME),
yes = paste0(TOWN_NAME, ", "),
no = ""
)
)
)

# Add the postcode if necessary
if (include_postcode) {

df <- df %>%
dplyr::mutate(
GEO_SINGLE_LINE_ADDRESS = paste0(
GEO_SINGLE_LINE_ADDRESS,
ifelse(
test = !is.na(POSTCODE_LOCATOR),
yes = paste0(POSTCODE_LOCATOR),
no = ""
)
)
)
}
df
}
15 changes: 8 additions & 7 deletions R/tidy_single_line_address.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@ tidy_single_line_address <- function(df, col, remove_postcode = FALSE) {
# Process as a lazy frame
df %>%
dplyr::mutate(
{{ col }} := trimws(REPLACE(REGEXP_REPLACE(REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(toupper({{ col }}), # Uppercase
"[,.();:#'']", " "), # replace special characters with a single space
"(\\d)(\\D)", "\\1 \\2"), # add a space between any digit followed by a non-digit (e.g. 1A becomes 1 A)
"(\\D)(\\d)", "\\1 \\2"), # add a space between any non-digit followed by a digit (e.g. A1 becomes A 1)
"&", " AND "), # replace the ampersand character with the string "and"
"( ){2,}", " "), # replace any multiple spaces with a single space
" - ", "-") # remove any spaces around a hyphen
{{ col }} := trimws(REGEXP_REPLACE(REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(toupper({{ col }}), # Uppercase
"([A-Za-z])''([A-Za-z])", '\\1\\2'), # replace apostrophe between letters
"[,.();:#'']", " "), # replace special characters with a single space
"(\\d)(\\D)", "\\1 \\2"), # add a space between any digit followed by a non-digit (e.g. 1A becomes 1 A)
"(\\D)(\\d)", "\\1 \\2"), # add a space between any non-digit followed by a digit (e.g. A1 becomes A 1)
"&", " AND "), # replace the ampersand character with the string "and"
"( ){2,}", " ") # replace any multiple spaces with a single space

),

# Only remove spaces around hyphen if surrounded by numbers
Expand Down
1 change: 1 addition & 0 deletions R/tidy_single_line_address_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ tidy_single_line_address_df <- function(df, col, remove_postcode = FALSE) {
dplyr::mutate(
# Address cleaning
{{ col }} := toupper({{ col }}),
{{ col }} := gsub("([A-Za-z])''([A-Za-z])", "\\1\\2", {{ col }}),
{{ col }} := gsub(" & ", " AND ", {{ col }}),
{{ col }} := gsub("(\\D)(\\d)", "\\1 \\2", {{ col }}),
{{ col }} := gsub("(\\d)(\\D)", "\\1 \\2", {{ col }}),
Expand Down

0 comments on commit 823b849

Please sign in to comment.