From 8ada08ed01d844f65c2f95f6a78cbdb8aa75e0ed Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Thu, 20 Jun 2024 14:45:00 +0100 Subject: [PATCH 1/9] cqc funs --- .../workflow/01_upload_cqc_data_from_api.R | 207 +++++++++++++++++- data-raw/workflow/workflow_run_23_24.R | 47 ++++ 2 files changed, 244 insertions(+), 10 deletions(-) create mode 100644 data-raw/workflow/workflow_run_23_24.R diff --git a/data-raw/workflow/01_upload_cqc_data_from_api.R b/data-raw/workflow/01_upload_cqc_data_from_api.R index 2834b50..d55063c 100644 --- a/data-raw/workflow/01_upload_cqc_data_from_api.R +++ b/data-raw/workflow/01_upload_cqc_data_from_api.R @@ -1,23 +1,210 @@ -# Function to get api content from url -get_api_content <- function(url) { - # Get api data - data = httr::GET(url) +# Get cqc primary key from environ file +key = Sys.getenv("CQC_PRIMARY_KEY") + +# Function to get api content from url +get_api_content <- function(url){ + + # Get api data + data = httr::GET(url, httr::add_headers(`Ocp-Apim-Subscription-Key` = key)) + # Convert binary to character content = jsonlite::fromJSON(rawToChar(data$content)) - + # Return content return(content) } -# Get number of cqc pages for main api query -api_content <- get_api_content( - "https://api.cqc.org.uk/public/v1/locations?careHome=Y&page=1&perPage=1" +# Get number of pages +get_number_of_pages = function(){ + + # Define url + url = "https://api.service.cqc.org.uk/public/v1/locations" + + # Get locations overview + api_content = get_api_content(url) + + # Get number of pages + total_pages = api_content$totalPages + + # Return + return(total_pages) +} + +# Get all locations per page +get_location_ids_per_page = function(page_num){ + + # Define url + url = paste0( + "https://api.service.cqc.org.uk/public/v1/locations?page=", + page_num, + "&perPage=1000" + ) + + # Get locations overview + api_content = get_api_content(url) + + # Get locations ids + location_vec = api_content$locations$locationId + + # Return + return(location_vec) +} 
+ +# Get all locations by location_vec index +get_location_info_by_id <- function(loc_num) { + + # Paste location url with location_id + url = paste0( + "https://api.service.cqc.org.uk/public/v1/locations", + location_vec[loc_num] + ) + + # Get data + data = get_api_content(url) %>% + unlist() %>% + bind_rows() + + # Sleep if less than 2 rows + while (ncol(data) <= 2) { + Sys.sleep(0.05) + + data = get_api_content(url) %>% + unlist() %>% + bind_rows() + } + + # Return data + return(data) +} + +# Get total pages +total_pages = get_number_of_pages() + +# Get all location ids +location_vec = lapply(1:total_pages, get_location_ids_per_page) + +# Unlist into a single vector +location_vec = unlist(all_locations) + + +# Get columns names from a location id +get_col_names = function(index){ + + # Create url + url = paste0( + "https://api.service.cqc.org.uk/public/v1/locations/", + location_vec[index] + ) + + # Get data + data = get_api_content(url) + + # Column names + cols = names(data) + + # Return + return(cols) +} + +cols = lapply(1:length(location_vec), get_col_names) + + +data$assessment + + unlist() %>% + bind_rows() + + +data$uprn + +a = get_location_info_by_id(48) + +cbind( + data["name"], + data$specialisms, + data$regulatedActivities, + data["locationId"] ) -# Get number of 10k blocks required -no_of_pages = ceiling(api_content$total / 10000) + +names(data) + + + + + + +data$assessment +data$assessmentServiceGroup +data$numberOfBeds + +data$type +data$locationTypes + +cqc_cols = c( + 'name', + 'postalCode', + 'uprn', + 'locationId', + 'providerId', + 'organisationType', + 'type', + 'lastInspection', + 'deregistrationDate', + 'registrationStatus', + 'registrationDate', + 'postalAddressLine1', + 'postalAddressLine2', + 'postalAddressTownCity', + 'postalAddressCounty', + 'numberOfBeds', + 'gacServicesTypes', + 'gacServicesTypesNames', + 'regulatedActivities', + 'specialisms', + +) + + +uprn = as.numeric(uprn), +location_id, +provider_id, +last_inspection_date, 
+registration_date, +deregistration_date, +single_line_address, +postcode = toupper(gsub("[^[:alnum:]]", "", postal_code)), +nursing_home_flag = as.integer(grepl( + "Nursing home", gac_service_types_names +)), +residential_home_flag = as.integer(grepl( + "Residential home", gac_service_types_names +)), +# type, +number_of_beds = as.integer(number_of_beds), +current_rating = current_ratings_overall_rating, +key_question_names = current_ratings_overall_key_question_ratings_names, +key_question_ratings = current_ratings_overall_key_question_ratings_ratings, +cqc_date = download_date, +ods_code, +specialisms, +regulated_activities_names, +gac_service_types = gac_service_types_names, + + +which + +c = data[names(data) %in% cqc_cols] %>% + unlist() %>% + bind_rows() +c + + +data$uprn + + get_cqc_locations_details <- function(page_num) { diff --git a/data-raw/workflow/workflow_run_23_24.R b/data-raw/workflow/workflow_run_23_24.R new file mode 100644 index 0000000..66cf611 --- /dev/null +++ b/data-raw/workflow/workflow_run_23_24.R @@ -0,0 +1,47 @@ +# Load/install all required packages and functions +source("data-raw/workflow/workflow_packages.R") +source("data-raw/workflow/workflow_helpers.R") +source("data-raw/workflow/workflow_production.R") + +# Specify variables to retain at end of each script +keep_vars = c(ls(), 'keep_vars') + +# FY 22/23 --------------------------------------------------------------------- + +# 1. Get latest cqc data: 0.5hr - Run once in first epoch script +get_latest_cqc_data() + +# 2. Get latest ab plus epoch: ~2hr +get_abp_from_api( + end_date = "2024-03-31" +) + +# 3. Merge and process cqc and ab plus: ~3 mins +create_ab_plus_cqc_data( + ab_plus_data = "INT646_ABP_20230331", + cqc_data = "INT646_CQC_20230602", + start_date = "2022-04-01", + end_date = "2023-03-31" +) + +# 4. Create form level fact for records with a ch-postcode: ~11-14hr +create_form_level_patient_addresses( + address_data = "INT646_ABP_CQC_20220401_20230331" +) + +# 5. 
Match patient details against ch-postcode uprn and process: ~30-40 mins +create_care_home_address_match( + patient_address_data = "INT646_FORMS_20220401_20230331", + lookup_address_data = "INT646_ABP_CQC_20220401_20230331", + parent_uprn_data = "INT646_ABP_20230331" +) + +# 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min +# create_postcode_lookup() # Run once in first epoch script + + +# 7. Join to fact table and get non ch-postcode records within time frame: ~9 hrs +create_matched_prescription_base_table( + match_data = "INT646_MATCH_20220401_20230331", + form_data = "INT646_FORMS_20220401_20230331" +) From 3fcfb801168ed16b0fe7edaf843e362efdaa6a11 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Thu, 20 Jun 2024 17:49:30 +0100 Subject: [PATCH 2/9] updated cqc api --- .../workflow/01_upload_cqc_data_from_api.R | 212 +++++++----------- 1 file changed, 81 insertions(+), 131 deletions(-) diff --git a/data-raw/workflow/01_upload_cqc_data_from_api.R b/data-raw/workflow/01_upload_cqc_data_from_api.R index d55063c..c0c1f46 100644 --- a/data-raw/workflow/01_upload_cqc_data_from_api.R +++ b/data-raw/workflow/01_upload_cqc_data_from_api.R @@ -1,8 +1,34 @@ +# Functions and variables ------------------------------------------------------ # Get cqc primary key from environ file key = Sys.getenv("CQC_PRIMARY_KEY") +# Define cqc columns of interest +cqc_cols = c( + 'name', + 'postalCode', + 'uprn', + 'locationId', + 'providerId', + 'organisationType', + 'type', + 'lastInspection', + 'deregistrationDate', + 'registrationStatus', + 'registrationDate', + 'postalAddressLine1', + 'postalAddressLine2', + 'postalAddressTownCity', + 'postalAddressCounty', + 'numberOfBeds', + 'gacServicesTypes', + 'regulatedActivities', + 'specialisms', + 'current_ratings', + 'odsCode' +) + # Function to get api content from url get_api_content <- function(url){ @@ -20,7 +46,7 @@ get_api_content <- function(url){ get_number_of_pages = function(){ # Define 
url - url = "https://api.service.cqc.org.uk/public/v1/locations" + url = "https://api.service.cqc.org.uk/public/v1/locations?careHome=Y" # Get locations overview api_content = get_api_content(url) @@ -37,7 +63,7 @@ get_location_ids_per_page = function(page_num){ # Define url url = paste0( - "https://api.service.cqc.org.uk/public/v1/locations?page=", + "https://api.service.cqc.org.uk/public/v1/locations?careHome=Y&page=", page_num, "&perPage=1000" ) @@ -57,28 +83,30 @@ get_location_info_by_id <- function(loc_num) { # Paste location url with location_id url = paste0( - "https://api.service.cqc.org.uk/public/v1/locations", + "https://api.service.cqc.org.uk/public/v1/locations/", location_vec[loc_num] ) # Get data - data = get_api_content(url) %>% - unlist() %>% - bind_rows() + data = get_api_content(url) - # Sleep if less than 2 rows - while (ncol(data) <= 2) { - Sys.sleep(0.05) - - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - } + print(data) + + # Filter data + filtered_data = data[names(data) %in% cqc_cols] + + # Flat data + flat_data = filtered_data %>% + unlist() %>% + bind_rows() %>% + janitor::clean_names() # Return data - return(data) + return(flat_data) } +# Generate Output -------------------------------------------------------------- + # Get total pages total_pages = get_number_of_pages() @@ -86,35 +114,50 @@ total_pages = get_number_of_pages() location_vec = lapply(1:total_pages, get_location_ids_per_page) # Unlist into a single vector -location_vec = unlist(all_locations) +location_vec = unlist(location_vec) +# Generate appropriate number of cores +n_cores <- parallel::detectCores() - 1 -# Get columns names from a location id -get_col_names = function(index){ - - # Create url - url = paste0( - "https://api.service.cqc.org.uk/public/v1/locations/", - location_vec[index] - ) - - # Get data - data = get_api_content(url) - - # Column names - cols = names(data) - - # Return - return(cols) -} +# Set up parallel +clust <- 
parallel::makeCluster(n_cores) -cols = lapply(1:length(location_vec), get_col_names) +# Export libraries to cluster +parallel::clusterEvalQ( + cl = clust, + { + library(dplyr); + library(janitor); + library(httr); + library(jsonlite); + } +) + +# Export required objects to cluster +parallel::clusterExport( + cl = clust, + varlist = c( + "get_api_content", + "location_vec", + "key", + "cqc_cols" + ), + envir = environment() +) + +# Generate cqc details +Sys.time() +cqc_data <- parallel::parLapply( + cl = clust, + X = 1:length(location_vec), + fun = get_location_info_by_id +) +Sys.time() +# Stop Cluster +parallel::stopCluster(clust) -data$assessment - unlist() %>% - bind_rows() data$uprn @@ -136,101 +179,8 @@ names(data) -data$assessment -data$assessmentServiceGroup -data$numberOfBeds - -data$type -data$locationTypes - -cqc_cols = c( - 'name', - 'postalCode', - 'uprn', - 'locationId', - 'providerId', - 'organisationType', - 'type', - 'lastInspection', - 'deregistrationDate', - 'registrationStatus', - 'registrationDate', - 'postalAddressLine1', - 'postalAddressLine2', - 'postalAddressTownCity', - 'postalAddressCounty', - 'numberOfBeds', - 'gacServicesTypes', - 'gacServicesTypesNames', - 'regulatedActivities', - 'specialisms', - -) - - -uprn = as.numeric(uprn), -location_id, -provider_id, -last_inspection_date, -registration_date, -deregistration_date, -single_line_address, -postcode = toupper(gsub("[^[:alnum:]]", "", postal_code)), -nursing_home_flag = as.integer(grepl( - "Nursing home", gac_service_types_names -)), -residential_home_flag = as.integer(grepl( - "Residential home", gac_service_types_names -)), -# type, -number_of_beds = as.integer(number_of_beds), -current_rating = current_ratings_overall_rating, -key_question_names = current_ratings_overall_key_question_ratings_names, -key_question_ratings = current_ratings_overall_key_question_ratings_ratings, -cqc_date = download_date, -ods_code, -specialisms, -regulated_activities_names, -gac_service_types = 
gac_service_types_names, - - -which - -c = data[names(data) %in% cqc_cols] %>% - unlist() %>% - bind_rows() -c - - -data$uprn - - - -get_cqc_locations_details <- function(page_num) { - - # Url with page number pasted inside - url = paste0( - "https://api.cqc.org.uk/public/v1/locations?careHome=Y&page=", - page_num, - "&perPage=10000" - ) - - # Get api data - data = get_api_content(url) - - # Get location info as df within list - locations = data$locations - - # Return location info df - return(locations) -} -# Get all location info, with 10k records per page retrieved -cqc_locations <- lapply(1:no_of_pages, get_cqc_locations_details) %>% - bind_rows() -# Vector of locations -location_vec = cqc_locations %>% pull(locationId) # Function to query cqc api get_cqc_api_location_data <- function(loc_num) { From 36b3c425fbe597d5960bd9076432377a616a9fad Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 21 Jun 2024 11:19:26 +0100 Subject: [PATCH 3/9] updated cqc script finished --- .../workflow/01_upload_cqc_data_from_api.R | 297 ++++++------------ 1 file changed, 104 insertions(+), 193 deletions(-) diff --git a/data-raw/workflow/01_upload_cqc_data_from_api.R b/data-raw/workflow/01_upload_cqc_data_from_api.R index c0c1f46..7c1c435 100644 --- a/data-raw/workflow/01_upload_cqc_data_from_api.R +++ b/data-raw/workflow/01_upload_cqc_data_from_api.R @@ -4,6 +4,9 @@ # Get cqc primary key from environ file key = Sys.getenv("CQC_PRIMARY_KEY") +# Get current year_month +download_date <- as.integer(format(today(), "%Y%m%d")) + # Define cqc columns of interest cqc_cols = c( 'name', @@ -22,12 +25,26 @@ cqc_cols = c( 'postalAddressTownCity', 'postalAddressCounty', 'numberOfBeds', - 'gacServicesTypes', - 'regulatedActivities', + 'gacServiceTypes', 'specialisms', - 'current_ratings', + 'currentRatings', 'odsCode' -) + ) + +# Define provider cols +provider_cols = c( + "providerId", + "name", + "postalAddressLine1", + "postalAddressLine2", + "postalAddressTownCity", + 
"postalAddressCounty", + "region", + "postalCode", + "uprn", + "companiesHouseNumber", + "lastInspection" + ) # Function to get api content from url get_api_content <- function(url){ @@ -89,11 +106,34 @@ get_location_info_by_id <- function(loc_num) { # Get data data = get_api_content(url) + + # Filter data + filtered_data = data[names(data) %in% cqc_cols] + + # Flat data + flat_data = filtered_data %>% + unlist() %>% + bind_rows() %>% + janitor::clean_names() + + # Return data + return(flat_data) +} + +# Function to query cqc api +get_provider_info_by_id = function(loc_num){ + + # Paste location url with location_id + url = paste0( + "https://api.service.cqc.org.uk/public/v1/providers/", + provider_vec[loc_num] + ) - print(data) + # Get data + data = get_api_content(url) # Filter data - filtered_data = data[names(data) %in% cqc_cols] + filtered_data = data[names(data) %in% provider_cols] # Flat data flat_data = filtered_data %>% @@ -105,7 +145,16 @@ get_location_info_by_id <- function(loc_num) { return(flat_data) } -# Generate Output -------------------------------------------------------------- +# Concatenate all columns (uniquely and without NA) with same prefix +concatenate_by_prefix = function(df, prefix){ + df %>% + mutate({{ prefix }} := pmap_chr( + select(., starts_with(prefix)), + ~ paste(unique(na.omit(c(...))), collapse = "|") + )) +} + +# Generate location output ----------------------------------------------------- # Get total pages total_pages = get_number_of_pages() @@ -145,130 +194,26 @@ parallel::clusterExport( envir = environment() ) -# Generate cqc details -Sys.time() +# Generate cqc details: ~25 mins cqc_data <- parallel::parLapply( cl = clust, X = 1:length(location_vec), fun = get_location_info_by_id ) -Sys.time() # Stop Cluster parallel::stopCluster(clust) +# Format location output ------------------------------------------------------- - - -data$uprn - -a = get_location_info_by_id(48) - -cbind( - data["name"], - data$specialisms, - 
data$regulatedActivities, - data["locationId"] -) - - -names(data) - - - - - - - - - -# Function to query cqc api -get_cqc_api_location_data <- function(loc_num) { - - # Paste location url with location_id - url = paste0( - "https://api.cqc.org.uk/public/v1/locations/", - location_vec[loc_num] - ) - - # Get data - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - - while (ncol(data) <= 2) { - Sys.sleep(0.05) - - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - } - - - # Return data - return(data) -} - -# Generate appropriate number of cores -n_cores <- parallel::detectCores() - 2 - -# Set up parallel -clust <- parallel::makeCluster(n_cores) - -# Export libraries to cluster -parallel::clusterEvalQ( - cl = clust, - { - library(dplyr); - library(httr); - library(jsonlite); - } -) - -# Export required objects to cluster -parallel::clusterExport( - cl = clust, - varlist = c( - "get_api_content", - "get_cqc_api_location_data", - "location_vec" - ), - envir = environment() -) - -# Print script update -print("Now downloading CQC API location data ...") - -# Generate cqc details -cqc_details <- parallel::parLapply( - cl = clust, - X = 1:length(location_vec), - fun = get_cqc_api_location_data -) - -# Stop Cluster -parallel::stopCluster(clust) - -cqc_details_check <- cqc_details %>% map(\(x) x$locationId) -cqc_details_check <- cqc_details_check[is.na(cqc_details_check)] -num_missing_locations <- length(cqc_details_check) - -if (num_missing_locations > 0) stop("Missing locations, probably due to timeout!") - -# Get current year_month -download_date <- as.integer(format(today(), "%Y%m%d")) - -# Bind all dfs together and apply some transformations -cqc_details_df <- cqc_details %>% +# Bind all dfs together and clean column names +cqc_details_df <- cqc_data %>% bind_rows() %>% janitor::clean_names() %>% - unite_to_plural( - specialisms, - regulated_activities_names, - current_ratings_overall_key_question_ratings_names, - 
current_ratings_overall_key_question_ratings_ratings, - gac_service_types_names - ) %>% + concatenate_by_prefix(., 'specialisms') %>% + concatenate_by_prefix('current_ratings_overall_key_question_ratings_name') %>% + concatenate_by_prefix('current_ratings_overall_key_question_ratings_rating') %>% + concatenate_by_prefix('gac_service_types_name') %>% tidyr::unite( single_line_address, c( @@ -291,25 +236,25 @@ cqc_details_df <- cqc_details %>% single_line_address, postcode = toupper(gsub("[^[:alnum:]]", "", postal_code)), nursing_home_flag = as.integer(grepl( - "Nursing home", gac_service_types_names + "Nursing home", gac_service_types_name )), residential_home_flag = as.integer(grepl( - "Residential home", gac_service_types_names + "Residential home", gac_service_types_name )), # type, number_of_beds = as.integer(number_of_beds), current_rating = current_ratings_overall_rating, - key_question_names = current_ratings_overall_key_question_ratings_names, - key_question_ratings = current_ratings_overall_key_question_ratings_ratings, + key_question_names = current_ratings_overall_key_question_ratings_name, + key_question_ratings = current_ratings_overall_key_question_ratings_rating, cqc_date = download_date, ods_code, specialisms, - regulated_activities_names, - gac_service_types = gac_service_types_names, + gac_service_types = gac_service_types_name, .keep = "none" - ) + ) %>% + addressMatchR::tidy_single_line_address(single_line_address) -gc() +# Generate provider output ----------------------------------------------------- # Get provider id vec provider_vec = cqc_details_df %>% @@ -317,37 +262,8 @@ provider_vec = cqc_details_df %>% distinct() %>% pull() -# Function to query cqc api -get_cqc_api_provider_data = function(loc_num){ - - # Wait - Sys.sleep(0.05) - - # Paste location url with location_id - url = paste0( - "https://api.cqc.org.uk/public/v1/providers/", - provider_vec[loc_num] - ) - - # Get data - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - 
- while (ncol(data) <= 2) { - Sys.sleep(0.05) - - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - } - - # Return data - return(data) -} - # Generate appropriate number of cores -n_cores <- parallel::detectCores() - 2 +n_cores <- parallel::detectCores() - 1 # Set up parallel clust <- parallel::makeCluster(n_cores) @@ -357,6 +273,7 @@ parallel::clusterEvalQ( cl = clust, { library(dplyr); + library(janitor); library(httr); library(jsonlite); } @@ -367,46 +284,42 @@ parallel::clusterExport( cl = clust, varlist = c( "get_api_content", - "get_cqc_api_provider_data", - "provider_vec" + "provider_vec", + "key", + "provider_cols" ), envir = environment() ) -# Print script update -print("Now downloading CQC API provider data ...") - -# Generate cqc details -cqc_details <- parallel::parLapply( +# Generate provider details: ~15 mins +provider_data <- parallel::parLapply( cl = clust, X = 1:length(provider_vec), - fun = get_cqc_api_provider_data + fun = get_provider_info_by_id ) # Stop Cluster parallel::stopCluster(clust) -cqc_details_check <- cqc_details %>% map(\(x) x$locationId) -cqc_details_check <- cqc_details_check[is.na(cqc_details_check)] -num_missing_providers <- length(cqc_details_check) - -if (num_missing_providers > 0) stop("Missing providers, probably due to timeout!") +# Process provider output ------------------------------------------------------ # Process the provider data -cqc_providers_df = cqc_details %>% +cqc_providers_df = provider_data %>% bind_rows() %>% janitor::clean_names() %>% - mutate( - # Paste fields together to create single line address - provider_sla = toupper(paste( - ifelse(is.na(name), "", name), - ifelse(is.na(postal_address_line1), "", postal_address_line1), - ifelse(is.na(postal_address_town_city), "", postal_address_town_city), - ifelse(is.na(region), "", region) - )), - # Postcode Cleaning - postal_code = toupper(gsub("[^[:alnum:]]", "", postal_code)) + tidyr::unite( + provider_sla, + c( + name, + postal_address_line1, + 
postal_address_line2, + postal_address_town_city, + postal_address_county + ), + sep = " ", + na.rm = TRUE ) %>% + mutate(postal_code = toupper(gsub("[^[:alnum:]]", "", postal_code))) %>% select( provider_id, provider_uprn = uprn, @@ -414,21 +327,18 @@ cqc_providers_df = cqc_details %>% provider_sla, provider_postcode = postal_code, provider_last_inspection_date = last_inspection_date - ) + ) %>% + addressMatchR::tidy_single_line_address(provider_sla) + +# Join location and providder data then save ----------------------------------- # Process the cqc df output, in preparation for matching -cqc_process_df <- cqc_details_df %>% +cqc_final_df <- cqc_details_df %>% left_join(cqc_providers_df, by = "provider_id") %>% - addressMatchR::tidy_single_line_address(single_line_address) %>% - addressMatchR::tidy_single_line_address(provider_sla) %>% rename_with(toupper) -# Check na count per column -print("NA count per column") -print(colSums(is.na(cqc_process_df))) - # Create table name -table_name <- paste0("INT646_CQC_", download_date) +table_name <- paste0("CQC_BASE_", download_date) # Set up connection to the DB con <- nhsbsaR::con_nhsbsa(database = "DALP") @@ -436,9 +346,10 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Drop table if it exists already drop_table_if_exists_db(table_name) -# Upload to DB... 
-cqc_process_df %>% write_table_long_chars(con, table_name) -# ...and add indexes +# Upload to DB +cqc_final_df %>% write_table_long_chars(con, table_name) + +# Add indexes con %>% add_indexes(table_name, c("UPRN", "POSTCODE")) # Grant access From c1fe5e7fffd9101859a3f24df1b5578334d975e0 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 21 Jun 2024 14:16:01 +0100 Subject: [PATCH 4/9] abp from dall_ref --- data-raw/workflow/02c_ab_data_from_dall_ref.R | 122 ++++++++++++++++++ data-raw/workflow/workflow_run_23_24.R | 2 +- 2 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 data-raw/workflow/02c_ab_data_from_dall_ref.R diff --git a/data-raw/workflow/02c_ab_data_from_dall_ref.R b/data-raw/workflow/02c_ab_data_from_dall_ref.R new file mode 100644 index 0000000..ddd2e6a --- /dev/null +++ b/data-raw/workflow/02c_ab_data_from_dall_ref.R @@ -0,0 +1,122 @@ +source("data-raw/workflow/workflow_packages.R") +source("data-raw/workflow/workflow_helpers.R") + +# Identify closest dall_ref release date --------------------------------------- + +# Define end date +end_date = "2024-03-31" + +# Set up connection to the DB +con <- nhsbsaR::con_nhsbsa(database = "DALP") + +# Connect to ab plus in dall_ref +ab <- con %>% + tbl(from = in_schema("DALL_REF", "ADDRESSBASE_PLUS")) + +# Get closest release date +select_date = ab %>% + select(RELEASE_DATE) %>% + distinct() %>% + collect() %>% + mutate( + SELECT_DATE = as.Date(end_date), + RELEASE_DATE = as.Date(RELEASE_DATE), + DIFF = as.integer(abs(RELEASE_DATE - SELECT_DATE)), + DB_DATE = as.integer(gsub("-", "", SELECT_DATE)) + ) %>% + filter(DIFF == min(DIFF)) %>% + select(DB_DATE) %>% + pull() + +# Filter ab by most appropriate release date +ab = ab %>% filter(TO_NUMBER(TO_CHAR(RELEASE_DATE, 'YYYYMMDD')) == db_date) + +# Temp table just with cleaned geo and dpa sla --------------------------------- + +# Define temp table name +table_name_temp = "TEMP_AB_PLUS" + +# Check if temp table exists 
+drop_table_if_exists_db(table_name_temp) + +# Sla generation and clean +ab_sla = ab %>% + # Create CH flag + mutate(CH_FLAG = ifelse(CLASS == "RI01", 1L, 0L)) %>% + # SLA creation plus formatting + addressMatchR::calc_addressbase_plus_dpa_single_line_address() %>% + addressMatchR::calc_addressbase_plus_geo_single_line_address() %>% + addressMatchR::tidy_single_line_address(col = DPA_SINGLE_LINE_ADDRESS) %>% + addressMatchR::tidy_single_line_address(col = GEO_SINGLE_LINE_ADDRESS) %>% + select( + UPRN, + PARENT_UPRN, + POSTCODE, + DPA_SINGLE_LINE_ADDRESS, + GEO_SINGLE_LINE_ADDRESS, + CH_FLAG, + RELEASE_DATE + ) + +# Save intermediate table under temp table name +ab_sla %>% + compute( + name = table_name_temp, + temporary = FALSE + ) + +# Re-connect to temp table and finish processing ------------------------------- + +# Connect and process +ab_plus = con %>% + tbl(from = table_name_temp) %>% + nhsbsaR::oracle_merge_strings( + first_col = "DPA_SINGLE_LINE_ADDRESS", + second_col = "GEO_SINGLE_LINE_ADDRESS", + merge_col = "CORE_SINGLE_LINE_ADDRESS" + ) %>% + select( + UPRN, + PARENT_UPRN, + POSTCODE, + DPA_SINGLE_LINE_ADDRESS, + GEO_SINGLE_LINE_ADDRESS, + CORE_SINGLE_LINE_ADDRESS, + CH_FLAG, + RELEASE_DATE + ) + +# Define table name +table_name = paste0("ABP_", db_date) + +# Drop table if it exists already +drop_table_if_exists_db(table_name) + +# Write the table back to the DB with indexes +ab_plus %>% + compute( + name = table_name, + temporary = FALSE, + indexes = c("UPRN", "PARENT_UPRN", "POSTCODE") + ) + +# Drop intermediate tables now the final table is done +drop_table_if_exists_db(table_name_temp) + +# Grant access +c("MIGAR", "ADNSH", "MAMCP") %>% grant_table_access (table_name) + +# Disconnect connection to database +DBI::dbDisconnect(con) + +# Print that table has been created +print(paste0("This script has created table: ", table_name)) + +# Return to project directory +setwd(project_dir) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), 
keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/workflow/workflow_run_23_24.R b/data-raw/workflow/workflow_run_23_24.R index 66cf611..8f24682 100644 --- a/data-raw/workflow/workflow_run_23_24.R +++ b/data-raw/workflow/workflow_run_23_24.R @@ -8,7 +8,7 @@ keep_vars = c(ls(), 'keep_vars') # FY 22/23 --------------------------------------------------------------------- -# 1. Get latest cqc data: 0.5hr - Run once in first epoch script +# 1. Get latest cqc data: ~1hr - Run once in first epoch script (new API code) get_latest_cqc_data() # 2. Get latest ab plus epoch: ~2hr From 49544612de75028211ac7ddd0e17d540599fa041 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 21 Jun 2024 18:22:05 +0100 Subject: [PATCH 5/9] edit --- data-raw/workflow/02c_ab_data_from_dall_ref.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-raw/workflow/02c_ab_data_from_dall_ref.R b/data-raw/workflow/02c_ab_data_from_dall_ref.R index ddd2e6a..42ccf84 100644 --- a/data-raw/workflow/02c_ab_data_from_dall_ref.R +++ b/data-raw/workflow/02c_ab_data_from_dall_ref.R @@ -87,7 +87,7 @@ ab_plus = con %>% ) # Define table name -table_name = paste0("ABP_", db_date) +table_name = paste0("ADDRESSBASE_PLUS_", db_date) # Drop table if it exists already drop_table_if_exists_db(table_name) From ad6796bd0df971db6af516017d48e304ea4d042c Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Wed, 26 Jun 2024 13:44:30 +0100 Subject: [PATCH 6/9] workflow production update --- data-raw/app/01_headline_figures_df.R | 2 +- data-raw/workflow/02c_ab_data_from_dall_ref.R | 5 ---- data-raw/workflow/03_address_base_cqc_merge.R | 3 ++- data-raw/workflow/workflow_production.R | 15 ++++++++++- data-raw/workflow/workflow_run_23_24.R | 27 +++++++++---------- .../workflow_union_3_years_in_one_table.sql | 10 +++---- 6 files changed, 35 insertions(+), 27 deletions(-) diff --git a/data-raw/app/01_headline_figures_df.R 
b/data-raw/app/01_headline_figures_df.R index 372c8a1..103025b 100644 --- a/data-raw/app/01_headline_figures_df.R +++ b/data-raw/app/01_headline_figures_df.R @@ -8,7 +8,7 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Key findings used within analysis summary text data_db %>% diff --git a/data-raw/workflow/02c_ab_data_from_dall_ref.R b/data-raw/workflow/02c_ab_data_from_dall_ref.R index 42ccf84..4a2f906 100644 --- a/data-raw/workflow/02c_ab_data_from_dall_ref.R +++ b/data-raw/workflow/02c_ab_data_from_dall_ref.R @@ -1,11 +1,6 @@ -source("data-raw/workflow/workflow_packages.R") -source("data-raw/workflow/workflow_helpers.R") # Identify closest dall_ref release date --------------------------------------- -# Define end date -end_date = "2024-03-31" - # Set up connection to the DB con <- nhsbsaR::con_nhsbsa(database = "DALP") diff --git a/data-raw/workflow/03_address_base_cqc_merge.R b/data-raw/workflow/03_address_base_cqc_merge.R index b7bbde7..d18da00 100644 --- a/data-raw/workflow/03_address_base_cqc_merge.R +++ b/data-raw/workflow/03_address_base_cqc_merge.R @@ -10,7 +10,8 @@ cqc_db <- con %>% # Create a lazy table addressbase data ab_plus_db <- con %>% - tbl(from = ab_plus_data) + tbl(from = ab_plus_data) %>% + rename(EPOCH = RELEASE_DATE) # Get 4 values for final output ab_epoch = pull_date_string(ab_plus_db, EPOCH) diff --git a/data-raw/workflow/workflow_production.R b/data-raw/workflow/workflow_production.R index b3b7c90..408e1c4 100644 --- a/data-raw/workflow/workflow_production.R +++ b/data-raw/workflow/workflow_production.R @@ -34,6 +34,19 @@ get_abp_from_os = function(epoch_year){ } +#' @description downloads a single epoch of ab plus closest to end_date +#' @param end_date: end date as a char in format 'YYYY-MM-DD' +#' @noRd 
+get_abp_from_dall_ref = function(end_date){ + + # Assign function inputs to global env + assign("end_date", end_date, envir = globalenv()) + + # Get nearest ab plus to end date with cqc postcodes within time frame + tictoc::tic(); source("data-raw/workflow/02c_ab_data_from_dall_ref.R"); tictoc::toc(); print(Sys.time()) +} + + #' @description downloads a single epoch of ab plus closest to end_date #' @param ab_plus_data: name of the ab plus db table #' @param cqc_data: the name of the cqc db table @@ -79,6 +92,7 @@ create_care_home_address_match = function(patient_address_data, lookup_address_d tictoc::tic(); source("data-raw/workflow/05_address_match.R"); tictoc::toc(); print(Sys.time()) } + #' @description creates the postcode lookup table in the DB containing latest available (hard-coded) mappings create_postcode_lookup = function(){ @@ -87,7 +101,6 @@ create_postcode_lookup = function(){ } - #' @description gets prescription info for matched records #' @param match_data: matched address data #' @noRd diff --git a/data-raw/workflow/workflow_run_23_24.R b/data-raw/workflow/workflow_run_23_24.R index 8f24682..7827cdc 100644 --- a/data-raw/workflow/workflow_run_23_24.R +++ b/data-raw/workflow/workflow_run_23_24.R @@ -6,42 +6,41 @@ source("data-raw/workflow/workflow_production.R") # Specify variables to retain at end of each script keep_vars = c(ls(), 'keep_vars') -# FY 22/23 --------------------------------------------------------------------- +# FY 23/24 --------------------------------------------------------------------- # 1. Get latest cqc data: ~1hr - Run once in first epoch script (new API code) get_latest_cqc_data() # 2. Get latest ab plus epoch: ~2hr -get_abp_from_api( +get_abp_from_dall_ref( end_date = "2024-03-31" ) # 3. 
Merge and process cqc and ab plus: ~3 mins create_ab_plus_cqc_data( - ab_plus_data = "INT646_ABP_20230331", - cqc_data = "INT646_CQC_20230602", - start_date = "2022-04-01", - end_date = "2023-03-31" + ab_plus_data = "ADDRESSBASE_PLUS_20240516", + cqc_data = "CQC_BASE_20240621", + start_date = "2023-04-01", + end_date = "2024-03-31" ) # 4. Create form level fact for records with a ch-postcode: ~11-14hr create_form_level_patient_addresses( - address_data = "INT646_ABP_CQC_20220401_20230331" + address_data = "INT646_ABP_CQC_20230401_20240331" ) # 5. Match patient details against ch-postcode uprn and process: ~30-40 mins create_care_home_address_match( - patient_address_data = "INT646_FORMS_20220401_20230331", - lookup_address_data = "INT646_ABP_CQC_20220401_20230331", - parent_uprn_data = "INT646_ABP_20230331" + patient_address_data = "INT646_FORMS_20230401_20240331", + lookup_address_data = "INT646_ABP_CQC_20230401_20240331", + parent_uprn_data = "ADDRESSBASE_PLUS_20240516" ) # 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min -# create_postcode_lookup() # Run once in first epoch script - +create_postcode_lookup() # Run once in first epoch script # 7. 
Join to fact table and get non ch-postcode records within time frame: ~9 hrs create_matched_prescription_base_table( - match_data = "INT646_MATCH_20220401_20230331", - form_data = "INT646_FORMS_20220401_20230331" + match_data = "INT646_MATCH_20230401_20240331", + form_data = "INT646_FORMS_20230401_20240331" ) diff --git a/data-raw/workflow/workflow_union_3_years_in_one_table.sql b/data-raw/workflow/workflow_union_3_years_in_one_table.sql index e368201..277da80 100644 --- a/data-raw/workflow/workflow_union_3_years_in_one_table.sql +++ b/data-raw/workflow/workflow_union_3_years_in_one_table.sql @@ -1,13 +1,13 @@ -- To run from DALL_REF -drop table int646_base_20200401_20230331 purge; -create table int646_base_20200401_20230331 nologging compress for query low as +drop table int646_base_20200401_20240331 purge; +create table int646_base_20200401_20240331 nologging compress for query low as -select '2020/21' fy, y1.* from migar.int646_base_20200401_20210331 y1 +select '2020/21' fy, y1.* from mamcp.int646_base_20200401_20210331 y1 union all -select '2021/22' fy, y2.* from migar.int646_base_20210401_20220331 y2 +select '2021/22' fy, y2.* from mamcp.int646_base_20210401_20220331 y2 union all -select '2022/23' fy, y3.* from migar.int646_base_20220401_20230331 y3 +select '2022/23' fy, y3.* from mamcp.int646_base_20220401_20230331 y3 ; grant select on int646_base_20200401_20230331 to migar, adnsh, mamcp; From bc07d87f3fe8aafa1b704ff50a3eac8ca5c19335 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Thu, 27 Jun 2024 15:00:13 +0100 Subject: [PATCH 7/9] sql file additions --- data-raw/app/00_run_all.R | 5 + data-raw/workflow/workflow_fy_comparison.sql | 163 ++++++++++++++++++ .../workflow_union_3_years_in_one_table.sql | 2 + .../workflow_union_4_years_in_one_table.sql | 53 ++++++ data-raw/workflow/wworkflow_fy_comparison.sql | 40 +++++ 5 files changed, 263 insertions(+) create mode 100644 data-raw/workflow/workflow_fy_comparison.sql create mode 100644 
data-raw/workflow/workflow_union_4_years_in_one_table.sql create mode 100644 data-raw/workflow/wworkflow_fy_comparison.sql diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index 560a7c7..cbd114b 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -1,8 +1,12 @@ +Sys.time() +# Load library and generate base geo data source("data-raw/app/data_raw_helpers.R") source("data-raw/app/geo_data.R") +# Define vars to retain in workflow keep_vars = c(ls(), 'keep_vars', 'get_metrics') +# Run all scripts that generate an Rda file source("data-raw/app/01_headline_figures_df.R") source("data-raw/app/02_patients_age_gender_df.R") source("data-raw/app/03_patients_by_imd_df.R") @@ -11,3 +15,4 @@ source("data-raw/app/05_metrics_age_gender_df.R") source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R") source("data-raw/app/07_ch_flag_drug_df.R") source("data-raw/app/08_geo_ch_flag_drug_df.R") +Sys.time() \ No newline at end of file diff --git a/data-raw/workflow/workflow_fy_comparison.sql b/data-raw/workflow/workflow_fy_comparison.sql new file mode 100644 index 0000000..b4785ef --- /dev/null +++ b/data-raw/workflow/workflow_fy_comparison.sql @@ -0,0 +1,163 @@ + +-- STEP 1: AB Check ------------------------------------------------------------ + +-- 20/21: 29.1m +-- 21/22: 29.4m +-- 22/23: 29.7m +-- 23/24: 38.0m (no filters, such as country, applied to DALL_REF ABP) + +select /* +parallel(16) */ count(*) from mamcp.INT646_ABP_20210324 +union all +select /* +parallel(16) */ count(*) from mamcp.INT646_ABP_20220324 +union all +select /* +parallel(16) */ count(*) from mamcp.INT646_ABP_20230331 +union all +select /* +parallel(16) */ count(*) from adnsh.ADDRESSBASE_PLUS_20240516; + +-- STEP 2: CQC Check (single epoch used for initial 3 FY ----------------------- + +-- MAMCP: 29.3k +-- ADNSH: 29.9k + +select /* +parallel(16) */ count(*) from mamcp.INT646_CQC_20230602 +union all +select /* +parallel(16) */ count(*) from adnsh.CQC_BASE_20240621; + +-- STEP 
3: AB-CQC Union Check -------------------------------------------------- + +-- 20/21: 846k +-- 21/22: 834k +-- 22/23: 837k +-- 23/24: 743k + +select /*+ parallel(16) */ count(*) from mamcp.int646_abp_cqc_20200401_20210331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_abp_cqc_20210401_20220331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_abp_cqc_20220401_20230331 +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_abp_cqc_20230401_20240331; + +-- STEP 4: Form Check ---------------------------------------------------------- + +-- 20/21: 257m +-- 21/22: 264m +-- 22/23: 274m +-- 23/24: 285m + +select /*+ parallel(16) */ count(*) from mamcp.int646_forms_20200401_20210331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_forms_20210401_20220331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_forms_20220401_20230331 +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_forms_20230401_20240331; + +-- STEP 5: Match Output Check -------------------------------------------------- + +-- 20/21: 20.8m +-- 21/22: 20.9m +-- 22/23: 21.9m +-- 23/24: 22.9m + +select /*+ parallel(16) */ count(*) from mamcp.int646_match_20200401_20210331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_match_20210401_20220331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_match_20220401_20230331 +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_match_20230401_20240331; + +-- STEP 5.1: Match Output Match Type Check ------------------------------------- + +-- Order: NON-EXACT / EXACT + +-- 20/21: 15m / 4.6m +-- 21/22: 15m / 4.7m +-- 22/23: 16m / 5.1m +-- 23/24: 17m / 5.3m + +select /*+ parallel(16) */ match_type, count(*) from mamcp.int646_match_20200401_20210331 group by match_type +union all +select /*+ parallel(16) */ match_type, count(*) from mamcp.int646_match_20210401_20220331 group by match_type +union all +select /*+ parallel(16) */ match_type,
count(*) from mamcp.int646_match_20220401_20230331 group by match_type +union all +select /*+ parallel(16) */ match_type, count(*) from adnsh.int646_match_20230401_20240331 group by match_type; + +-- STEP 5.2: Match Output CH Flag Check ---------------------------------------- + +-- Order: Y / N + +-- 20/21: 15.9m / 4.9m +-- 21/22: 16.0m / 4.9m +-- 22/23: 16.9m / 4.9m +-- 23/24: 18.6m / 4.3m + +select /*+ parallel(16) */ ch_flag, count(*) from mamcp.int646_match_20200401_20210331 group by ch_flag +union all +select /*+ parallel(16) */ ch_flag, count(*) from mamcp.int646_match_20210401_20220331 group by ch_flag +union all +select /*+ parallel(16) */ ch_flag, count(*) from mamcp.int646_match_20220401_20230331 group by ch_flag +union all +select /*+ parallel(16) */ ch_flag, count(*) from adnsh.int646_match_20230401_20240331 group by ch_flag; + +-- STEP 6: Postcode Lookup Check ----------------------------------------------- + +-- MAMCP: 224k +-- ADNSH: 224k + +select /*+ parallel(16) */ count(*) from mamcp.int646_postcode_lookup +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_postcode_lookup; + +-- STEP 7: Final Base Table Check ---------------------------------------------- + +-- 20/21: 595m +-- 21/22: 602m +-- 22/23: 598m +-- 23/24: 639m + +select /*+ parallel(16) */ count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2020/21' +union all +select /*+ parallel(16) */ count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2021/22' +union all +select /*+ parallel(16) */ count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2022/23' +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_base_20230401_20240331; + +-- STEP 7.1: Final Base Table Match Type Check --------------------------------- + +-- Order: NON-EXACT / EXACT / NO MATCH / DOUBLE KW / SINGLE KW / PATIENT COUNT + +-- 20/21: 34.5m / 10.3m / 548m +-- 21/22: 34.3m / 10.6m / 555m +-- 22/23: 35.3m / 11.1m / 550m +-- 23/24: 37.7m / 11.6m / 589m + 
+select /*+ parallel(16) */ fy, match_type, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2020/21' group by fy, match_type +union all +select /*+ parallel(16) */ fy, match_type, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2021/22' group by fy, match_type +union all +select /*+ parallel(16) */ fy, match_type, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2022/23' group by fy, match_type +union all +select /*+ parallel(16) */ '2023/24' as fy, match_type, count(*) from adnsh.int646_base_20230401_20240331 group by '2023/24', match_type order by fy, match_type; + +-- STEP 7.2: Final Base Table CH Flag Check ------------------------------------ + +-- Order: Y / N + +-- 20/21: 35m / 560m +-- 21/22: 35m / 566m +-- 22/23: 37m / 561m +-- 23/24: 40m / 599m + +select /*+ parallel(16) */ fy, ch_flag, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2020/21' group by fy, ch_flag +union all +select /*+ parallel(16) */ fy, ch_flag, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2021/22' group by fy, ch_flag +union all +select /*+ parallel(16) */ fy, ch_flag, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2022/23' group by fy, ch_flag +union all +select /*+ parallel(16) */ '2023/24' as fy, ch_flag, count(*) from adnsh.int646_base_20230401_20240331 group by '2023/24', ch_flag order by fy, ch_flag; + +-------------------------------------------------------------------------------- \ No newline at end of file diff --git a/data-raw/workflow/workflow_union_3_years_in_one_table.sql b/data-raw/workflow/workflow_union_3_years_in_one_table.sql index 277da80..f28bfd2 100644 --- a/data-raw/workflow/workflow_union_3_years_in_one_table.sql +++ b/data-raw/workflow/workflow_union_3_years_in_one_table.sql @@ -8,6 +8,8 @@ union all select '2021/22' fy, y2.* from mamcp.int646_base_20210401_20220331 y2 union all select '2022/23' fy, y3.* from mamcp.int646_base_20220401_20230331 y3 +union all
+select '2023/24' fy, y4.* from adnsh.int646_base_20230401_20240331 y4 ; grant select on int646_base_20200401_20230331 to migar, adnsh, mamcp; diff --git a/data-raw/workflow/workflow_union_4_years_in_one_table.sql b/data-raw/workflow/workflow_union_4_years_in_one_table.sql new file mode 100644 index 0000000..e5b9af7 --- /dev/null +++ b/data-raw/workflow/workflow_union_4_years_in_one_table.sql @@ -0,0 +1,53 @@ +-- To run from DALL_REF + +drop table int646_base_20200401_20240331 purge; +create table int646_base_20200401_20240331 nologging compress for query low as + +select * from dall_ref.int646_base_20200401_20230331 +union all +select '2023/24' fy, y4.* from adnsh.int646_base_20230401_20240331 y4 +; + +grant select on int646_base_20200401_20240331 to migar, adnsh, mamcp; + +create index int646_i01 on int646_base_20200401_20240331 (fy); +create index int646_i02 on int646_base_20200401_20240331 (fy, year_month); + +create index int646_i03 on int646_base_20200401_20240331 (fy, age_band, gender); +create index int646_i04 on int646_base_20200401_20240331 (fy, age_band, gender, pcd_region_code, pcd_region_name); +create index int646_i05 on int646_base_20200401_20240331 (fy, age_band, gender, pcd_icb_code, pcd_icb_name); +create index int646_i06 on int646_base_20200401_20240331 (fy, age_band, gender, pcd_lad_code, pcd_lad_name); +create index int646_i07 on int646_base_20200401_20240331 (fy, imd_decile); +create index int646_i08 on int646_base_20200401_20240331 (ch_flag); +create index int646_i09 on int646_base_20200401_20240331 (fy, ch_flag); +create index int646_i10 on int646_base_20200401_20240331 (fy, nursing_home_flag); +create index int646_i11 on int646_base_20200401_20240331 (fy, residential_home_flag); + +create index int646_i12 on int646_base_20200401_20240331 (fy, ch_flag, pcd_region_code, pcd_region_name); +create index int646_i13 on int646_base_20200401_20240331 (fy, ch_flag, pcd_icb_code, pcd_icb_name); +create index int646_i14 on 
int646_base_20200401_20240331 (fy, ch_flag, pcd_lad_code, pcd_lad_name); + +create index int646_i15 on int646_base_20200401_20240331 (fy, chapter_descr, pcd_region_code, pcd_region_name); +create index int646_i16 on int646_base_20200401_20240331 (fy, chapter_descr, pcd_icb_code, pcd_icb_name); +create index int646_i17 on int646_base_20200401_20240331 (fy, chapter_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i18 on int646_base_20200401_20240331 (fy, section_descr, pcd_region_code, pcd_region_name); +create index int646_i19 on int646_base_20200401_20240331 (fy, section_descr, pcd_icb_code, pcd_icb_name); +create index int646_i20 on int646_base_20200401_20240331 (fy, section_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i21 on int646_base_20200401_20240331 (fy, paragraph_descr, pcd_region_code, pcd_region_name); +create index int646_i22 on int646_base_20200401_20240331 (fy, paragraph_descr, pcd_icb_code, pcd_icb_name); +create index int646_i23 on int646_base_20200401_20240331 (fy, paragraph_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i24 on int646_base_20200401_20240331 (fy, chemical_substance_bnf_descr, pcd_region_code, pcd_region_name); +create index int646_i25 on int646_base_20200401_20240331 (fy, chemical_substance_bnf_descr, pcd_icb_code, pcd_icb_name); +create index int646_i26 on int646_base_20200401_20240331 (fy, chemical_substance_bnf_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i27 on int646_base_20200401_20240331 (uprn_flag); + +create index int646_i28 on int646_base_20200401_20240331 (fy, year_month, nhs_no); +create index int646_i29 on int646_base_20200401_20240331 (fy, year_month, pcd_region_code, pcd_region_name, nhs_no); +create index int646_i30 on int646_base_20200401_20240331 (fy, year_month, pcd_icb_code, pcd_icb_name, nhs_no); +create index int646_i31 on int646_base_20200401_20240331 (fy, year_month, pcd_lad_code, pcd_lad_name, nhs_no); + +create index int646_i32 on 
int646_base_20200401_20240331 (fy, age_band, gender, ch_flag, year_month, nhs_no); \ No newline at end of file diff --git a/data-raw/workflow/wworkflow_fy_comparison.sql b/data-raw/workflow/wworkflow_fy_comparison.sql new file mode 100644 index 0000000..1f5fc6f --- /dev/null +++ b/data-raw/workflow/wworkflow_fy_comparison.sql @@ -0,0 +1,40 @@ + + +-- STEP 3: AB-CQC Union Check -------------------------------------------------- + + +-- STEP 4: Form Check ---------------------------------------------------------- + +-- 20/21: 878m +-- 21/22: 885m +-- 22/23: 880m +-- 23/24: 639m + +select /* +parallel(16) */ count(*) from mamcp.int646_forms_20200401_20210331 +union all +select /* +parallel(16) */ count(*) from mamcp.int646_forms_20210401_20220331 +union all +select /* +parallel(16) */ count(*) from mamcp.int646_forms_20220401_20230331 +union all +select /* +parallel(16) */ count(*) from adnsh.int646_forms_20230401_20240331; + +-- STEP 5: Match Output Check -------------------------------------------------- + +-- STEP 6: Postcode Lookup Check ----------------------------------------------- + +-- STEP 7: Final Base Table Check ---------------------------------------------- + +-- 20/21: 878m +-- 21/22: 885m +-- 22/23: 880m +-- 23/24: 639m + +select count(*) from mamcp.int646_forms_20200401_20210331 +union all +select count(*) from mamcp.int646_forms_20210401_20220331 +union all +select count(*) from mamcp.int646_forms_20220401_20230331 +union all +select count(*) from adnsh.int646_forms_20230401_20240331; + +-------------------------------------------------------------------------------- \ No newline at end of file From 69631d45f81ff4ac577431f0fadea30e87470f63 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 28 Jun 2024 14:44:15 +0100 Subject: [PATCH 8/9] run_all script amendments --- data-raw/app/00_run_all.R | 22 +++++----- data-raw/app/01_headline_figures_df.R | 9 +++- data-raw/app/02_patients_age_gender_df.R | 15 +++++-- data-raw/app/03_patients_by_imd_df.R | 
11 ++++- .../app/04_metrics_by_ch_type_85_split_df.R | 21 ++++++--- data-raw/app/05_metrics_age_gender_df.R | 13 ++++-- .../app/06_metrics_by_geo_and_ch_flag_df.R | 44 +++++++++++++------ data-raw/app/07_ch_flag_drug_df.R | 16 ++++--- data-raw/app/08_geo_ch_flag_drug_df.R | 3 +- data-raw/workflow/07_item_level_base.R | 1 - data-raw/workflow/workflow_run_22_23.R | 1 - 11 files changed, 108 insertions(+), 48 deletions(-) diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index cbd114b..f79c3b7 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -1,18 +1,20 @@ Sys.time() # Load library and generate base geo data +library(tictoc) source("data-raw/app/data_raw_helpers.R") source("data-raw/app/geo_data.R") # Define vars to retain in workflow -keep_vars = c(ls(), 'keep_vars', 'get_metrics') +keep_vars = c(ls(), 'keep_vars') # Run all scripts that generate an Rda file -source("data-raw/app/01_headline_figures_df.R") -source("data-raw/app/02_patients_age_gender_df.R") -source("data-raw/app/03_patients_by_imd_df.R") -source("data-raw/app/04_metrics_by_ch_type_df.R") -source("data-raw/app/05_metrics_age_gender_df.R") -source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R") -source("data-raw/app/07_ch_flag_drug_df.R") -source("data-raw/app/08_geo_ch_flag_drug_df.R") -Sys.time() \ No newline at end of file +tic(); source("data-raw/app/01_headline_figures_df.R"); toc() +tic(); source("data-raw/app/02_patients_age_gender_df.R"); toc() +tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() +tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() +tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() +tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() +tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() +tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() +Sys.time() + diff --git a/data-raw/app/01_headline_figures_df.R b/data-raw/app/01_headline_figures_df.R index 
103025b..e69e5db 100644 --- a/data-raw/app/01_headline_figures_df.R +++ b/data-raw/app/01_headline_figures_df.R @@ -8,6 +8,7 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data_db <- con %>% + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Key findings used within analysis summary text @@ -104,4 +105,10 @@ mod_headline_figures_df = rbind(annual_df, monthly_df) usethis::use_data(mod_headline_figures_df, overwrite = TRUE) # Disconnect from database -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() \ No newline at end of file diff --git a/data-raw/app/02_patients_age_gender_df.R b/data-raw/app/02_patients_age_gender_df.R index 2bbbc94..3de3c2d 100644 --- a/data-raw/app/02_patients_age_gender_df.R +++ b/data-raw/app/02_patients_age_gender_df.R @@ -1,5 +1,5 @@ -# Running time ~10 min +# Running time ~10 min library(dplyr) library(dbplyr) devtools::load_all() @@ -9,14 +9,14 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con |> - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Add a dummy overall column base_db <- base_db |> mutate(OVERALL = "Overall") # Loop over each geography and aggregate using purrr's map function approach - patients_by_fy_geo_age_gender_fun <- function(geography_name) { # Identify geography cols @@ -59,6 +59,7 @@ patients_by_fy_geo_age_gender_fun <- function(geography_name) { } +# Map function patients_by_fy_geo_age_gender_df <- purrr::map( names(geographies), patients_by_fy_geo_age_gender_fun @@ -74,6 +75,7 @@ 
patients_by_fy_geo_age_gender_df <- #PCT_PATIENTS = janitor::round_half_up(PCT_PATIENTS, 1) ) +# Calculate patient proportions patients_by_fy_geo_age_gender_df <- patients_by_fy_geo_age_gender_df |> group_by(CH_FLAG, FY, GEOGRAPHY, SUB_GEOGRAPHY_CODE, SUB_GEOGRAPHY_NAME) |> mutate( @@ -121,4 +123,9 @@ usethis::use_data( # Disconnect from database DBI::dbDisconnect(con) -rm(list = ls()); gc() + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/03_patients_by_imd_df.R b/data-raw/app/03_patients_by_imd_df.R index b5f2d09..960d8ab 100644 --- a/data-raw/app/03_patients_by_imd_df.R +++ b/data-raw/app/03_patients_by_imd_df.R @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Count care home patients in each decile mod_patients_by_imd_df <- fact_db %>% @@ -37,4 +38,10 @@ mod_patients_by_imd_df <- fact_db %>% usethis::use_data(mod_patients_by_imd_df, overwrite = TRUE) # Disconnect -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/04_metrics_by_ch_type_85_split_df.R b/data-raw/app/04_metrics_by_ch_type_85_split_df.R index 4dacdc7..45ed56f 100644 --- a/data-raw/app/04_metrics_by_ch_type_85_split_df.R +++ b/data-raw/app/04_metrics_by_ch_type_85_split_df.R @@ -1,11 +1,9 @@ # Initial setup ----------------------------------------------------------- # Expected run time ~35 minutes @parallel 24 - library(dplyr) library(dbplyr) library(tidyr) - 
devtools::load_all() # Set up connection to DALP @@ -17,7 +15,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Initial manipulation to create CH_TYPE column, later to be grouped by init_db <- base_db %>% @@ -48,6 +47,7 @@ init_db <- base_db %>% ) ) +# Union both initi_db variants init_db <- init_db %>% union( init_db %>% @@ -57,6 +57,7 @@ init_db <- init_db %>% ## Process ---------------------------------------------------------------- +# Get metrics metrics_by_ch_type_85_split_df <- get_metrics( init_db, first_grouping = c( @@ -73,6 +74,7 @@ metrics_by_ch_type_85_split_df <- get_metrics( ) ) +# Generate age band categories metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>% mutate( AGE_BAND = dplyr::case_match( @@ -84,9 +86,16 @@ metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>% ) %>% dplyr::relocate(AGE_BAND, .after = CH_TYPE) -## Save ------------------------------------------------------------------- +## Save ------------------------------------------------------------------------ usethis::use_data(metrics_by_ch_type_85_split_df, overwrite = TRUE) -# Cleanup ----------------------------------------------------------------- +# Cleanup ---------------------------------------------------------------------- + +# Disconnect DBI::dbDisconnect(con) -rm(list = ls()) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/05_metrics_age_gender_df.R b/data-raw/app/05_metrics_age_gender_df.R index c80f90a..052ade0 100644 --- a/data-raw/app/05_metrics_age_gender_df.R +++ b/data-raw/app/05_metrics_age_gender_df.R @@ -1,5 +1,6 @@ # Running time ~35 min +# 
Libraries and functions library(dplyr) library(dbplyr) devtools::load_all() @@ -10,10 +11,11 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con |> - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) |> + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) %>% filter(GENDER %in% c("Male", "Female")) - +# Get metrics metrics_by_age_gender_and_ch_flag_df <- get_metrics( base_db, first_grouping = c( @@ -46,4 +48,9 @@ usethis::use_data( # Disconnect from database DBI::dbDisconnect(con) -rm(list = ls()); gc() + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R b/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R index d7ee584..6851975 100644 --- a/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R +++ b/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R @@ -1,28 +1,30 @@ -# Initial setup ----------------------------------------------------------- +# Initial setup ---------------------------------------------------------------- # Expected run time ~40 minutes @parallel 24 +# Library and functions library(dplyr) library(dbplyr) library(stringr) library(glue) library(purrr) - devtools::load_all() # Set up connection to DALP con <- nhsbsaR::con_nhsbsa(database = "DALP") -# Data validation --------------------------------------------------------- +# Data validation -------------------------------------------------------------- -## Setup ------------------------------------------------------------------ +## Setup ----------------------------------------------------------------------- +# Distinct postcode-related fields PCD <- con %>% tbl(from = "INT646_POSTCODE_LOOKUP") %>% select(ends_with("CODE"), ends_with("NAME")) %>% distinct() %>% collect() +# function to transform fields by 
geography fields transform_PCD <- function(data, geography) { data %>% select(starts_with(glue("PCD_{geography}"))) %>% @@ -33,19 +35,22 @@ transform_PCD <- function(data, geography) { filter(!is.na(SUB_GEOGRAPHY_NAME)) } +# Generate function output PCD_list <- list( REGION = PCD %>% transform_PCD("REGION"), ICB = PCD %>% transform_PCD("ICB"), LAD = PCD %>% transform_PCD("LAD") ) +# Define gis list GIS_list <- geo_data_validation # Check sub-geography codes and names match exactly between PCD and GIS; script # will stop if not -## Check sub-geography codes ---------------------------------------------- +## Check sub-geography codes --------------------------------------------------- +# Generate check check_sub_geo_codes <- list( in_GIS_only = map2( GIS_list, @@ -59,6 +64,7 @@ check_sub_geo_codes <- list( ) ) +# Stop if check fails stopifnot( "Some difference in geo codes: check `check_sub_geo_codes`"= { character(0) == check_sub_geo_codes %>% @@ -67,8 +73,9 @@ stopifnot( } ) -## Check sub-geography names ---------------------------------------------- +## Check sub-geography names --------------------------------------------------- +# Generate check check_sub_geo_names <- list( in_GIS_only = map2( GIS_list, @@ -82,6 +89,7 @@ check_sub_geo_names <- list( ) ) +# Stop if check fails stopifnot( "Some difference in geo names: check `check_sub_geo_names`"= { character(0) == check_sub_geo_names %>% @@ -90,13 +98,14 @@ stopifnot( } ) -# Data prep --------------------------------------------------------------- +# Data prep -------------------------------------------------------------------- -## Setup ------------------------------------------------------------------ +## Setup ----------------------------------------------------------------------- # Item-level base table base_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", 
"INT646_BASE_20200401_20240331")) # Aggregate by a geography aggregate_by_geo <- function(geography_name) { @@ -141,21 +150,28 @@ aggregate_by_geo <- function(geography_name) { rename(!!!geography_cols) } -## Process ---------------------------------------------------------------- +## Process --------------------------------------------------------------------- metrics_by_geo_and_ch_flag_df <- names(geographies)[-1] %>% map(aggregate_by_geo) %>% list_rbind() -## Format ----------------------------------------------------------------- +## Format ---------------------------------------------------------------------- metrics_by_geo_and_ch_flag_df <- metrics_by_geo_and_ch_flag_df %>% mutate(CH_FLAG = as.logical(CH_FLAG)) %>% filter(!is.na(SUB_GEOGRAPHY_NAME)) %>% format_data_raw("CH_FLAG") %>% suppressWarnings() # We do not have Overall in this data -## Save ------------------------------------------------------------------- +## Save ------------------------------------------------------------------------ usethis::use_data(metrics_by_geo_and_ch_flag_df, overwrite = TRUE) -# Cleanup ----------------------------------------------------------------- +# Cleanup ---------------------------------------------------------------------- + +# Disconnect DBI::dbDisconnect(con) -rm(list = ls()) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/07_ch_flag_drug_df.R b/data-raw/app/07_ch_flag_drug_df.R index 17270a0..6cc3792 100644 --- a/data-raw/app/07_ch_flag_drug_df.R +++ b/data-raw/app/07_ch_flag_drug_df.R @@ -1,5 +1,4 @@ - # Library library(dplyr) library(dbplyr) @@ -9,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - dplyr::tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + 
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # BNF columns bnf_cols = c( @@ -91,7 +91,7 @@ get_geo_bnf_prop = function(index){ BNF_PARENT = unique(df$BNF_PARENT), BNF_CHILD = unique(df$BNF_CHILD), METRIC = unique(df$METRIC) - ) %>% + ) %>% left_join(df) %>% mutate(VALUE = ifelse(is.na(VALUE), 0, VALUE)) } @@ -211,6 +211,12 @@ mod_ch_flag_drug_df %>% count(FY, CH_FLAG, BNF_PARENT) usethis::use_data(mod_ch_flag_drug_df, overwrite = TRUE) # Disconnect -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() -#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- \ No newline at end of file diff --git a/data-raw/app/08_geo_ch_flag_drug_df.R b/data-raw/app/08_geo_ch_flag_drug_df.R index dab2966..ff66a34 100644 --- a/data-raw/app/08_geo_ch_flag_drug_df.R +++ b/data-raw/app/08_geo_ch_flag_drug_df.R @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - dplyr::tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # BNF columns bnf_cols = c( diff --git a/data-raw/workflow/07_item_level_base.R b/data-raw/workflow/07_item_level_base.R index 12c0278..1f65078 100644 --- a/data-raw/workflow/07_item_level_base.R +++ b/data-raw/workflow/07_item_level_base.R @@ -199,7 +199,6 @@ presc_db = presc_db %>% PRESCRIBER_CODE = PRESCRIBER_LTST_CDE ) - # Process form fact form_db = form_db %>% select( diff --git a/data-raw/workflow/workflow_run_22_23.R b/data-raw/workflow/workflow_run_22_23.R index 3c29c4d..18893e3 100644 --- 
a/data-raw/workflow/workflow_run_22_23.R +++ b/data-raw/workflow/workflow_run_22_23.R @@ -39,7 +39,6 @@ create_care_home_address_match( # 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min # create_postcode_lookup() # Run once in first epoch script - # 7. Join to fact table and get non ch-postcode records within time frame: ~9 hrs create_matched_prescription_base_table( match_data = "INT646_MATCH_20220401_20230331", From 33efb77f06fb583f014c5bd189ee92cf856a40ec Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Tue, 2 Jul 2024 12:12:30 +0100 Subject: [PATCH 9/9] eda edit --- EDA/eda_parent_uprn_and_pat_threshold.R | 7 ++++--- data-raw/app/00_run_all.R | 14 +++++++------- data-raw/app/04_metrics_by_ch_type_85_split_df.R | 2 +- data-raw/app/data_raw_helpers.R | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/EDA/eda_parent_uprn_and_pat_threshold.R b/EDA/eda_parent_uprn_and_pat_threshold.R index 585f1ae..5e796de 100644 --- a/EDA/eda_parent_uprn_and_pat_threshold.R +++ b/EDA/eda_parent_uprn_and_pat_threshold.R @@ -9,7 +9,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data <- con %>% - tbl(from = in_schema("ADNSH", "INT646_BASE_20210401_20220331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) %>% + filter(FY == "2020/21") # Get chapter info chapters = con %>% @@ -39,7 +40,7 @@ df_std = data %>% NURSING_HOME_FLAG = max(NURSING_HOME_FLAG), RESIDENTIAL_HOME_FLAG = max(RESIDENTIAL_HOME_FLAG), MAX_MONTHLY_PATIENTS = max(MAX_MONTHLY_PATIENTS), - NUMBER_OF_BEDS = max(NUMBER_OF_BEDS), + #NUMBER_OF_BEDS = max(NUMBER_OF_BEDS), MONTHS = n_distinct(YEAR_MONTH), PATS = n_distinct(NHS_NO), ITEMS = sum(ITEM_COUNT), @@ -65,7 +66,7 @@ df_merge = data %>% NURSING_HOME_FLAG = max(NURSING_HOME_FLAG), RESIDENTIAL_HOME_FLAG = max(RESIDENTIAL_HOME_FLAG), MAX_MONTHLY_PATIENTS = max(MAX_MONTHLY_PATIENTS), - NUMBER_OF_BEDS = 
max(NUMBER_OF_BEDS), + #NUMBER_OF_BEDS = max(NUMBER_OF_BEDS), MONTHS = n_distinct(YEAR_MONTH), PATS = n_distinct(NHS_NO), ITEMS = sum(ITEM_COUNT), diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index f79c3b7..8c23e20 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -8,13 +8,13 @@ source("data-raw/app/geo_data.R") keep_vars = c(ls(), 'keep_vars') # Run all scripts that generate an Rda file -tic(); source("data-raw/app/01_headline_figures_df.R"); toc() +tic(); source("data-raw/app/01_headline_figures_df.R"); toc() # 10 mins tic(); source("data-raw/app/02_patients_age_gender_df.R"); toc() -tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() -tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() -tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() -tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() -tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() -tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() +tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() # 10 mins +tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() # 3 hours +tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() # 30 mins +tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() # 90 mins +tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() # 30 mins +tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() # 90 mins Sys.time() diff --git a/data-raw/app/04_metrics_by_ch_type_85_split_df.R b/data-raw/app/04_metrics_by_ch_type_85_split_df.R index 45ed56f..4ef664f 100644 --- a/data-raw/app/04_metrics_by_ch_type_85_split_df.R +++ b/data-raw/app/04_metrics_by_ch_type_85_split_df.R @@ -1,6 +1,6 @@ # Initial setup ----------------------------------------------------------- -# Expected run time ~35 minutes @parallel 24 +# Expected run time ~35 minutes @parallel 36 library(dplyr) library(dbplyr) library(tidyr) diff --git 
a/data-raw/app/data_raw_helpers.R b/data-raw/app/data_raw_helpers.R index 330ca39..f8e8aea 100644 --- a/data-raw/app/data_raw_helpers.R +++ b/data-raw/app/data_raw_helpers.R @@ -5,7 +5,7 @@ #' @param first_grouping A vector of col names to do initial grouping by. #' @param second_grouping A vector of col names to do secondary grouping by. #' @param nest_cols A vector of col names to use nesting on when completing. -#' @param num_parallel By default, query will ask for 24 parallel, but can ask +#' @param num_parallel By default, query will ask for 36 parallel, but can ask #' for different value if you think it is needed. #' #' @return @@ -34,13 +34,13 @@ #' "SUB_GEOGRAPHY_CODE", #' "SUB_GEOGRAPHY_NAME" #' ), -#' num_parallel = 32 +#' num_parallel = 36 #' ) get_metrics <- function(init_db, first_grouping, second_grouping, nest_cols = c(), - num_parallel = 24) { + num_parallel = 36) { # Collect data and calculate raw metrics init_db %>% group_by(across(all_of(first_grouping))) %>%