diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index cbd114b2..f79c3b7d 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -1,18 +1,20 @@ Sys.time() # Load library and generate base geo data +library(tictoc) source("data-raw/app/data_raw_helpers.R") source("data-raw/app/geo_data.R") # Define vars to retain in workflow -keep_vars = c(ls(), 'keep_vars', 'get_metrics') +keep_vars = c(ls(), 'keep_vars') # Run all scripts that generate an Rda file -source("data-raw/app/01_headline_figures_df.R") -source("data-raw/app/02_patients_age_gender_df.R") -source("data-raw/app/03_patients_by_imd_df.R") -source("data-raw/app/04_metrics_by_ch_type_df.R") -source("data-raw/app/05_metrics_age_gender_df.R") -source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R") -source("data-raw/app/07_ch_flag_drug_df.R") -source("data-raw/app/08_geo_ch_flag_drug_df.R") -Sys.time() \ No newline at end of file +tic(); source("data-raw/app/01_headline_figures_df.R"); toc() +tic(); source("data-raw/app/02_patients_age_gender_df.R"); toc() +tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() +tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() +tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() +tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() +tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() +tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() +Sys.time() + diff --git a/data-raw/app/01_headline_figures_df.R b/data-raw/app/01_headline_figures_df.R index 103025b2..e69e5dbe 100644 --- a/data-raw/app/01_headline_figures_df.R +++ b/data-raw/app/01_headline_figures_df.R @@ -8,6 +8,7 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data_db <- con %>% + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Key findings used within analysis summary text @@ -104,4 +105,10 @@ mod_headline_figures_df = rbind(annual_df, monthly_df) usethis::use_data(mod_headline_figures_df, overwrite = TRUE) # Disconnect from database -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() \ No newline at end of file diff --git a/data-raw/app/02_patients_age_gender_df.R b/data-raw/app/02_patients_age_gender_df.R index 2bbbc946..3de3c2d4 100644 --- a/data-raw/app/02_patients_age_gender_df.R +++ b/data-raw/app/02_patients_age_gender_df.R @@ -1,5 +1,5 @@ -# Running time ~10 min +# Running time ~10 min library(dplyr) library(dbplyr) devtools::load_all() @@ -9,14 +9,14 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con |> - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Add a dummy overall column base_db <- base_db |> mutate(OVERALL = "Overall") # Loop over each geography and aggregate using purrr's map function approach - patients_by_fy_geo_age_gender_fun <- function(geography_name) { # Identify geography cols @@ -59,6 +59,7 @@ patients_by_fy_geo_age_gender_fun <- function(geography_name) { } +# Map function patients_by_fy_geo_age_gender_df <- purrr::map( names(geographies), patients_by_fy_geo_age_gender_fun @@ -74,6 +75,7 @@ patients_by_fy_geo_age_gender_df <- #PCT_PATIENTS = janitor::round_half_up(PCT_PATIENTS, 1) ) +# Calculate patient proportions patients_by_fy_geo_age_gender_df <- patients_by_fy_geo_age_gender_df |> group_by(CH_FLAG, FY, GEOGRAPHY, SUB_GEOGRAPHY_CODE, SUB_GEOGRAPHY_NAME) |> mutate( @@ -121,4 +123,9 @@ usethis::use_data( # Disconnect from database DBI::dbDisconnect(con) -rm(list = ls()); gc() + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/03_patients_by_imd_df.R b/data-raw/app/03_patients_by_imd_df.R index b5f2d098..960d8ab5 100644 --- a/data-raw/app/03_patients_by_imd_df.R +++ b/data-raw/app/03_patients_by_imd_df.R @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Count care home patients in each decile mod_patients_by_imd_df <- fact_db %>% @@ -37,4 +38,10 @@ mod_patients_by_imd_df <- fact_db %>% usethis::use_data(mod_patients_by_imd_df, overwrite = TRUE) # Disconnect -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/04_metrics_by_ch_type_85_split_df.R b/data-raw/app/04_metrics_by_ch_type_85_split_df.R index 4dacdc7e..45ed56f2 100644 --- a/data-raw/app/04_metrics_by_ch_type_85_split_df.R +++ b/data-raw/app/04_metrics_by_ch_type_85_split_df.R @@ -1,11 +1,9 @@ # Initial setup ----------------------------------------------------------- # Expected run time ~35 minutes @parallel 24 - library(dplyr) library(dbplyr) library(tidyr) - devtools::load_all() # Set up connection to DALP @@ -17,7 +15,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Initial manipulation to create CH_TYPE column, later to be grouped by init_db <- base_db %>% @@ -48,6 +47,7 @@ init_db <- base_db %>% ) ) +# Union both initi_db variants init_db <- init_db %>% union( init_db %>% @@ -57,6 +57,7 @@ init_db <- init_db %>% ## Process ---------------------------------------------------------------- +# Get metrics metrics_by_ch_type_85_split_df <- get_metrics( init_db, first_grouping = c( @@ -73,6 +74,7 @@ metrics_by_ch_type_85_split_df <- get_metrics( ) ) +# Generate age band categories metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>% mutate( AGE_BAND = dplyr::case_match( @@ -84,9 +86,16 @@ metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>% ) %>% dplyr::relocate(AGE_BAND, .after = CH_TYPE) -## Save ------------------------------------------------------------------- +## Save ------------------------------------------------------------------------ usethis::use_data(metrics_by_ch_type_85_split_df, overwrite = TRUE) -# Cleanup ----------------------------------------------------------------- +# Cleanup ---------------------------------------------------------------------- + +# Disconnect DBI::dbDisconnect(con) -rm(list = ls()) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/05_metrics_age_gender_df.R b/data-raw/app/05_metrics_age_gender_df.R index c80f90aa..052ade09 100644 --- a/data-raw/app/05_metrics_age_gender_df.R +++ b/data-raw/app/05_metrics_age_gender_df.R @@ -1,5 +1,6 @@ # Running time ~35 min +# Libraries and functions library(dplyr) library(dbplyr) devtools::load_all() @@ -10,10 +11,11 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con |> - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) |> + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) %>% filter(GENDER %in% c("Male", "Female")) - +# Get metrics metrics_by_age_gender_and_ch_flag_df <- get_metrics( base_db, first_grouping = c( @@ -46,4 +48,9 @@ usethis::use_data( # Disconnect from database DBI::dbDisconnect(con) -rm(list = ls()); gc() + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R b/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R index d7ee5847..68519755 100644 --- a/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R +++ b/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R @@ -1,28 +1,30 @@ -# Initial setup ----------------------------------------------------------- +# Initial setup ---------------------------------------------------------------- # Expected run time ~40 minutes @parallel 24 +# Library and functions library(dplyr) library(dbplyr) library(stringr) library(glue) library(purrr) - devtools::load_all() # Set up connection to DALP con <- nhsbsaR::con_nhsbsa(database = "DALP") -# Data validation --------------------------------------------------------- +# Data validation -------------------------------------------------------------- -## Setup ------------------------------------------------------------------ +## Setup ----------------------------------------------------------------------- +# Distinct postcode-related fields PCD <- con %>% tbl(from = "INT646_POSTCODE_LOOKUP") %>% select(ends_with("CODE"), ends_with("NAME")) %>% distinct() %>% collect() +# function to transform fields by geography fields transform_PCD <- function(data, geography) { data %>% select(starts_with(glue("PCD_{geography}"))) %>% @@ -33,19 +35,22 @@ transform_PCD <- function(data, geography) { filter(!is.na(SUB_GEOGRAPHY_NAME)) } +# Generate function output PCD_list <- list( REGION = PCD %>% transform_PCD("REGION"), ICB = PCD %>% transform_PCD("ICB"), LAD = PCD %>% transform_PCD("LAD") ) +# Define gis list GIS_list <- geo_data_validation # Check sub-geography codes and names match exactly between PCD and GIS; script # will stop if not -## Check sub-geography codes ---------------------------------------------- +## Check sub-geography codes --------------------------------------------------- +# Generate check check_sub_geo_codes <- list( in_GIS_only = map2( GIS_list, @@ -59,6 +64,7 @@ check_sub_geo_codes <- list( ) ) +# Stop if check fails stopifnot( "Some difference in geo codes: check `check_sub_geo_codes`"= { character(0) == check_sub_geo_codes %>% @@ -67,8 +73,9 @@ stopifnot( } ) -## Check sub-geography names ---------------------------------------------- +## Check sub-geography names --------------------------------------------------- +# Generate check check_sub_geo_names <- list( in_GIS_only = map2( GIS_list, @@ -82,6 +89,7 @@ check_sub_geo_names <- list( ) ) +# Stop if check fails stopifnot( "Some difference in geo names: check `check_sub_geo_names`"= { character(0) == check_sub_geo_names %>% @@ -90,13 +98,14 @@ stopifnot( } ) -# Data prep --------------------------------------------------------------- +# Data prep -------------------------------------------------------------------- -## Setup ------------------------------------------------------------------ +## Setup ----------------------------------------------------------------------- # Item-level base table base_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Aggregate by a geography aggregate_by_geo <- function(geography_name) { @@ -141,21 +150,28 @@ aggregate_by_geo <- function(geography_name) { rename(!!!geography_cols) } -## Process ---------------------------------------------------------------- +## Process --------------------------------------------------------------------- metrics_by_geo_and_ch_flag_df <- names(geographies)[-1] %>% map(aggregate_by_geo) %>% list_rbind() -## Format ----------------------------------------------------------------- +## Format ---------------------------------------------------------------------- metrics_by_geo_and_ch_flag_df <- metrics_by_geo_and_ch_flag_df %>% mutate(CH_FLAG = as.logical(CH_FLAG)) %>% filter(!is.na(SUB_GEOGRAPHY_NAME)) %>% format_data_raw("CH_FLAG") %>% suppressWarnings() # We do not have Overall in this data -## Save ------------------------------------------------------------------- +## Save ------------------------------------------------------------------------ usethis::use_data(metrics_by_geo_and_ch_flag_df, overwrite = TRUE) -# Cleanup ----------------------------------------------------------------- +# Cleanup ---------------------------------------------------------------------- + +# Disconnect DBI::dbDisconnect(con) -rm(list = ls()) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/07_ch_flag_drug_df.R b/data-raw/app/07_ch_flag_drug_df.R index 17270a0e..6cc3792f 100644 --- a/data-raw/app/07_ch_flag_drug_df.R +++ b/data-raw/app/07_ch_flag_drug_df.R @@ -1,5 +1,4 @@ - # Library library(dplyr) library(dbplyr) @@ -9,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - dplyr::tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # BNF columns bnf_cols = c( @@ -91,7 +91,7 @@ get_geo_bnf_prop = function(index){ BNF_PARENT = unique(df$BNF_PARENT), BNF_CHILD = unique(df$BNF_CHILD), METRIC = unique(df$METRIC) - ) %>% + ) %>% left_join(df) %>% mutate(VALUE = ifelse(is.na(VALUE), 0, VALUE)) } @@ -211,6 +211,12 @@ mod_ch_flag_drug_df %>% count(FY, CH_FLAG, BNF_PARENT) usethis::use_data(mod_ch_flag_drug_df, overwrite = TRUE) # Disconnect -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() -#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- \ No newline at end of file diff --git a/data-raw/app/08_geo_ch_flag_drug_df.R b/data-raw/app/08_geo_ch_flag_drug_df.R index dab29663..ff66a34a 100644 --- a/data-raw/app/08_geo_ch_flag_drug_df.R +++ b/data-raw/app/08_geo_ch_flag_drug_df.R @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - dplyr::tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # BNF columns bnf_cols = c( diff --git a/data-raw/workflow/07_item_level_base.R b/data-raw/workflow/07_item_level_base.R index 12c0278d..1f650783 100644 --- a/data-raw/workflow/07_item_level_base.R +++ b/data-raw/workflow/07_item_level_base.R @@ -199,7 +199,6 @@ presc_db = presc_db %>% PRESCRIBER_CODE = PRESCRIBER_LTST_CDE ) - # Process form fact form_db = form_db %>% select( diff --git a/data-raw/workflow/workflow_run_22_23.R b/data-raw/workflow/workflow_run_22_23.R index 3c29c4d7..18893e33 100644 --- a/data-raw/workflow/workflow_run_22_23.R +++ b/data-raw/workflow/workflow_run_22_23.R @@ -39,7 +39,6 @@ create_care_home_address_match( # 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min # create_postcode_lookup() # Run once in first epoch script - # 7. Join to fact table and get non ch-postcode records within time frame: ~9 hrs create_matched_prescription_base_table( match_data = "INT646_MATCH_20220401_20230331",