Skip to content

Commit

Permalink
run_all script amendments
Browse files Browse the repository at this point in the history
  • Loading branch information
AdnanShroufi committed Jun 28, 2024
1 parent bc07d87 commit 69631d4
Show file tree
Hide file tree
Showing 11 changed files with 108 additions and 48 deletions.
22 changes: 12 additions & 10 deletions data-raw/app/00_run_all.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
Sys.time()
# Load library and generate base geo data
library(tictoc)
source("data-raw/app/data_raw_helpers.R")
source("data-raw/app/geo_data.R")

# Define vars to retain in workflow
keep_vars = c(ls(), 'keep_vars', 'get_metrics')
keep_vars = c(ls(), 'keep_vars')

# Run all scripts that generate an Rda file
source("data-raw/app/01_headline_figures_df.R")
source("data-raw/app/02_patients_age_gender_df.R")
source("data-raw/app/03_patients_by_imd_df.R")
source("data-raw/app/04_metrics_by_ch_type_df.R")
source("data-raw/app/05_metrics_age_gender_df.R")
source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R")
source("data-raw/app/07_ch_flag_drug_df.R")
source("data-raw/app/08_geo_ch_flag_drug_df.R")
Sys.time()
tic(); source("data-raw/app/01_headline_figures_df.R"); toc()
tic(); source("data-raw/app/02_patients_age_gender_df.R"); toc()
tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc()
tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc()
tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc()
tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc()
tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc()
tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc()
Sys.time()

9 changes: 8 additions & 1 deletion data-raw/app/01_headline_figures_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Create a lazy table from year month dim table in DWCP
data_db <- con %>%
#tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331"))

# Key findings used within analysis summary text
Expand Down Expand Up @@ -104,4 +105,10 @@ mod_headline_figures_df = rbind(annual_df, monthly_df)
usethis::use_data(mod_headline_figures_df, overwrite = TRUE)

# Disconnect from database
DBI::dbDisconnect(con); rm(list = ls()); gc()
DBI::dbDisconnect(con)

# Remove vars specific to script
remove_vars <- setdiff(ls(), keep_vars)

# Remove objects and clean environment
rm(list = remove_vars, remove_vars); gc()
15 changes: 11 additions & 4 deletions data-raw/app/02_patients_age_gender_df.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Running time ~10 min

# Running time ~10 min
library(dplyr)
library(dbplyr)
devtools::load_all()
Expand All @@ -9,14 +9,14 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Item-level base table
base_db <- con |>
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
#tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331"))

# Add a dummy overall column
base_db <- base_db |>
mutate(OVERALL = "Overall")

# Loop over each geography and aggregate using purrr's map function approach

patients_by_fy_geo_age_gender_fun <- function(geography_name) {

# Identify geography cols
Expand Down Expand Up @@ -59,6 +59,7 @@ patients_by_fy_geo_age_gender_fun <- function(geography_name) {

}

# Map function
patients_by_fy_geo_age_gender_df <- purrr::map(
names(geographies),
patients_by_fy_geo_age_gender_fun
Expand All @@ -74,6 +75,7 @@ patients_by_fy_geo_age_gender_df <-
#PCT_PATIENTS = janitor::round_half_up(PCT_PATIENTS, 1)
)

# Calculate patient proportions
patients_by_fy_geo_age_gender_df <- patients_by_fy_geo_age_gender_df |>
group_by(CH_FLAG, FY, GEOGRAPHY, SUB_GEOGRAPHY_CODE, SUB_GEOGRAPHY_NAME) |>
mutate(
Expand Down Expand Up @@ -121,4 +123,9 @@ usethis::use_data(

# Disconnect from database
DBI::dbDisconnect(con)
rm(list = ls()); gc()

# Remove vars specific to script
remove_vars <- setdiff(ls(), keep_vars)

# Remove objects and clean environment
rm(list = remove_vars, remove_vars); gc()
11 changes: 9 additions & 2 deletions data-raw/app/03_patients_by_imd_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Create a lazy table from the item level base table
fact_db <- con %>%
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
#tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331"))

# Count care home patients in each decile
mod_patients_by_imd_df <- fact_db %>%
Expand Down Expand Up @@ -37,4 +38,10 @@ mod_patients_by_imd_df <- fact_db %>%
usethis::use_data(mod_patients_by_imd_df, overwrite = TRUE)

# Disconnect
DBI::dbDisconnect(con); rm(list = ls()); gc()
DBI::dbDisconnect(con)

# Remove vars specific to script
remove_vars <- setdiff(ls(), keep_vars)

# Remove objects and clean environment
rm(list = remove_vars, remove_vars); gc()
21 changes: 15 additions & 6 deletions data-raw/app/04_metrics_by_ch_type_85_split_df.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
# Initial setup -----------------------------------------------------------

# Expected run time ~35 minutes @parallel 24

library(dplyr)
library(dbplyr)
library(tidyr)

devtools::load_all()

# Set up connection to DALP
Expand All @@ -17,7 +15,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Item-level base table
base_db <- con %>%
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
#tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331"))

# Initial manipulation to create CH_TYPE column, later to be grouped by
init_db <- base_db %>%
Expand Down Expand Up @@ -48,6 +47,7 @@ init_db <- base_db %>%
)
)

# Union both initi_db variants
init_db <- init_db %>%
union(
init_db %>%
Expand All @@ -57,6 +57,7 @@ init_db <- init_db %>%

## Process ----------------------------------------------------------------

# Get metrics
metrics_by_ch_type_85_split_df <- get_metrics(
init_db,
first_grouping = c(
Expand All @@ -73,6 +74,7 @@ metrics_by_ch_type_85_split_df <- get_metrics(
)
)

# Generate age band categories
metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>%
mutate(
AGE_BAND = dplyr::case_match(
Expand All @@ -84,9 +86,16 @@ metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>%
) %>%
dplyr::relocate(AGE_BAND, .after = CH_TYPE)

## Save -------------------------------------------------------------------
## Save ------------------------------------------------------------------------
usethis::use_data(metrics_by_ch_type_85_split_df, overwrite = TRUE)

# Cleanup -----------------------------------------------------------------
# Cleanup ----------------------------------------------------------------------

# Disconnect
DBI::dbDisconnect(con)
rm(list = ls())

# Remove vars specific to script
remove_vars <- setdiff(ls(), keep_vars)

# Remove objects and clean environment
rm(list = remove_vars, remove_vars); gc()
13 changes: 10 additions & 3 deletions data-raw/app/05_metrics_age_gender_df.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Running time ~35 min

# Libraries and functions
library(dplyr)
library(dbplyr)
devtools::load_all()
Expand All @@ -10,10 +11,11 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Item-level base table
base_db <- con |>
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) |>
#tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) %>%
filter(GENDER %in% c("Male", "Female"))


# Get metrics
metrics_by_age_gender_and_ch_flag_df <- get_metrics(
base_db,
first_grouping = c(
Expand Down Expand Up @@ -46,4 +48,9 @@ usethis::use_data(

# Disconnect from database
DBI::dbDisconnect(con)
rm(list = ls()); gc()

# Remove vars specific to script
remove_vars <- setdiff(ls(), keep_vars)

# Remove objects and clean environment
rm(list = remove_vars, remove_vars); gc()
44 changes: 30 additions & 14 deletions data-raw/app/06_metrics_by_geo_and_ch_flag_df.R
Original file line number Diff line number Diff line change
@@ -1,28 +1,30 @@
# Initial setup -----------------------------------------------------------
# Initial setup ----------------------------------------------------------------

# Expected run time ~40 minutes @parallel 24

# Library and functions
library(dplyr)
library(dbplyr)
library(stringr)
library(glue)
library(purrr)

devtools::load_all()

# Set up connection to DALP
con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Data validation ---------------------------------------------------------
# Data validation --------------------------------------------------------------

## Setup ------------------------------------------------------------------
## Setup -----------------------------------------------------------------------

# Distinct postcode-related fields
PCD <- con %>%
tbl(from = "INT646_POSTCODE_LOOKUP") %>%
select(ends_with("CODE"), ends_with("NAME")) %>%
distinct() %>%
collect()

# function to transform fields by geography fields
transform_PCD <- function(data, geography) {
data %>%
select(starts_with(glue("PCD_{geography}"))) %>%
Expand All @@ -33,19 +35,22 @@ transform_PCD <- function(data, geography) {
filter(!is.na(SUB_GEOGRAPHY_NAME))
}

# Generate function output
PCD_list <- list(
REGION = PCD %>% transform_PCD("REGION"),
ICB = PCD %>% transform_PCD("ICB"),
LAD = PCD %>% transform_PCD("LAD")
)

# Define gis list
GIS_list <- geo_data_validation

# Check sub-geography codes and names match exactly between PCD and GIS; script
# will stop if not

## Check sub-geography codes ----------------------------------------------
## Check sub-geography codes ---------------------------------------------------

# Generate check
check_sub_geo_codes <- list(
in_GIS_only = map2(
GIS_list,
Expand All @@ -59,6 +64,7 @@ check_sub_geo_codes <- list(
)
)

# Stop if check fails
stopifnot(
"Some difference in geo codes: check `check_sub_geo_codes`"= {
character(0) == check_sub_geo_codes %>%
Expand All @@ -67,8 +73,9 @@ stopifnot(
}
)

## Check sub-geography names ----------------------------------------------
## Check sub-geography names ---------------------------------------------------

# Generate check
check_sub_geo_names <- list(
in_GIS_only = map2(
GIS_list,
Expand All @@ -82,6 +89,7 @@ check_sub_geo_names <- list(
)
)

# Stop if check fails
stopifnot(
"Some difference in geo names: check `check_sub_geo_names`"= {
character(0) == check_sub_geo_names %>%
Expand All @@ -90,13 +98,14 @@ stopifnot(
}
)

# Data prep ---------------------------------------------------------------
# Data prep --------------------------------------------------------------------

## Setup ------------------------------------------------------------------
## Setup -----------------------------------------------------------------------

# Item-level base table
base_db <- con %>%
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
#tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331"))
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331"))

# Aggregate by a geography
aggregate_by_geo <- function(geography_name) {
Expand Down Expand Up @@ -141,21 +150,28 @@ aggregate_by_geo <- function(geography_name) {
rename(!!!geography_cols)
}

## Process ----------------------------------------------------------------
## Process ---------------------------------------------------------------------
metrics_by_geo_and_ch_flag_df <- names(geographies)[-1] %>%
map(aggregate_by_geo) %>%
list_rbind()

## Format -----------------------------------------------------------------
## Format ----------------------------------------------------------------------
metrics_by_geo_and_ch_flag_df <- metrics_by_geo_and_ch_flag_df %>%
mutate(CH_FLAG = as.logical(CH_FLAG)) %>%
filter(!is.na(SUB_GEOGRAPHY_NAME)) %>%
format_data_raw("CH_FLAG") %>%
suppressWarnings() # We do not have Overall in this data

## Save -------------------------------------------------------------------
## Save ------------------------------------------------------------------------
usethis::use_data(metrics_by_geo_and_ch_flag_df, overwrite = TRUE)

# Cleanup -----------------------------------------------------------------
# Cleanup ----------------------------------------------------------------------

# Disconnect
DBI::dbDisconnect(con)
rm(list = ls())

# Remove vars specific to script
remove_vars <- setdiff(ls(), keep_vars)

# Remove objects and clean environment
rm(list = remove_vars, remove_vars); gc()
Loading

0 comments on commit 69631d4

Please sign in to comment.