From 8ada08ed01d844f65c2f95f6a78cbdb8aa75e0ed Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Thu, 20 Jun 2024 14:45:00 +0100 Subject: [PATCH 1/9] cqc funs --- .../workflow/01_upload_cqc_data_from_api.R | 207 +++++++++++++++++- data-raw/workflow/workflow_run_23_24.R | 47 ++++ 2 files changed, 244 insertions(+), 10 deletions(-) create mode 100644 data-raw/workflow/workflow_run_23_24.R diff --git a/data-raw/workflow/01_upload_cqc_data_from_api.R b/data-raw/workflow/01_upload_cqc_data_from_api.R index 2834b50..d55063c 100644 --- a/data-raw/workflow/01_upload_cqc_data_from_api.R +++ b/data-raw/workflow/01_upload_cqc_data_from_api.R @@ -1,23 +1,210 @@ -# Function to get api content from url -get_api_content <- function(url) { - # Get api data - data = httr::GET(url) +# Get cqc primary key from environ file +key = Sys.getenv("CQC_PRIMARY_KEY") + +# Function to get api content from url +get_api_content <- function(url){ + + # Get api data + data = httr::GET(url, httr::add_headers(`Ocp-Apim-Subscription-Key` = key)) + # Convert binary to character content = jsonlite::fromJSON(rawToChar(data$content)) - + # Return content return(content) } -# Get number of cqc pages for main api query -api_content <- get_api_content( - "https://api.cqc.org.uk/public/v1/locations?careHome=Y&page=1&perPage=1" +# Get number of pages +get_number_of_pages = function(){ + + # Define url + url = "https://api.service.cqc.org.uk/public/v1/locations" + + # Get locations overview + api_content = get_api_content(url) + + # Get number of pages + total_pages = api_content$totalPages + + # Return + return(total_pages) +} + +# Get all locations per page +get_location_ids_per_page = function(page_num){ + + # Define url + url = paste0( + "https://api.service.cqc.org.uk/public/v1/locations?page=", + page_num, + "&perPage=1000" + ) + + # Get locations overview + api_content = get_api_content(url) + + # Get locations ids + location_vec = api_content$locations$locationId + + # Return + return(location_vec) +} 
+ +# Get all locations by location_vec index +get_location_info_by_id <- function(loc_num) { + + # Paste location url with location_id + url = paste0( + "https://api.service.cqc.org.uk/public/v1/locations", + location_vec[loc_num] + ) + + # Get data + data = get_api_content(url) %>% + unlist() %>% + bind_rows() + + # Sleep if less than 2 rows + while (ncol(data) <= 2) { + Sys.sleep(0.05) + + data = get_api_content(url) %>% + unlist() %>% + bind_rows() + } + + # Return data + return(data) +} + +# Get total pages +total_pages = get_number_of_pages() + +# Get all location ids +location_vec = lapply(1:total_pages, get_location_ids_per_page) + +# Unlist into a single vector +location_vec = unlist(all_locations) + + +# Get columns names from a location id +get_col_names = function(index){ + + # Create url + url = paste0( + "https://api.service.cqc.org.uk/public/v1/locations/", + location_vec[index] + ) + + # Get data + data = get_api_content(url) + + # Column names + cols = names(data) + + # Return + return(cols) +} + +cols = lapply(1:length(location_vec), get_col_names) + + +data$assessment + + unlist() %>% + bind_rows() + + +data$uprn + +a = get_location_info_by_id(48) + +cbind( + data["name"], + data$specialisms, + data$regulatedActivities, + data["locationId"] ) -# Get number of 10k blocks required -no_of_pages = ceiling(api_content$total / 10000) + +names(data) + + + + + + +data$assessment +data$assessmentServiceGroup +data$numberOfBeds + +data$type +data$locationTypes + +cqc_cols = c( + 'name', + 'postalCode', + 'uprn', + 'locationId', + 'providerId', + 'organisationType', + 'type', + 'lastInspection', + 'deregistrationDate', + 'registrationStatus', + 'registrationDate', + 'postalAddressLine1', + 'postalAddressLine2', + 'postalAddressTownCity', + 'postalAddressCounty', + 'numberOfBeds', + 'gacServicesTypes', + 'gacServicesTypesNames', + 'regulatedActivities', + 'specialisms', + +) + + +uprn = as.numeric(uprn), +location_id, +provider_id, +last_inspection_date, 
+registration_date, +deregistration_date, +single_line_address, +postcode = toupper(gsub("[^[:alnum:]]", "", postal_code)), +nursing_home_flag = as.integer(grepl( + "Nursing home", gac_service_types_names +)), +residential_home_flag = as.integer(grepl( + "Residential home", gac_service_types_names +)), +# type, +number_of_beds = as.integer(number_of_beds), +current_rating = current_ratings_overall_rating, +key_question_names = current_ratings_overall_key_question_ratings_names, +key_question_ratings = current_ratings_overall_key_question_ratings_ratings, +cqc_date = download_date, +ods_code, +specialisms, +regulated_activities_names, +gac_service_types = gac_service_types_names, + + +which + +c = data[names(data) %in% cqc_cols] %>% + unlist() %>% + bind_rows() +c + + +data$uprn + + get_cqc_locations_details <- function(page_num) { diff --git a/data-raw/workflow/workflow_run_23_24.R b/data-raw/workflow/workflow_run_23_24.R new file mode 100644 index 0000000..66cf611 --- /dev/null +++ b/data-raw/workflow/workflow_run_23_24.R @@ -0,0 +1,47 @@ +# Load/install all required packages and functions +source("data-raw/workflow/workflow_packages.R") +source("data-raw/workflow/workflow_helpers.R") +source("data-raw/workflow/workflow_production.R") + +# Specify variables to retain at end of each script +keep_vars = c(ls(), 'keep_vars') + +# FY 22/23 --------------------------------------------------------------------- + +# 1. Get latest cqc data: 0.5hr - Run once in first epoch script +get_latest_cqc_data() + +# 2. Get latest ab plus epoch: ~2hr +get_abp_from_api( + end_date = "2024-03-31" +) + +# 3. Merge and process cqc and ab plus: ~3 mins +create_ab_plus_cqc_data( + ab_plus_data = "INT646_ABP_20230331", + cqc_data = "INT646_CQC_20230602", + start_date = "2022-04-01", + end_date = "2023-03-31" +) + +# 4. Create form level fact for records with a ch-postcode: ~11-14hr +create_form_level_patient_addresses( + address_data = "INT646_ABP_CQC_20220401_20230331" +) + +# 5. 
Match patient details against ch-postcode uprn and process: ~30-40 mins +create_care_home_address_match( + patient_address_data = "INT646_FORMS_20220401_20230331", + lookup_address_data = "INT646_ABP_CQC_20220401_20230331", + parent_uprn_data = "INT646_ABP_20230331" +) + +# 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min +# create_postcode_lookup() # Run once in first epoch script + + +# 7. Join to fact table and get non ch-postcode records within time frame: ~9 hrs +create_matched_prescription_base_table( + match_data = "INT646_MATCH_20220401_20230331", + form_data = "INT646_FORMS_20220401_20230331" +) From 3fcfb801168ed16b0fe7edaf843e362efdaa6a11 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Thu, 20 Jun 2024 17:49:30 +0100 Subject: [PATCH 2/9] updated cqc api --- .../workflow/01_upload_cqc_data_from_api.R | 212 +++++++----------- 1 file changed, 81 insertions(+), 131 deletions(-) diff --git a/data-raw/workflow/01_upload_cqc_data_from_api.R b/data-raw/workflow/01_upload_cqc_data_from_api.R index d55063c..c0c1f46 100644 --- a/data-raw/workflow/01_upload_cqc_data_from_api.R +++ b/data-raw/workflow/01_upload_cqc_data_from_api.R @@ -1,8 +1,34 @@ +# Functions and variables ------------------------------------------------------ # Get cqc primary key from environ file key = Sys.getenv("CQC_PRIMARY_KEY") +# Define cqc columns of interest +cqc_cols = c( + 'name', + 'postalCode', + 'uprn', + 'locationId', + 'providerId', + 'organisationType', + 'type', + 'lastInspection', + 'deregistrationDate', + 'registrationStatus', + 'registrationDate', + 'postalAddressLine1', + 'postalAddressLine2', + 'postalAddressTownCity', + 'postalAddressCounty', + 'numberOfBeds', + 'gacServicesTypes', + 'regulatedActivities', + 'specialisms', + 'current_ratings', + 'odsCode' +) + # Function to get api content from url get_api_content <- function(url){ @@ -20,7 +46,7 @@ get_api_content <- function(url){ get_number_of_pages = function(){ # Define 
url - url = "https://api.service.cqc.org.uk/public/v1/locations" + url = "https://api.service.cqc.org.uk/public/v1/locations?careHome=Y" # Get locations overview api_content = get_api_content(url) @@ -37,7 +63,7 @@ get_location_ids_per_page = function(page_num){ # Define url url = paste0( - "https://api.service.cqc.org.uk/public/v1/locations?page=", + "https://api.service.cqc.org.uk/public/v1/locations?careHome=Y&page=", page_num, "&perPage=1000" ) @@ -57,28 +83,30 @@ get_location_info_by_id <- function(loc_num) { # Paste location url with location_id url = paste0( - "https://api.service.cqc.org.uk/public/v1/locations", + "https://api.service.cqc.org.uk/public/v1/locations/", location_vec[loc_num] ) # Get data - data = get_api_content(url) %>% - unlist() %>% - bind_rows() + data = get_api_content(url) - # Sleep if less than 2 rows - while (ncol(data) <= 2) { - Sys.sleep(0.05) - - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - } + print(data) + + # Filter data + filtered_data = data[names(data) %in% cqc_cols] + + # Flat data + flat_data = filtered_data %>% + unlist() %>% + bind_rows() %>% + janitor::clean_names() # Return data - return(data) + return(flat_data) } +# Generate Output -------------------------------------------------------------- + # Get total pages total_pages = get_number_of_pages() @@ -86,35 +114,50 @@ total_pages = get_number_of_pages() location_vec = lapply(1:total_pages, get_location_ids_per_page) # Unlist into a single vector -location_vec = unlist(all_locations) +location_vec = unlist(location_vec) +# Generate appropriate number of cores +n_cores <- parallel::detectCores() - 1 -# Get columns names from a location id -get_col_names = function(index){ - - # Create url - url = paste0( - "https://api.service.cqc.org.uk/public/v1/locations/", - location_vec[index] - ) - - # Get data - data = get_api_content(url) - - # Column names - cols = names(data) - - # Return - return(cols) -} +# Set up parallel +clust <- 
parallel::makeCluster(n_cores) -cols = lapply(1:length(location_vec), get_col_names) +# Export libraries to cluster +parallel::clusterEvalQ( + cl = clust, + { + library(dplyr); + library(janitor); + library(httr); + library(jsonlite); + } +) + +# Export required objects to cluster +parallel::clusterExport( + cl = clust, + varlist = c( + "get_api_content", + "location_vec", + "key", + "cqc_cols" + ), + envir = environment() +) + +# Generate cqc details +Sys.time() +cqc_data <- parallel::parLapply( + cl = clust, + X = 1:length(location_vec), + fun = get_location_info_by_id +) +Sys.time() +# Stop Cluster +parallel::stopCluster(clust) -data$assessment - unlist() %>% - bind_rows() data$uprn @@ -136,101 +179,8 @@ names(data) -data$assessment -data$assessmentServiceGroup -data$numberOfBeds - -data$type -data$locationTypes - -cqc_cols = c( - 'name', - 'postalCode', - 'uprn', - 'locationId', - 'providerId', - 'organisationType', - 'type', - 'lastInspection', - 'deregistrationDate', - 'registrationStatus', - 'registrationDate', - 'postalAddressLine1', - 'postalAddressLine2', - 'postalAddressTownCity', - 'postalAddressCounty', - 'numberOfBeds', - 'gacServicesTypes', - 'gacServicesTypesNames', - 'regulatedActivities', - 'specialisms', - -) - - -uprn = as.numeric(uprn), -location_id, -provider_id, -last_inspection_date, -registration_date, -deregistration_date, -single_line_address, -postcode = toupper(gsub("[^[:alnum:]]", "", postal_code)), -nursing_home_flag = as.integer(grepl( - "Nursing home", gac_service_types_names -)), -residential_home_flag = as.integer(grepl( - "Residential home", gac_service_types_names -)), -# type, -number_of_beds = as.integer(number_of_beds), -current_rating = current_ratings_overall_rating, -key_question_names = current_ratings_overall_key_question_ratings_names, -key_question_ratings = current_ratings_overall_key_question_ratings_ratings, -cqc_date = download_date, -ods_code, -specialisms, -regulated_activities_names, -gac_service_types = 
gac_service_types_names, - - -which - -c = data[names(data) %in% cqc_cols] %>% - unlist() %>% - bind_rows() -c - - -data$uprn - - - -get_cqc_locations_details <- function(page_num) { - - # Url with page number pasted inside - url = paste0( - "https://api.cqc.org.uk/public/v1/locations?careHome=Y&page=", - page_num, - "&perPage=10000" - ) - - # Get api data - data = get_api_content(url) - - # Get location info as df within list - locations = data$locations - - # Return location info df - return(locations) -} -# Get all location info, with 10k records per page retrieved -cqc_locations <- lapply(1:no_of_pages, get_cqc_locations_details) %>% - bind_rows() -# Vector of locations -location_vec = cqc_locations %>% pull(locationId) # Function to query cqc api get_cqc_api_location_data <- function(loc_num) { From 36b3c425fbe597d5960bd9076432377a616a9fad Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 21 Jun 2024 11:19:26 +0100 Subject: [PATCH 3/9] updated cqc script finished --- .../workflow/01_upload_cqc_data_from_api.R | 297 ++++++------------ 1 file changed, 104 insertions(+), 193 deletions(-) diff --git a/data-raw/workflow/01_upload_cqc_data_from_api.R b/data-raw/workflow/01_upload_cqc_data_from_api.R index c0c1f46..7c1c435 100644 --- a/data-raw/workflow/01_upload_cqc_data_from_api.R +++ b/data-raw/workflow/01_upload_cqc_data_from_api.R @@ -4,6 +4,9 @@ # Get cqc primary key from environ file key = Sys.getenv("CQC_PRIMARY_KEY") +# Get current year_month +download_date <- as.integer(format(today(), "%Y%m%d")) + # Define cqc columns of interest cqc_cols = c( 'name', @@ -22,12 +25,26 @@ cqc_cols = c( 'postalAddressTownCity', 'postalAddressCounty', 'numberOfBeds', - 'gacServicesTypes', - 'regulatedActivities', + 'gacServiceTypes', 'specialisms', - 'current_ratings', + 'currentRatings', 'odsCode' -) + ) + +# Define provider cols +provider_cols = c( + "providerId", + "name", + "postalAddressLine1", + "postalAddressLine2", + "postalAddressTownCity", + 
"postalAddressCounty", + "region", + "postalCode", + "uprn", + "companiesHouseNumber", + "lastInspection" + ) # Function to get api content from url get_api_content <- function(url){ @@ -89,11 +106,34 @@ get_location_info_by_id <- function(loc_num) { # Get data data = get_api_content(url) + + # Filter data + filtered_data = data[names(data) %in% cqc_cols] + + # Flat data + flat_data = filtered_data %>% + unlist() %>% + bind_rows() %>% + janitor::clean_names() + + # Return data + return(flat_data) +} + +# Function to query cqc api +get_provider_info_by_id = function(loc_num){ + + # Paste location url with location_id + url = paste0( + "https://api.service.cqc.org.uk/public/v1/providers/", + provider_vec[loc_num] + ) - print(data) + # Get data + data = get_api_content(url) # Filter data - filtered_data = data[names(data) %in% cqc_cols] + filtered_data = data[names(data) %in% provider_cols] # Flat data flat_data = filtered_data %>% @@ -105,7 +145,16 @@ get_location_info_by_id <- function(loc_num) { return(flat_data) } -# Generate Output -------------------------------------------------------------- +# Concatenate all columns (uniquely and without NA) with same prefix +concatenate_by_prefix = function(df, prefix){ + df %>% + mutate({{ prefix }} := pmap_chr( + select(., starts_with(prefix)), + ~ paste(unique(na.omit(c(...))), collapse = "|") + )) +} + +# Generate location output ----------------------------------------------------- # Get total pages total_pages = get_number_of_pages() @@ -145,130 +194,26 @@ parallel::clusterExport( envir = environment() ) -# Generate cqc details -Sys.time() +# Generate cqc details: ~25 mins cqc_data <- parallel::parLapply( cl = clust, X = 1:length(location_vec), fun = get_location_info_by_id ) -Sys.time() # Stop Cluster parallel::stopCluster(clust) +# Format location output ------------------------------------------------------- - - -data$uprn - -a = get_location_info_by_id(48) - -cbind( - data["name"], - data$specialisms, - 
data$regulatedActivities, - data["locationId"] -) - - -names(data) - - - - - - - - - -# Function to query cqc api -get_cqc_api_location_data <- function(loc_num) { - - # Paste location url with location_id - url = paste0( - "https://api.cqc.org.uk/public/v1/locations/", - location_vec[loc_num] - ) - - # Get data - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - - while (ncol(data) <= 2) { - Sys.sleep(0.05) - - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - } - - - # Return data - return(data) -} - -# Generate appropriate number of cores -n_cores <- parallel::detectCores() - 2 - -# Set up parallel -clust <- parallel::makeCluster(n_cores) - -# Export libraries to cluster -parallel::clusterEvalQ( - cl = clust, - { - library(dplyr); - library(httr); - library(jsonlite); - } -) - -# Export required objects to cluster -parallel::clusterExport( - cl = clust, - varlist = c( - "get_api_content", - "get_cqc_api_location_data", - "location_vec" - ), - envir = environment() -) - -# Print script update -print("Now downloading CQC API location data ...") - -# Generate cqc details -cqc_details <- parallel::parLapply( - cl = clust, - X = 1:length(location_vec), - fun = get_cqc_api_location_data -) - -# Stop Cluster -parallel::stopCluster(clust) - -cqc_details_check <- cqc_details %>% map(\(x) x$locationId) -cqc_details_check <- cqc_details_check[is.na(cqc_details_check)] -num_missing_locations <- length(cqc_details_check) - -if (num_missing_locations > 0) stop("Missing locations, probably due to timeout!") - -# Get current year_month -download_date <- as.integer(format(today(), "%Y%m%d")) - -# Bind all dfs together and apply some transformations -cqc_details_df <- cqc_details %>% +# Bind all dfs together and clean column names +cqc_details_df <- cqc_data %>% bind_rows() %>% janitor::clean_names() %>% - unite_to_plural( - specialisms, - regulated_activities_names, - current_ratings_overall_key_question_ratings_names, - 
current_ratings_overall_key_question_ratings_ratings, - gac_service_types_names - ) %>% + concatenate_by_prefix(., 'specialisms') %>% + concatenate_by_prefix('current_ratings_overall_key_question_ratings_name') %>% + concatenate_by_prefix('current_ratings_overall_key_question_ratings_rating') %>% + concatenate_by_prefix('gac_service_types_name') %>% tidyr::unite( single_line_address, c( @@ -291,25 +236,25 @@ cqc_details_df <- cqc_details %>% single_line_address, postcode = toupper(gsub("[^[:alnum:]]", "", postal_code)), nursing_home_flag = as.integer(grepl( - "Nursing home", gac_service_types_names + "Nursing home", gac_service_types_name )), residential_home_flag = as.integer(grepl( - "Residential home", gac_service_types_names + "Residential home", gac_service_types_name )), # type, number_of_beds = as.integer(number_of_beds), current_rating = current_ratings_overall_rating, - key_question_names = current_ratings_overall_key_question_ratings_names, - key_question_ratings = current_ratings_overall_key_question_ratings_ratings, + key_question_names = current_ratings_overall_key_question_ratings_name, + key_question_ratings = current_ratings_overall_key_question_ratings_rating, cqc_date = download_date, ods_code, specialisms, - regulated_activities_names, - gac_service_types = gac_service_types_names, + gac_service_types = gac_service_types_name, .keep = "none" - ) + ) %>% + addressMatchR::tidy_single_line_address(single_line_address) -gc() +# Generate provider output ----------------------------------------------------- # Get provider id vec provider_vec = cqc_details_df %>% @@ -317,37 +262,8 @@ provider_vec = cqc_details_df %>% distinct() %>% pull() -# Function to query cqc api -get_cqc_api_provider_data = function(loc_num){ - - # Wait - Sys.sleep(0.05) - - # Paste location url with location_id - url = paste0( - "https://api.cqc.org.uk/public/v1/providers/", - provider_vec[loc_num] - ) - - # Get data - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - 
- while (ncol(data) <= 2) { - Sys.sleep(0.05) - - data = get_api_content(url) %>% - unlist() %>% - bind_rows() - } - - # Return data - return(data) -} - # Generate appropriate number of cores -n_cores <- parallel::detectCores() - 2 +n_cores <- parallel::detectCores() - 1 # Set up parallel clust <- parallel::makeCluster(n_cores) @@ -357,6 +273,7 @@ parallel::clusterEvalQ( cl = clust, { library(dplyr); + library(janitor); library(httr); library(jsonlite); } @@ -367,46 +284,42 @@ parallel::clusterExport( cl = clust, varlist = c( "get_api_content", - "get_cqc_api_provider_data", - "provider_vec" + "provider_vec", + "key", + "provider_cols" ), envir = environment() ) -# Print script update -print("Now downloading CQC API provider data ...") - -# Generate cqc details -cqc_details <- parallel::parLapply( +# Generate provider details: ~15 mins +provider_data <- parallel::parLapply( cl = clust, X = 1:length(provider_vec), - fun = get_cqc_api_provider_data + fun = get_provider_info_by_id ) # Stop Cluster parallel::stopCluster(clust) -cqc_details_check <- cqc_details %>% map(\(x) x$locationId) -cqc_details_check <- cqc_details_check[is.na(cqc_details_check)] -num_missing_providers <- length(cqc_details_check) - -if (num_missing_providers > 0) stop("Missing providers, probably due to timeout!") +# Process provider output ------------------------------------------------------ # Process the provider data -cqc_providers_df = cqc_details %>% +cqc_providers_df = provider_data %>% bind_rows() %>% janitor::clean_names() %>% - mutate( - # Paste fields together to create single line address - provider_sla = toupper(paste( - ifelse(is.na(name), "", name), - ifelse(is.na(postal_address_line1), "", postal_address_line1), - ifelse(is.na(postal_address_town_city), "", postal_address_town_city), - ifelse(is.na(region), "", region) - )), - # Postcode Cleaning - postal_code = toupper(gsub("[^[:alnum:]]", "", postal_code)) + tidyr::unite( + provider_sla, + c( + name, + postal_address_line1, + 
postal_address_line2, + postal_address_town_city, + postal_address_county + ), + sep = " ", + na.rm = TRUE ) %>% + mutate(postal_code = toupper(gsub("[^[:alnum:]]", "", postal_code))) %>% select( provider_id, provider_uprn = uprn, @@ -414,21 +327,18 @@ cqc_providers_df = cqc_details %>% provider_sla, provider_postcode = postal_code, provider_last_inspection_date = last_inspection_date - ) + ) %>% + addressMatchR::tidy_single_line_address(provider_sla) + +# Join location and providder data then save ----------------------------------- # Process the cqc df output, in preparation for matching -cqc_process_df <- cqc_details_df %>% +cqc_final_df <- cqc_details_df %>% left_join(cqc_providers_df, by = "provider_id") %>% - addressMatchR::tidy_single_line_address(single_line_address) %>% - addressMatchR::tidy_single_line_address(provider_sla) %>% rename_with(toupper) -# Check na count per column -print("NA count per column") -print(colSums(is.na(cqc_process_df))) - # Create table name -table_name <- paste0("INT646_CQC_", download_date) +table_name <- paste0("CQC_BASE_", download_date) # Set up connection to the DB con <- nhsbsaR::con_nhsbsa(database = "DALP") @@ -436,9 +346,10 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Drop table if it exists already drop_table_if_exists_db(table_name) -# Upload to DB... 
-cqc_process_df %>% write_table_long_chars(con, table_name) -# ...and add indexes +# Upload to DB +cqc_final_df %>% write_table_long_chars(con, table_name) + +# Add indexes con %>% add_indexes(table_name, c("UPRN", "POSTCODE")) # Grant access From c1fe5e7fffd9101859a3f24df1b5578334d975e0 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 21 Jun 2024 14:16:01 +0100 Subject: [PATCH 4/9] abp from dall_ref --- data-raw/workflow/02c_ab_data_from_dall_ref.R | 122 ++++++++++++++++++ data-raw/workflow/workflow_run_23_24.R | 2 +- 2 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 data-raw/workflow/02c_ab_data_from_dall_ref.R diff --git a/data-raw/workflow/02c_ab_data_from_dall_ref.R b/data-raw/workflow/02c_ab_data_from_dall_ref.R new file mode 100644 index 0000000..ddd2e6a --- /dev/null +++ b/data-raw/workflow/02c_ab_data_from_dall_ref.R @@ -0,0 +1,122 @@ +source("data-raw/workflow/workflow_packages.R") +source("data-raw/workflow/workflow_helpers.R") + +# Identify closest dall_ref release date --------------------------------------- + +# Define end date +end_date = "2024-03-31" + +# Set up connection to the DB +con <- nhsbsaR::con_nhsbsa(database = "DALP") + +# Connect to ab plus in dall_ref +ab <- con %>% + tbl(from = in_schema("DALL_REF", "ADDRESSBASE_PLUS")) + +# Get closest release date +select_date = ab %>% + select(RELEASE_DATE) %>% + distinct() %>% + collect() %>% + mutate( + SELECT_DATE = as.Date(end_date), + RELEASE_DATE = as.Date(RELEASE_DATE), + DIFF = as.integer(abs(RELEASE_DATE - SELECT_DATE)), + DB_DATE = as.integer(gsub("-", "", SELECT_DATE)) + ) %>% + filter(DIFF == min(DIFF)) %>% + select(DB_DATE) %>% + pull() + +# Filter ab by most appropriate release date +ab = ab %>% filter(TO_NUMBER(TO_CHAR(RELEASE_DATE, 'YYYYMMDD')) == db_date) + +# Temp table just with cleaned geo and dpa sla --------------------------------- + +# Define temp table name +table_name_temp = "TEMP_AB_PLUS" + +# Check if temp table exists 
+drop_table_if_exists_db(table_name_temp) + +# Sla generation and clean +ab_sla = ab %>% + # Create CH flag + mutate(CH_FLAG = ifelse(CLASS == "RI01", 1L, 0L)) %>% + # SLA creation plus formatting + addressMatchR::calc_addressbase_plus_dpa_single_line_address() %>% + addressMatchR::calc_addressbase_plus_geo_single_line_address() %>% + addressMatchR::tidy_single_line_address(col = DPA_SINGLE_LINE_ADDRESS) %>% + addressMatchR::tidy_single_line_address(col = GEO_SINGLE_LINE_ADDRESS) %>% + select( + UPRN, + PARENT_UPRN, + POSTCODE, + DPA_SINGLE_LINE_ADDRESS, + GEO_SINGLE_LINE_ADDRESS, + CH_FLAG, + RELEASE_DATE + ) + +# Save intermediate table under temp table name +ab_sla %>% + compute( + name = table_name_temp, + temporary = FALSE + ) + +# Re-connect to temp table and finish processing ------------------------------- + +# Connect and process +ab_plus = con %>% + tbl(from = table_name_temp) %>% + nhsbsaR::oracle_merge_strings( + first_col = "DPA_SINGLE_LINE_ADDRESS", + second_col = "GEO_SINGLE_LINE_ADDRESS", + merge_col = "CORE_SINGLE_LINE_ADDRESS" + ) %>% + select( + UPRN, + PARENT_UPRN, + POSTCODE, + DPA_SINGLE_LINE_ADDRESS, + GEO_SINGLE_LINE_ADDRESS, + CORE_SINGLE_LINE_ADDRESS, + CH_FLAG, + RELEASE_DATE + ) + +# Define table name +table_name = paste0("ABP_", db_date) + +# Drop table if it exists already +drop_table_if_exists_db(table_name) + +# Write the table back to the DB with indexes +ab_plus %>% + compute( + name = table_name, + temporary = FALSE, + indexes = c("UPRN", "PARENT_UPRN", "POSTCODE") + ) + +# Drop intermediate tables now the final table is done +drop_table_if_exists_db(table_name_temp) + +# Grant access +c("MIGAR", "ADNSH", "MAMCP") %>% grant_table_access (table_name) + +# Disconnect connection to database +DBI::dbDisconnect(con) + +# Print that table has been created +print(paste0("This script has created table: ", table_name)) + +# Return to project directory +setwd(project_dir) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), 
keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/workflow/workflow_run_23_24.R b/data-raw/workflow/workflow_run_23_24.R index 66cf611..8f24682 100644 --- a/data-raw/workflow/workflow_run_23_24.R +++ b/data-raw/workflow/workflow_run_23_24.R @@ -8,7 +8,7 @@ keep_vars = c(ls(), 'keep_vars') # FY 22/23 --------------------------------------------------------------------- -# 1. Get latest cqc data: 0.5hr - Run once in first epoch script +# 1. Get latest cqc data: ~1hr - Run once in first epoch script (new API code) get_latest_cqc_data() # 2. Get latest ab plus epoch: ~2hr From 49544612de75028211ac7ddd0e17d540599fa041 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 21 Jun 2024 18:22:05 +0100 Subject: [PATCH 5/9] edit --- data-raw/workflow/02c_ab_data_from_dall_ref.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-raw/workflow/02c_ab_data_from_dall_ref.R b/data-raw/workflow/02c_ab_data_from_dall_ref.R index ddd2e6a..42ccf84 100644 --- a/data-raw/workflow/02c_ab_data_from_dall_ref.R +++ b/data-raw/workflow/02c_ab_data_from_dall_ref.R @@ -87,7 +87,7 @@ ab_plus = con %>% ) # Define table name -table_name = paste0("ABP_", db_date) +table_name = paste0("ADDRESSBASE_PLUS_", db_date) # Drop table if it exists already drop_table_if_exists_db(table_name) From ad6796bd0df971db6af516017d48e304ea4d042c Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Wed, 26 Jun 2024 13:44:30 +0100 Subject: [PATCH 6/9] workflow production update --- data-raw/app/01_headline_figures_df.R | 2 +- data-raw/workflow/02c_ab_data_from_dall_ref.R | 5 ---- data-raw/workflow/03_address_base_cqc_merge.R | 3 ++- data-raw/workflow/workflow_production.R | 15 ++++++++++- data-raw/workflow/workflow_run_23_24.R | 27 +++++++++---------- .../workflow_union_3_years_in_one_table.sql | 10 +++---- 6 files changed, 35 insertions(+), 27 deletions(-) diff --git a/data-raw/app/01_headline_figures_df.R 
b/data-raw/app/01_headline_figures_df.R index 372c8a1..103025b 100644 --- a/data-raw/app/01_headline_figures_df.R +++ b/data-raw/app/01_headline_figures_df.R @@ -8,7 +8,7 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Key findings used within analysis summary text data_db %>% diff --git a/data-raw/workflow/02c_ab_data_from_dall_ref.R b/data-raw/workflow/02c_ab_data_from_dall_ref.R index 42ccf84..4a2f906 100644 --- a/data-raw/workflow/02c_ab_data_from_dall_ref.R +++ b/data-raw/workflow/02c_ab_data_from_dall_ref.R @@ -1,11 +1,6 @@ -source("data-raw/workflow/workflow_packages.R") -source("data-raw/workflow/workflow_helpers.R") # Identify closest dall_ref release date --------------------------------------- -# Define end date -end_date = "2024-03-31" - # Set up connection to the DB con <- nhsbsaR::con_nhsbsa(database = "DALP") diff --git a/data-raw/workflow/03_address_base_cqc_merge.R b/data-raw/workflow/03_address_base_cqc_merge.R index b7bbde7..d18da00 100644 --- a/data-raw/workflow/03_address_base_cqc_merge.R +++ b/data-raw/workflow/03_address_base_cqc_merge.R @@ -10,7 +10,8 @@ cqc_db <- con %>% # Create a lazy table addressbase data ab_plus_db <- con %>% - tbl(from = ab_plus_data) + tbl(from = ab_plus_data) %>% + rename(EPOCH = RELEASE_DATE) # Get 4 values for final output ab_epoch = pull_date_string(ab_plus_db, EPOCH) diff --git a/data-raw/workflow/workflow_production.R b/data-raw/workflow/workflow_production.R index b3b7c90..408e1c4 100644 --- a/data-raw/workflow/workflow_production.R +++ b/data-raw/workflow/workflow_production.R @@ -34,6 +34,19 @@ get_abp_from_os = function(epoch_year){ } +#' @description downloads a single epoch of ab plus closest to end_date +#' @param end_date: end date as a char in format 'YYYY-MM-DD' +#' @noRd 
+get_abp_from_dall_ref = function(end_date){ + + # Assign function inputs to global env + assign("end_date", end_date, envir = globalenv()) + + # Get nearest ab plus to end date with cqc postcodes within time frame + tictoc::tic(); source("data-raw/workflow/02c_ab_data_from_dall_ref.R"); tictoc::toc(); print(Sys.time()) +} + + #' @description downloads a single epoch of ab plus closest to end_date #' @param ab_plus_data: name of the ab plus db table #' @param cqc_data: the name of the cqc db table @@ -79,6 +92,7 @@ create_care_home_address_match = function(patient_address_data, lookup_address_d tictoc::tic(); source("data-raw/workflow/05_address_match.R"); tictoc::toc(); print(Sys.time()) } + #' @description creates the postcode lookup table in the DB containing latest available (hard-coded) mappings create_postcode_lookup = function(){ @@ -87,7 +101,6 @@ create_postcode_lookup = function(){ } - #' @description gets prescription info for matched records #' @param match_data: matched address data #' @noRd diff --git a/data-raw/workflow/workflow_run_23_24.R b/data-raw/workflow/workflow_run_23_24.R index 8f24682..7827cdc 100644 --- a/data-raw/workflow/workflow_run_23_24.R +++ b/data-raw/workflow/workflow_run_23_24.R @@ -6,42 +6,41 @@ source("data-raw/workflow/workflow_production.R") # Specify variables to retain at end of each script keep_vars = c(ls(), 'keep_vars') -# FY 22/23 --------------------------------------------------------------------- +# FY 23/24 --------------------------------------------------------------------- # 1. Get latest cqc data: ~1hr - Run once in first epoch script (new API code) get_latest_cqc_data() # 2. Get latest ab plus epoch: ~2hr -get_abp_from_api( +get_abp_from_dall_ref( end_date = "2024-03-31" ) # 3. 
Merge and process cqc and ab plus: ~3 mins create_ab_plus_cqc_data( - ab_plus_data = "INT646_ABP_20230331", - cqc_data = "INT646_CQC_20230602", - start_date = "2022-04-01", - end_date = "2023-03-31" + ab_plus_data = "ADDRESSBASE_PLUS_20240516", + cqc_data = "CQC_BASE_20240621", + start_date = "2023-04-01", + end_date = "2024-03-31" ) # 4. Create form level fact for records with a ch-postcode: ~11-14hr create_form_level_patient_addresses( - address_data = "INT646_ABP_CQC_20220401_20230331" + address_data = "INT646_ABP_CQC_20230401_20240331" ) # 5. Match patient details against ch-postcode uprn and process: ~30-40 mins create_care_home_address_match( - patient_address_data = "INT646_FORMS_20220401_20230331", - lookup_address_data = "INT646_ABP_CQC_20220401_20230331", - parent_uprn_data = "INT646_ABP_20230331" + patient_address_data = "INT646_FORMS_20230401_20240331", + lookup_address_data = "INT646_ABP_CQC_20230401_20240331", + parent_uprn_data = "ADDRESSBASE_PLUS_20240516" ) # 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min -# create_postcode_lookup() # Run once in first epoch script - +create_postcode_lookup() # Run once in first epoch script # 7. 
Join to fact table and get non ch-postcode records within time frame: ~9 hrs create_matched_prescription_base_table( - match_data = "INT646_MATCH_20220401_20230331", - form_data = "INT646_FORMS_20220401_20230331" + match_data = "INT646_MATCH_20230401_20240331", + form_data = "INT646_FORMS_20230401_20240331" ) diff --git a/data-raw/workflow/workflow_union_3_years_in_one_table.sql b/data-raw/workflow/workflow_union_3_years_in_one_table.sql index e368201..277da80 100644 --- a/data-raw/workflow/workflow_union_3_years_in_one_table.sql +++ b/data-raw/workflow/workflow_union_3_years_in_one_table.sql @@ -1,13 +1,13 @@ -- To run from DALL_REF -drop table int646_base_20200401_20230331 purge; -create table int646_base_20200401_20230331 nologging compress for query low as +drop table int646_base_20200401_20240331 purge; +create table int646_base_20200401_20240331 nologging compress for query low as -select '2020/21' fy, y1.* from migar.int646_base_20200401_20210331 y1 +select '2020/21' fy, y1.* from mamcp.int646_base_20200401_20210331 y1 union all -select '2021/22' fy, y2.* from migar.int646_base_20210401_20220331 y2 +select '2021/22' fy, y2.* from mamcp.int646_base_20210401_20220331 y2 union all -select '2022/23' fy, y3.* from migar.int646_base_20220401_20230331 y3 +select '2022/23' fy, y3.* from mamcp.int646_base_20220401_20230331 y3 ; grant select on int646_base_20200401_20230331 to migar, adnsh, mamcp; From bc07d87f3fe8aafa1b704ff50a3eac8ca5c19335 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Thu, 27 Jun 2024 15:00:13 +0100 Subject: [PATCH 7/9] sql file additions --- data-raw/app/00_run_all.R | 5 + data-raw/workflow/workflow_fy_comparison.sql | 163 ++++++++++++++++++ .../workflow_union_3_years_in_one_table.sql | 2 + .../workflow_union_4_years_in_one_table.sql | 53 ++++++ data-raw/workflow/wworkflow_fy_comparison.sql | 40 +++++ 5 files changed, 263 insertions(+) create mode 100644 data-raw/workflow/workflow_fy_comparison.sql create mode 100644 
data-raw/workflow/workflow_union_4_years_in_one_table.sql create mode 100644 data-raw/workflow/wworkflow_fy_comparison.sql diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index 560a7c7..cbd114b 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -1,8 +1,12 @@ +Sys.time() +# Load library and generate base geo data source("data-raw/app/data_raw_helpers.R") source("data-raw/app/geo_data.R") +# Define vars to retain in workflow keep_vars = c(ls(), 'keep_vars', 'get_metrics') +# Run all scripts that generate an Rda file source("data-raw/app/01_headline_figures_df.R") source("data-raw/app/02_patients_age_gender_df.R") source("data-raw/app/03_patients_by_imd_df.R") @@ -11,3 +15,4 @@ source("data-raw/app/05_metrics_age_gender_df.R") source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R") source("data-raw/app/07_ch_flag_drug_df.R") source("data-raw/app/08_geo_ch_flag_drug_df.R") +Sys.time() \ No newline at end of file diff --git a/data-raw/workflow/workflow_fy_comparison.sql b/data-raw/workflow/workflow_fy_comparison.sql new file mode 100644 index 0000000..b4785ef --- /dev/null +++ b/data-raw/workflow/workflow_fy_comparison.sql @@ -0,0 +1,163 @@ + +-- STEP 1: AB Check ------------------------------------------------------------ + +-- 20/21: 29.1m +-- 21/22: 29.4m +-- 22/23: 29.7m +-- 23/24: 38.0m (no filters, such as country, applied to DALL_REF ABP) + +select /* +parallel(16) */ count(*) from mamcp.INT646_ABP_20210324 +union all +select /* +parallel(16) */ count(*) from mamcp.INT646_ABP_20220324 +union all +select /* +parallel(16) */ count(*) from mamcp.INT646_ABP_20230331 +union all +select /* +parallel(16) */ count(*) from adnsh.ADDRESSBASE_PLUS_20240516; + +-- STEP 2: CQC Check (single epoch used for initial 3 FY ----------------------- + +-- MAMCP: 29.3k +-- ADNSH: 29.9k + +select /* +parallel(16) */ count(*) from mamcp.INT646_CQC_20230602 +union all +select /* +parallel(16) */ count(*) from adnsh.CQC_BASE_20240621; + +-- STEP 
3: AB-CQC Union Check -------------------------------------------------- + +-- 20/21: 846k +-- 21/22: 834k +-- 22/23: 837k +-- 23/24: 743k + +select /*+ parallel(16) */ count(*) from mamcp.int646_abp_cqc_20200401_20210331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_abp_cqc_20210401_20220331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_abp_cqc_20220401_20230331 +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_abp_cqc_20230401_20240331; + +-- STEP 4: Form Check ---------------------------------------------------------- + +-- 20/21: 257m +-- 21/22: 264m +-- 22/23: 274m +-- 23/24: 285m + +select /*+ parallel(16) */ count(*) from mamcp.int646_forms_20200401_20210331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_forms_20210401_20220331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_forms_20220401_20230331 +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_forms_20230401_20240331; + +-- STEP 5: Match Output Check -------------------------------------------------- + +-- 20/21: 20.8m +-- 21/22: 20.9m +-- 22/23: 21.9m +-- 23/24: 22.9m + +select /*+ parallel(16) */ count(*) from mamcp.int646_match_20200401_20210331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_match_20210401_20220331 +union all +select /*+ parallel(16) */ count(*) from mamcp.int646_match_20220401_20230331 +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_match_20230401_20240331; + +-- STEP 5.1: Match Output Match Type Check ------------------------------------- + +-- Order: NON-EXACT / EXACT + +-- 20/21: 15m / 4.6m +-- 21/22: 15m / 4.7m +-- 22/23: 16m / 5.1m +-- 23/24: 17m / 5.3m + +select /*+ parallel(16) */ match_type, count(*) from mamcp.int646_match_20200401_20210331 group by match_type +union all +select /*+ parallel(16) */ match_type, count(*) from mamcp.int646_match_20210401_20220331 group by match_type +union all +select /*+ parallel(16) */ match_type,
count(*) from mamcp.int646_match_20220401_20230331 group by match_type +union all +select /*+ parallel(16) */ match_type, count(*) from adnsh.int646_match_20230401_20240331 group by match_type; + +-- STEP 5.2: Match Output CH Flag Check ---------------------------------------- + +-- Order: Y / N + +-- 20/21: 15.9m / 4.9m +-- 21/22: 16.0m / 4.9m +-- 22/23: 16.9m / 4.9m +-- 23/24: 18.6m / 4.3m + +select /*+ parallel(16) */ ch_flag, count(*) from mamcp.int646_match_20200401_20210331 group by ch_flag +union all +select /*+ parallel(16) */ ch_flag, count(*) from mamcp.int646_match_20210401_20220331 group by ch_flag +union all +select /*+ parallel(16) */ ch_flag, count(*) from mamcp.int646_match_20220401_20230331 group by ch_flag +union all +select /*+ parallel(16) */ ch_flag, count(*) from adnsh.int646_match_20230401_20240331 group by ch_flag; + +-- STEP 6: Postcode Lookup Check ----------------------------------------------- + +-- MAMCP: 224k +-- ADNSH: 224k + +select /*+ parallel(16) */ count(*) from mamcp.int646_postcode_lookup +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_postcode_lookup; + +-- STEP 7: Final Base Table Check ---------------------------------------------- + +-- 20/21: 595m +-- 21/22: 602m +-- 22/23: 598m +-- 23/24: 639m + +select /*+ parallel(16) */ count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2020/21' +union all +select /*+ parallel(16) */ count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2021/22' +union all +select /*+ parallel(16) */ count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2022/23' +union all +select /*+ parallel(16) */ count(*) from adnsh.int646_base_20230401_20240331; + +-- STEP 7.1: Final Base Table Match Type Check --------------------------------- + +-- Order: NON-EXACT / EXACT / NO MATCH / DOUBLE KW / SINGLE KW / PATIENT COUNT + +-- 20/21: 34.5m / 10.3m / 548m +-- 21/22: 34.3m / 10.6m / 555m +-- 22/23: 35.3m / 11.1m / 550m +-- 23/24: 37.7m / 11.6m / 589m + 
+select /*+ parallel(16) */ fy, match_type, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2020/21' group by fy, match_type +union all +select /*+ parallel(16) */ fy, match_type, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2021/22' group by fy, match_type +union all +select /*+ parallel(16) */ fy, match_type, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2022/23' group by fy, match_type +union all +select /*+ parallel(16) */ '2023/24' as fy, match_type, count(*) from adnsh.int646_base_20230401_20240331 group by '2023/24', match_type order by fy, match_type; + +-- STEP 7.2: Final Base Table CH Flag Check ------------------------------------ + +-- Order: Y / N + +-- 20/21: 35m / 560m +-- 21/22: 35m / 566m +-- 22/23: 37m / 561m +-- 23/24: 40m / 599m + +select /*+ parallel(16) */ fy, ch_flag, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2020/21' group by fy, ch_flag +union all +select /*+ parallel(16) */ fy, ch_flag, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2021/22' group by fy, ch_flag +union all +select /*+ parallel(16) */ fy, ch_flag, count(*) from dall_ref.int646_base_20200401_20230331 where fy = '2022/23' group by fy, ch_flag +union all +select /*+ parallel(16) */ '2023/24' as fy, ch_flag, count(*) from adnsh.int646_base_20230401_20240331 group by '2023/24', ch_flag order by fy, ch_flag; + +-------------------------------------------------------------------------------- \ No newline at end of file diff --git a/data-raw/workflow/workflow_union_3_years_in_one_table.sql b/data-raw/workflow/workflow_union_3_years_in_one_table.sql index 277da80..f28bfd2 100644 --- a/data-raw/workflow/workflow_union_3_years_in_one_table.sql +++ b/data-raw/workflow/workflow_union_3_years_in_one_table.sql @@ -8,6 +8,8 @@ union all select '2021/22' fy, y2.* from mamcp.int646_base_20210401_20220331 y2 union all select '2022/23' fy, y3.* from mamcp.int646_base_20220401_20230331 y3 +union all
+select '2023/24' fy, y4.* from adnsh.int646_base_20230401_20240331 y4 ; grant select on int646_base_20200401_20230331 to migar, adnsh, mamcp; diff --git a/data-raw/workflow/workflow_union_4_years_in_one_table.sql b/data-raw/workflow/workflow_union_4_years_in_one_table.sql new file mode 100644 index 0000000..e5b9af7 --- /dev/null +++ b/data-raw/workflow/workflow_union_4_years_in_one_table.sql @@ -0,0 +1,53 @@ +-- To run from DALL_REF + +drop table int646_base_20200401_20240331 purge; +create table int646_base_20200401_20240331 nologging compress for query low as + +select * from dall_ref.int646_base_20200401_20230331 +union all +select '2023/24' fy, y4.* from adnsh.int646_base_20230401_20240331 y4 +; + +grant select on int646_base_20200401_20240331 to migar, adnsh, mamcp; + +create index int646_i01 on int646_base_20200401_20240331 (fy); +create index int646_i02 on int646_base_20200401_20240331 (fy, year_month); + +create index int646_i03 on int646_base_20200401_20240331 (fy, age_band, gender); +create index int646_i04 on int646_base_20200401_20240331 (fy, age_band, gender, pcd_region_code, pcd_region_name); +create index int646_i05 on int646_base_20200401_20240331 (fy, age_band, gender, pcd_icb_code, pcd_icb_name); +create index int646_i06 on int646_base_20200401_20240331 (fy, age_band, gender, pcd_lad_code, pcd_lad_name); +create index int646_i07 on int646_base_20200401_20240331 (fy, imd_decile); +create index int646_i08 on int646_base_20200401_20240331 (ch_flag); +create index int646_i09 on int646_base_20200401_20240331 (fy, ch_flag); +create index int646_i10 on int646_base_20200401_20240331 (fy, nursing_home_flag); +create index int646_i11 on int646_base_20200401_20240331 (fy, residential_home_flag); + +create index int646_i12 on int646_base_20200401_20240331 (fy, ch_flag, pcd_region_code, pcd_region_name); +create index int646_i13 on int646_base_20200401_20240331 (fy, ch_flag, pcd_icb_code, pcd_icb_name); +create index int646_i14 on 
int646_base_20200401_20240331 (fy, ch_flag, pcd_lad_code, pcd_lad_name); + +create index int646_i15 on int646_base_20200401_20240331 (fy, chapter_descr, pcd_region_code, pcd_region_name); +create index int646_i16 on int646_base_20200401_20240331 (fy, chapter_descr, pcd_icb_code, pcd_icb_name); +create index int646_i17 on int646_base_20200401_20240331 (fy, chapter_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i18 on int646_base_20200401_20240331 (fy, section_descr, pcd_region_code, pcd_region_name); +create index int646_i19 on int646_base_20200401_20240331 (fy, section_descr, pcd_icb_code, pcd_icb_name); +create index int646_i20 on int646_base_20200401_20240331 (fy, section_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i21 on int646_base_20200401_20240331 (fy, paragraph_descr, pcd_region_code, pcd_region_name); +create index int646_i22 on int646_base_20200401_20240331 (fy, paragraph_descr, pcd_icb_code, pcd_icb_name); +create index int646_i23 on int646_base_20200401_20240331 (fy, paragraph_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i24 on int646_base_20200401_20240331 (fy, chemical_substance_bnf_descr, pcd_region_code, pcd_region_name); +create index int646_i25 on int646_base_20200401_20240331 (fy, chemical_substance_bnf_descr, pcd_icb_code, pcd_icb_name); +create index int646_i26 on int646_base_20200401_20240331 (fy, chemical_substance_bnf_descr, pcd_lad_code, pcd_lad_name); + +create index int646_i27 on int646_base_20200401_20240331 (uprn_flag); + +create index int646_i28 on int646_base_20200401_20240331 (fy, year_month, nhs_no); +create index int646_i29 on int646_base_20200401_20240331 (fy, year_month, pcd_region_code, pcd_region_name, nhs_no); +create index int646_i30 on int646_base_20200401_20240331 (fy, year_month, pcd_icb_code, pcd_icb_name, nhs_no); +create index int646_i31 on int646_base_20200401_20240331 (fy, year_month, pcd_lad_code, pcd_lad_name, nhs_no); + +create index int646_i32 on 
int646_base_20200401_20240331 (fy, age_band, gender, ch_flag, year_month, nhs_no); \ No newline at end of file diff --git a/data-raw/workflow/wworkflow_fy_comparison.sql b/data-raw/workflow/wworkflow_fy_comparison.sql new file mode 100644 index 0000000..1f5fc6f --- /dev/null +++ b/data-raw/workflow/wworkflow_fy_comparison.sql @@ -0,0 +1,40 @@ + + +-- STEP 3: AB-CQC Union Check -------------------------------------------------- + + +-- STEP 4: Form Check ---------------------------------------------------------- + +-- 20/21: 878m +-- 21/22: 885m +-- 22/23: 880m +-- 23/24: 639m + +select /* +parallel(16) */ count(*) from mamcp.int646_forms_20200401_20210331 +union all +select /* +parallel(16) */ count(*) from mamcp.int646_forms_20210401_20220331 +union all +select /* +parallel(16) */ count(*) from mamcp.int646_forms_20220401_20230331 +union all +select /* +parallel(16) */ count(*) from adnsh.int646_forms_20230401_20240331; + +-- STEP 5: Match Output Check -------------------------------------------------- + +-- STEP 6: Postcode Lookup Check ----------------------------------------------- + +-- STEP 7: Final Base Table Check ---------------------------------------------- + +-- 20/21: 878m +-- 21/22: 885m +-- 22/23: 880m +-- 23/24: 639m + +select count(*) from mamcp.int646_forms_20200401_20210331 +union all +select count(*) from mamcp.int646_forms_20210401_20220331 +union all +select count(*) from mamcp.int646_forms_20220401_20230331 +union all +select count(*) from adnsh.int646_forms_20230401_20240331; + +-------------------------------------------------------------------------------- \ No newline at end of file From 69631d45f81ff4ac577431f0fadea30e87470f63 Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Fri, 28 Jun 2024 14:44:15 +0100 Subject: [PATCH 8/9] run_all script amendments --- data-raw/app/00_run_all.R | 22 +++++----- data-raw/app/01_headline_figures_df.R | 9 +++- data-raw/app/02_patients_age_gender_df.R | 15 +++++-- data-raw/app/03_patients_by_imd_df.R | 
11 ++++- .../app/04_metrics_by_ch_type_85_split_df.R | 21 ++++++--- data-raw/app/05_metrics_age_gender_df.R | 13 ++++-- .../app/06_metrics_by_geo_and_ch_flag_df.R | 44 +++++++++++++------ data-raw/app/07_ch_flag_drug_df.R | 16 ++++--- data-raw/app/08_geo_ch_flag_drug_df.R | 3 +- data-raw/workflow/07_item_level_base.R | 1 - data-raw/workflow/workflow_run_22_23.R | 1 - 11 files changed, 108 insertions(+), 48 deletions(-) diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index cbd114b..f79c3b7 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -1,18 +1,20 @@ Sys.time() # Load library and generate base geo data +library(tictoc) source("data-raw/app/data_raw_helpers.R") source("data-raw/app/geo_data.R") # Define vars to retain in workflow -keep_vars = c(ls(), 'keep_vars', 'get_metrics') +keep_vars = c(ls(), 'keep_vars') # Run all scripts that generate an Rda file -source("data-raw/app/01_headline_figures_df.R") -source("data-raw/app/02_patients_age_gender_df.R") -source("data-raw/app/03_patients_by_imd_df.R") -source("data-raw/app/04_metrics_by_ch_type_df.R") -source("data-raw/app/05_metrics_age_gender_df.R") -source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R") -source("data-raw/app/07_ch_flag_drug_df.R") -source("data-raw/app/08_geo_ch_flag_drug_df.R") -Sys.time() \ No newline at end of file +tic(); source("data-raw/app/01_headline_figures_df.R"); toc() +tic(); source("data-raw/app/02_patients_age_gender_df.R"); toc() +tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() +tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() +tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() +tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() +tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() +tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() +Sys.time() + diff --git a/data-raw/app/01_headline_figures_df.R b/data-raw/app/01_headline_figures_df.R index 
103025b..e69e5db 100644 --- a/data-raw/app/01_headline_figures_df.R +++ b/data-raw/app/01_headline_figures_df.R @@ -8,6 +8,7 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data_db <- con %>% + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Key findings used within analysis summary text @@ -104,4 +105,10 @@ mod_headline_figures_df = rbind(annual_df, monthly_df) usethis::use_data(mod_headline_figures_df, overwrite = TRUE) # Disconnect from database -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() \ No newline at end of file diff --git a/data-raw/app/02_patients_age_gender_df.R b/data-raw/app/02_patients_age_gender_df.R index 2bbbc94..3de3c2d 100644 --- a/data-raw/app/02_patients_age_gender_df.R +++ b/data-raw/app/02_patients_age_gender_df.R @@ -1,5 +1,5 @@ -# Running time ~10 min +# Running time ~10 min library(dplyr) library(dbplyr) devtools::load_all() @@ -9,14 +9,14 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con |> - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Add a dummy overall column base_db <- base_db |> mutate(OVERALL = "Overall") # Loop over each geography and aggregate using purrr's map function approach - patients_by_fy_geo_age_gender_fun <- function(geography_name) { # Identify geography cols @@ -59,6 +59,7 @@ patients_by_fy_geo_age_gender_fun <- function(geography_name) { } +# Map function patients_by_fy_geo_age_gender_df <- purrr::map( names(geographies), patients_by_fy_geo_age_gender_fun @@ -74,6 +75,7 @@ 
patients_by_fy_geo_age_gender_df <- #PCT_PATIENTS = janitor::round_half_up(PCT_PATIENTS, 1) ) +# Calculate patient proportions patients_by_fy_geo_age_gender_df <- patients_by_fy_geo_age_gender_df |> group_by(CH_FLAG, FY, GEOGRAPHY, SUB_GEOGRAPHY_CODE, SUB_GEOGRAPHY_NAME) |> mutate( @@ -121,4 +123,9 @@ usethis::use_data( # Disconnect from database DBI::dbDisconnect(con) -rm(list = ls()); gc() + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/03_patients_by_imd_df.R b/data-raw/app/03_patients_by_imd_df.R index b5f2d09..960d8ab 100644 --- a/data-raw/app/03_patients_by_imd_df.R +++ b/data-raw/app/03_patients_by_imd_df.R @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Count care home patients in each decile mod_patients_by_imd_df <- fact_db %>% @@ -37,4 +38,10 @@ mod_patients_by_imd_df <- fact_db %>% usethis::use_data(mod_patients_by_imd_df, overwrite = TRUE) # Disconnect -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/04_metrics_by_ch_type_85_split_df.R b/data-raw/app/04_metrics_by_ch_type_85_split_df.R index 4dacdc7..45ed56f 100644 --- a/data-raw/app/04_metrics_by_ch_type_85_split_df.R +++ b/data-raw/app/04_metrics_by_ch_type_85_split_df.R @@ -1,11 +1,9 @@ # Initial setup ----------------------------------------------------------- # Expected run time ~35 minutes @parallel 24 - library(dplyr) library(dbplyr) library(tidyr) - 
devtools::load_all() # Set up connection to DALP @@ -17,7 +15,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # Initial manipulation to create CH_TYPE column, later to be grouped by init_db <- base_db %>% @@ -48,6 +47,7 @@ init_db <- base_db %>% ) ) +# Union both initi_db variants init_db <- init_db %>% union( init_db %>% @@ -57,6 +57,7 @@ init_db <- init_db %>% ## Process ---------------------------------------------------------------- +# Get metrics metrics_by_ch_type_85_split_df <- get_metrics( init_db, first_grouping = c( @@ -73,6 +74,7 @@ metrics_by_ch_type_85_split_df <- get_metrics( ) ) +# Generate age band categories metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>% mutate( AGE_BAND = dplyr::case_match( @@ -84,9 +86,16 @@ metrics_by_ch_type_85_split_df <- metrics_by_ch_type_85_split_df %>% ) %>% dplyr::relocate(AGE_BAND, .after = CH_TYPE) -## Save ------------------------------------------------------------------- +## Save ------------------------------------------------------------------------ usethis::use_data(metrics_by_ch_type_85_split_df, overwrite = TRUE) -# Cleanup ----------------------------------------------------------------- +# Cleanup ---------------------------------------------------------------------- + +# Disconnect DBI::dbDisconnect(con) -rm(list = ls()) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/05_metrics_age_gender_df.R b/data-raw/app/05_metrics_age_gender_df.R index c80f90a..052ade0 100644 --- a/data-raw/app/05_metrics_age_gender_df.R +++ b/data-raw/app/05_metrics_age_gender_df.R @@ -1,5 +1,6 @@ # Running time ~35 min +# 
Libraries and functions library(dplyr) library(dbplyr) devtools::load_all() @@ -10,10 +11,11 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Item-level base table base_db <- con |> - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) |> + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) %>% filter(GENDER %in% c("Male", "Female")) - +# Get metrics metrics_by_age_gender_and_ch_flag_df <- get_metrics( base_db, first_grouping = c( @@ -46,4 +48,9 @@ usethis::use_data( # Disconnect from database DBI::dbDisconnect(con) -rm(list = ls()); gc() + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R b/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R index d7ee584..6851975 100644 --- a/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R +++ b/data-raw/app/06_metrics_by_geo_and_ch_flag_df.R @@ -1,28 +1,30 @@ -# Initial setup ----------------------------------------------------------- +# Initial setup ---------------------------------------------------------------- # Expected run time ~40 minutes @parallel 24 +# Library and functions library(dplyr) library(dbplyr) library(stringr) library(glue) library(purrr) - devtools::load_all() # Set up connection to DALP con <- nhsbsaR::con_nhsbsa(database = "DALP") -# Data validation --------------------------------------------------------- +# Data validation -------------------------------------------------------------- -## Setup ------------------------------------------------------------------ +## Setup ----------------------------------------------------------------------- +# Distinct postcode-related fields PCD <- con %>% tbl(from = "INT646_POSTCODE_LOOKUP") %>% select(ends_with("CODE"), ends_with("NAME")) %>% distinct() %>% collect() +# function to transform fields by 
geography fields transform_PCD <- function(data, geography) { data %>% select(starts_with(glue("PCD_{geography}"))) %>% @@ -33,19 +35,22 @@ transform_PCD <- function(data, geography) { filter(!is.na(SUB_GEOGRAPHY_NAME)) } +# Generate function output PCD_list <- list( REGION = PCD %>% transform_PCD("REGION"), ICB = PCD %>% transform_PCD("ICB"), LAD = PCD %>% transform_PCD("LAD") ) +# Define gis list GIS_list <- geo_data_validation # Check sub-geography codes and names match exactly between PCD and GIS; script # will stop if not -## Check sub-geography codes ---------------------------------------------- +## Check sub-geography codes --------------------------------------------------- +# Generate check check_sub_geo_codes <- list( in_GIS_only = map2( GIS_list, @@ -59,6 +64,7 @@ check_sub_geo_codes <- list( ) ) +# Stop if check fails stopifnot( "Some difference in geo codes: check `check_sub_geo_codes`"= { character(0) == check_sub_geo_codes %>% @@ -67,8 +73,9 @@ stopifnot( } ) -## Check sub-geography names ---------------------------------------------- +## Check sub-geography names --------------------------------------------------- +# Generate check check_sub_geo_names <- list( in_GIS_only = map2( GIS_list, @@ -82,6 +89,7 @@ check_sub_geo_names <- list( ) ) +# Stop if check fails stopifnot( "Some difference in geo names: check `check_sub_geo_names`"= { character(0) == check_sub_geo_names %>% @@ -90,13 +98,14 @@ stopifnot( } ) -# Data prep --------------------------------------------------------------- +# Data prep -------------------------------------------------------------------- -## Setup ------------------------------------------------------------------ +## Setup ----------------------------------------------------------------------- # Item-level base table base_db <- con %>% - tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", 
"INT646_BASE_20200401_20240331")) # Aggregate by a geography aggregate_by_geo <- function(geography_name) { @@ -141,21 +150,28 @@ aggregate_by_geo <- function(geography_name) { rename(!!!geography_cols) } -## Process ---------------------------------------------------------------- +## Process --------------------------------------------------------------------- metrics_by_geo_and_ch_flag_df <- names(geographies)[-1] %>% map(aggregate_by_geo) %>% list_rbind() -## Format ----------------------------------------------------------------- +## Format ---------------------------------------------------------------------- metrics_by_geo_and_ch_flag_df <- metrics_by_geo_and_ch_flag_df %>% mutate(CH_FLAG = as.logical(CH_FLAG)) %>% filter(!is.na(SUB_GEOGRAPHY_NAME)) %>% format_data_raw("CH_FLAG") %>% suppressWarnings() # We do not have Overall in this data -## Save ------------------------------------------------------------------- +## Save ------------------------------------------------------------------------ usethis::use_data(metrics_by_geo_and_ch_flag_df, overwrite = TRUE) -# Cleanup ----------------------------------------------------------------- +# Cleanup ---------------------------------------------------------------------- + +# Disconnect DBI::dbDisconnect(con) -rm(list = ls()) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() diff --git a/data-raw/app/07_ch_flag_drug_df.R b/data-raw/app/07_ch_flag_drug_df.R index 17270a0..6cc3792 100644 --- a/data-raw/app/07_ch_flag_drug_df.R +++ b/data-raw/app/07_ch_flag_drug_df.R @@ -1,5 +1,4 @@ - # Library library(dplyr) library(dbplyr) @@ -9,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - dplyr::tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + 
tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # BNF columns bnf_cols = c( @@ -91,7 +91,7 @@ get_geo_bnf_prop = function(index){ BNF_PARENT = unique(df$BNF_PARENT), BNF_CHILD = unique(df$BNF_CHILD), METRIC = unique(df$METRIC) - ) %>% + ) %>% left_join(df) %>% mutate(VALUE = ifelse(is.na(VALUE), 0, VALUE)) } @@ -211,6 +211,12 @@ mod_ch_flag_drug_df %>% count(FY, CH_FLAG, BNF_PARENT) usethis::use_data(mod_ch_flag_drug_df, overwrite = TRUE) # Disconnect -DBI::dbDisconnect(con); rm(list = ls()); gc() +DBI::dbDisconnect(con) + +# Remove vars specific to script +remove_vars <- setdiff(ls(), keep_vars) + +# Remove objects and clean environment +rm(list = remove_vars, remove_vars); gc() -#------------------------------------------------------------------------------- +#------------------------------------------------------------------------------- \ No newline at end of file diff --git a/data-raw/app/08_geo_ch_flag_drug_df.R b/data-raw/app/08_geo_ch_flag_drug_df.R index dab2966..ff66a34 100644 --- a/data-raw/app/08_geo_ch_flag_drug_df.R +++ b/data-raw/app/08_geo_ch_flag_drug_df.R @@ -8,7 +8,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from the item level base table fact_db <- con %>% - dplyr::tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + #tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20240331")) # BNF columns bnf_cols = c( diff --git a/data-raw/workflow/07_item_level_base.R b/data-raw/workflow/07_item_level_base.R index 12c0278..1f65078 100644 --- a/data-raw/workflow/07_item_level_base.R +++ b/data-raw/workflow/07_item_level_base.R @@ -199,7 +199,6 @@ presc_db = presc_db %>% PRESCRIBER_CODE = PRESCRIBER_LTST_CDE ) - # Process form fact form_db = form_db %>% select( diff --git a/data-raw/workflow/workflow_run_22_23.R b/data-raw/workflow/workflow_run_22_23.R index 3c29c4d..18893e3 100644 --- 
a/data-raw/workflow/workflow_run_22_23.R +++ b/data-raw/workflow/workflow_run_22_23.R @@ -39,7 +39,6 @@ create_care_home_address_match( # 6. Create postcode lookup table (latest available mappings) for joining in the next step: ~5 min # create_postcode_lookup() # Run once in first epoch script - # 7. Join to fact table and get non ch-postcode records within time frame: ~9 hrs create_matched_prescription_base_table( match_data = "INT646_MATCH_20220401_20230331", From 33efb77f06fb583f014c5bd189ee92cf856a40ec Mon Sep 17 00:00:00 2001 From: Adnan Shroufi Date: Tue, 2 Jul 2024 12:12:30 +0100 Subject: [PATCH 9/9] eda edit --- EDA/eda_parent_uprn_and_pat_threshold.R | 7 ++++--- data-raw/app/00_run_all.R | 14 +++++++------- data-raw/app/04_metrics_by_ch_type_85_split_df.R | 2 +- data-raw/app/data_raw_helpers.R | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/EDA/eda_parent_uprn_and_pat_threshold.R b/EDA/eda_parent_uprn_and_pat_threshold.R index 585f1ae..5e796de 100644 --- a/EDA/eda_parent_uprn_and_pat_threshold.R +++ b/EDA/eda_parent_uprn_and_pat_threshold.R @@ -9,7 +9,8 @@ con <- nhsbsaR::con_nhsbsa(database = "DALP") # Create a lazy table from year month dim table in DWCP data <- con %>% - tbl(from = in_schema("ADNSH", "INT646_BASE_20210401_20220331")) + tbl(from = in_schema("DALL_REF", "INT646_BASE_20200401_20230331")) %>% + filter(FY == "2020/21") # Get chapter info chapters = con %>% @@ -39,7 +40,7 @@ df_std = data %>% NURSING_HOME_FLAG = max(NURSING_HOME_FLAG), RESIDENTIAL_HOME_FLAG = max(RESIDENTIAL_HOME_FLAG), MAX_MONTHLY_PATIENTS = max(MAX_MONTHLY_PATIENTS), - NUMBER_OF_BEDS = max(NUMBER_OF_BEDS), + #NUMBER_OF_BEDS = max(NUMBER_OF_BEDS), MONTHS = n_distinct(YEAR_MONTH), PATS = n_distinct(NHS_NO), ITEMS = sum(ITEM_COUNT), @@ -65,7 +66,7 @@ df_merge = data %>% NURSING_HOME_FLAG = max(NURSING_HOME_FLAG), RESIDENTIAL_HOME_FLAG = max(RESIDENTIAL_HOME_FLAG), MAX_MONTHLY_PATIENTS = max(MAX_MONTHLY_PATIENTS), - NUMBER_OF_BEDS = 
max(NUMBER_OF_BEDS), + #NUMBER_OF_BEDS = max(NUMBER_OF_BEDS), MONTHS = n_distinct(YEAR_MONTH), PATS = n_distinct(NHS_NO), ITEMS = sum(ITEM_COUNT), diff --git a/data-raw/app/00_run_all.R b/data-raw/app/00_run_all.R index f79c3b7..8c23e20 100644 --- a/data-raw/app/00_run_all.R +++ b/data-raw/app/00_run_all.R @@ -8,13 +8,13 @@ source("data-raw/app/geo_data.R") keep_vars = c(ls(), 'keep_vars') # Run all scripts that generate an Rda file -tic(); source("data-raw/app/01_headline_figures_df.R"); toc() +tic(); source("data-raw/app/01_headline_figures_df.R"); toc() # 10 mins tic(); source("data-raw/app/02_patients_age_gender_df.R"); toc() -tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() -tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() -tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() -tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() -tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() -tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() +tic(); source("data-raw/app/03_patients_by_imd_df.R"); toc() # 10 mins +tic(); source("data-raw/app/04_metrics_by_ch_type_85_split_df.R"); toc() # 3 hours +tic(); source("data-raw/app/05_metrics_age_gender_df.R"); toc() # 30 mins +tic(); source("data-raw/app/06_metrics_by_geo_and_ch_flag_df.R"); toc() # 90 mins +tic(); source("data-raw/app/07_ch_flag_drug_df.R"); toc() # 30 mins +tic(); source("data-raw/app/08_geo_ch_flag_drug_df.R"); toc() # 90 mins Sys.time() diff --git a/data-raw/app/04_metrics_by_ch_type_85_split_df.R b/data-raw/app/04_metrics_by_ch_type_85_split_df.R index 45ed56f..4ef664f 100644 --- a/data-raw/app/04_metrics_by_ch_type_85_split_df.R +++ b/data-raw/app/04_metrics_by_ch_type_85_split_df.R @@ -1,6 +1,6 @@ # Initial setup ----------------------------------------------------------- -# Expected run time ~35 minutes @parallel 24 +# Expected run time ~35 minutes @parallel 36 library(dplyr) library(dbplyr) library(tidyr) diff --git 
a/data-raw/app/data_raw_helpers.R b/data-raw/app/data_raw_helpers.R index 330ca39..f8e8aea 100644 --- a/data-raw/app/data_raw_helpers.R +++ b/data-raw/app/data_raw_helpers.R @@ -5,7 +5,7 @@ #' @param first_grouping A vector of col names to do initial grouping by. #' @param second_grouping A vector of col names to do secondary grouping by. #' @param nest_cols A vector of col names to use nesting on when completing. -#' @param num_parallel By default, query will ask for 24 parallel, but can ask +#' @param num_parallel By default, query will ask for 36 parallel, but can ask #' for different value if you think it is needed. #' #' @return @@ -34,13 +34,13 @@ #' "SUB_GEOGRAPHY_CODE", #' "SUB_GEOGRAPHY_NAME" #' ), -#' num_parallel = 32 +#' num_parallel = 36 #' ) get_metrics <- function(init_db, first_grouping, second_grouping, nest_cols = c(), - num_parallel = 24) { + num_parallel = 36) { # Collect data and calculate raw metrics init_db %>% group_by(across(all_of(first_grouping))) %>%