From 74a94a93b2055001d28709c4b2031a2025a0d3ef Mon Sep 17 00:00:00 2001
From: Kayoung Goffe
Date: Mon, 9 Oct 2023 21:21:38 +0100
Subject: [PATCH] create tidy dummy data

---
 .gitignore                            |   1 +
 DESCRIPTION                           |  11 +-
 R/data.R                              |  15 ++
 R/headcount_data_class.R              | 200 ++++++++++++++++++++++++++
 data-raw/01_headcount_by_gender_afc.R |  54 +++++++
 5 files changed, 279 insertions(+), 2 deletions(-)
 create mode 100644 R/data.R
 create mode 100644 R/headcount_data_class.R
 create mode 100644 data-raw/01_headcount_by_gender_afc.R

diff --git a/.gitignore b/.gitignore
index 15d99ad..dc34566 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ rsconnect
 # Excel files (including macros and templates) #
 ################################################
 *.[xX][lL][sS][xXmMtT]?
+data/*
diff --git a/DESCRIPTION b/DESCRIPTION
index 23d4473..a09b939 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -12,7 +12,6 @@ Depends:
     R (>= 4.0)
 Imports:
     config,
-    dplyr,
     golem,
     highcharter,
     htmltools,
@@ -21,7 +20,15 @@ Imports:
     rlang,
     scrollytell,
     shiny,
-    shinyjs
+    shinyjs,
+    dplyr (>= 1.1.3),
+    dbplyr (>= 2.3.3),
+    forcats (>= 1.0.0),
+    futile.logger (>= 1.4.3),
+    stringr,
+    formatR,
+    graphics,
+    stats
 Suggests:
     pkgload,
     testthat (>= 3.0.0),
diff --git a/R/data.R b/R/data.R
new file mode 100644
index 0000000..72da395
--- /dev/null
+++ b/R/data.R
@@ -0,0 +1,15 @@
+#' NHSBSA employee headcount
+#'
+#' A dataset containing NHSBSA employee headcount
+#' since financial year 2017/18, split by gender, AFC band
+#' and FTE (full time or part time).
+#'
+#' @format A data frame
+#' \describe{
+#'   \item{FINANCIAL_YEAR}{Financial year, e.g. 2022/23, factor}
+#'   \item{GENDER}{Male or Female, character}
+#'   \item{PAY_GRADE_NAME}{AFC band, factor}
+#'   \item{FTE_GROUP}{employee full time or part time info}
+#'   \item{HEADCOUNT}{Number of employees}
+#' }
+"headcount"
diff --git a/R/headcount_data_class.R b/R/headcount_data_class.R
new file mode 100644
index 0000000..7884bf2
--- /dev/null
+++ b/R/headcount_data_class.R
@@ -0,0 +1,200 @@
+#' @title Tidy data set for the first two headcount related graphs.
+#'
+#' @description \code{headcount_data} is the class used for the creation of
+#' the first two headcount related figures.
+#'
+#' @details The \code{headcount_data} class expects a \code{data.frame} with at
+#' least five columns: FINANCIAL_YEAR, GENDER, PAY_GRADE_NAME, FTE_GROUP,
+#' HEADCOUNT. Each row represents the headcount aggregated by the other four
+#' columns.
+#'
+#' Once initiated, the class has five elements: \code{df}: the basic
+#' \code{data.frame}, \code{colnames}: a character vector containing the
+#' column names from the \code{df}, \code{reporting_headcount}: a numeric
+#' vector containing the reporting financial year's headcount, \code{diffs}:
+#' a numeric vector containing the reporting financial year headcount minus
+#' the previous financial year headcount, \code{ending_fy}: a character
+#' vector containing the calendar year in which the reporting period ends
+#' (e.g. "2023" for 2022/23).
+#'
+#' @param x Input data frame.
+#' @param log_level The severity level at which log messages are written from
+#' least to most serious: TRACE, DEBUG, INFO, WARN, ERROR, FATAL. Default
+#' level is WARN. See \code{?futile.logger::flog.threshold} for additional
+#' details.
+#' @param eda If TRUE a graphical data analysis is conducted for a human to
+#' check.
+#'
+#' @return An object of class \code{headcount_data}: a list with elements
+#' \code{df}, \code{colnames}, \code{reporting_headcount}, \code{diffs} and
+#' \code{ending_fy}. If an integrity check fails an error is raised and
+#' nothing is returned.
+#'
+#' @examples
+#'
+#' library(nhsbsaGPG)
+#'
+#' df <- headcount_data(headcount)
+#'
+#' @importFrom rlang .data
+#' @export
+headcount_data <- function(x, log_level = futile.logger::WARN,
+                           eda = FALSE) {
+  # Set logger severity threshold, defaults to WARN
+  futile.logger::flog.threshold(log_level)
+  # Restore the package defaults (INFO threshold, console appender) however
+  # the function exits, including when an integrity check calls stop()
+  on.exit(
+    {
+      futile.logger::flog.threshold(futile.logger::INFO)
+      futile.logger::flog.appender(futile.logger::appender.console())
+    },
+    add = TRUE
+  )
+
+  futile.logger::flog.info("Initiating HEADCOUNT_data class.
+                           \n\nExpects a data.frame with at
+                           least five columns: FINANCIAL_YEAR, GENDER,
+                           PAY_GRADE_NAME, FTE_GROUP and HEADCOUNT.
+                           Each row represents an aggregated headcount
+                           from four columns.
+                           This class is given by ?headcount_data().")
+
+  # Integrity checks on incoming data ----
+
+  # Check the structure of the data is as expected: data.frame containing no
+  # missing values and at least five columns, containing FINANCIAL_YEAR,
+  # GENDER, PAY_GRADE_NAME, FTE_GROUP and HEADCOUNT.
+
+  futile.logger::flog.info(
+    "\n*** Running integrity checks on input dataframe (x):"
+  )
+  futile.logger::flog.debug("\nChecking input is properly formatted...")
+
+  futile.logger::flog.debug("Checking x is a data.frame...")
+  if (!is.data.frame(x)) {
+    futile.logger::flog.error("x must be a data.frame",
+      x,
+      capture = TRUE
+    )
+    # Logging alone would let the remaining checks run on a non data.frame
+    stop("x must be a data.frame")
+  }
+
+  futile.logger::flog.debug("Checking x has correct columns...")
+  if (length(colnames(x)) < 5) {
+    futile.logger::flog.error("x must have at least five columns:
+                              FINANCIAL_YEAR,
+                              GENDER, PAY_GRADE_NAME,
+                              FTE_GROUP, HEADCOUNT")
+  }
+
+  required_cols <- c(
+    "FINANCIAL_YEAR", "GENDER", "PAY_GRADE_NAME", "FTE_GROUP", "HEADCOUNT"
+  )
+  for (col in required_cols) {
+    futile.logger::flog.debug("Checking x contains a %s column...", col)
+    if (!col %in% colnames(x)) {
+      stop("x must contain ", col, " column")
+    }
+  }
+
+  futile.logger::flog.debug("Checking x does not contain missing values...")
+  if (anyNA(x)) {
+    stop("x cannot contain any missing values")
+  }
+
+  futile.logger::flog.debug("Checking for the correct number of rows...")
+  if (nrow(x) == 0) {
+    stop("x must contain at least one row")
+  }
+  if (nrow(x) < 260) {
+    futile.logger::flog.warn("x does not appear to be well formed. nrow(x) should be
+                             greater than 260 as of 2022/23 report.")
+  }
+
+  futile.logger::flog.info("...passed")
+
+  # Check sensible range for year
+
+  futile.logger::flog.debug("Checking beginning financial years in a sensible
+                            range e.g.(2017:2022)...")
+
+  # FINANCIAL_YEAR is expected to look like "2022/23": its first four
+  # characters are the calendar year in which the financial year starts
+  start_years <- suppressWarnings(
+    as.numeric(stringr::str_sub(as.character(x$FINANCIAL_YEAR), 1, 4))
+  )
+  if (anyNA(start_years)) {
+    stop("FINANCIAL_YEAR must start with a four digit year, e.g. 2022/23")
+  }
+
+  if (any(start_years < 2017)) {
+    futile.logger::flog.warn("The dates should start from
+                             2017/18 financial year. Please check data-raw script.")
+  }
+
+  futile.logger::flog.info("...passed")
+
+  # Message required to pass a test
+  message("Checks completed successfully:
+          object of 'headcount_data' class produced!")
+
+  # EDA
+  # some people like to eyeball stuff
+  # number of HEADCOUNT per financial year
+  if (isTRUE(eda)) {
+    yearly_headcount <- stats::aggregate(HEADCOUNT ~ FINANCIAL_YEAR, x, sum)
+    # Bars are vertical: financial years sit on the x axis, headcount on y
+    graphics::barplot(yearly_headcount$HEADCOUNT,
+      names.arg = yearly_headcount$FINANCIAL_YEAR,
+      las = 2,
+      xlab = "Financial Year",
+      ylab = "Headcount"
+    )
+  }
+
+  # Calculate the latest and previous years
+  # These values are required to add to the interactive document
+  start_latest_year <- max(start_years)
+  start_prev_year <- start_latest_year - 1
+  # Financial year of interest for the report; the two digit suffix is zero
+  # padded so that e.g. 2008 gives "2008/09" rather than "2008/9"
+  latest_fy <- sprintf(
+    "%d/%02d", start_latest_year, (start_latest_year + 1) %% 100
+  )
+  previous_fy <- sprintf(
+    "%d/%02d", start_prev_year, start_latest_year %% 100
+  )
+
+  # First aggregate by financial year
+  # dplyr verbs are namespaced so they do not rely on dplyr being attached
+  agg_data <- x |>
+    dplyr::filter(.data$FINANCIAL_YEAR %in% c(latest_fy, previous_fy)) |>
+    dplyr::group_by(.data$FINANCIAL_YEAR) |>
+    dplyr::summarise(TOTAL_HEADCOUNT = sum(.data$HEADCOUNT, na.rm = TRUE)) |>
+    dplyr::arrange(.data$FINANCIAL_YEAR)
+
+  # Extract the values
+  reporting_headcount <-
+    agg_data$TOTAL_HEADCOUNT[agg_data$FINANCIAL_YEAR == latest_fy]
+  previous_reporting_headcount <-
+    agg_data$TOTAL_HEADCOUNT[agg_data$FINANCIAL_YEAR == previous_fy]
+
+  diffs <- reporting_headcount - previous_reporting_headcount
+
+  ending_fy <- as.character(start_latest_year + 1)
+
+  # Define the class here ----
+
+  structure(
+    list(
+      df = x,
+      colnames = colnames(x),
+      reporting_headcount = reporting_headcount,
+      diffs = diffs,
+      ending_fy = ending_fy
+    ),
+    class = "headcount_data"
+  )
+}
diff --git a/data-raw/01_headcount_by_gender_afc.R b/data-raw/01_headcount_by_gender_afc.R
new file mode 100644
index 0000000..3c0c2e5
--- /dev/null
+++ b/data-raw/01_headcount_by_gender_afc.R
@@ -0,0 +1,54 @@
+# This is a dummy tidy dataset but it will include
+# mean and median pay per AFC band to create one file
+
+# This dummy data includes maternity leave, sick leave etc
+# Therefore, headcounts are slightly higher than the reported
+# figure
+
+# Library
+
+library(dplyr)
+library(dbplyr)
+
+# Set up connection to DALP
+con <- nhsbsaR::con_nhsbsa(database = "DALP")
+
+# Create a lazy table from cleaned employee table in DALP
+data_db <- con |>
+  tbl(
+    from = in_schema("DALL_REF", "EMPLOYEE_DASHBOARD_COMBINED_EMPLOYMENT_DATA")
+  )
+
+# Summary headcount table of Financial Year, Gender, AFC band
+
+headcount <- data_db |>
+  filter(
+    substr(ESR_MONTH, 1, 6) == "01-MAR",
+    as.numeric(substr(ESR_MONTH, 8, 9)) %in% c(18, 19, 20, 21, 22, 23)
+  ) |>
+  mutate(
+    FINANCIAL_YEAR = case_when(
+      as.numeric(substr(ESR_MONTH, 8, 9)) == 18 ~ "2017/18",
+      as.numeric(substr(ESR_MONTH, 8, 9)) == 19 ~ "2018/19",
+      as.numeric(substr(ESR_MONTH, 8, 9)) == 20 ~ "2019/20",
+      as.numeric(substr(ESR_MONTH, 8, 9)) == 21 ~ "2020/21",
+      as.numeric(substr(ESR_MONTH, 8, 9)) == 22 ~ "2021/22",
+      as.numeric(substr(ESR_MONTH, 8, 9)) == 23 ~ "2022/23",
+      TRUE ~ "unknown"
+    )
+  ) |>
+  group_by(FINANCIAL_YEAR, GENDER, PAY_GRADE_NAME, FTE_GROUP) |>
+  summarise(HEADCOUNT = sum(HEADCOUNT, na.rm = TRUE)) |>
+  ungroup() |>
+  arrange(FINANCIAL_YEAR, GENDER, PAY_GRADE_NAME, FTE_GROUP) |>
+  collect() |>
+  # In case we want to report by year (keep it as factor)
+  mutate(
+    FINANCIAL_YEAR = factor(FINANCIAL_YEAR, levels = unique(FINANCIAL_YEAR)),
+    PAY_GRADE_NAME = factor(PAY_GRADE_NAME)
+  )
+
+# Add to data
+usethis::use_data(headcount, overwrite = TRUE)
+
+DBI::dbDisconnect(con)