Skip to content

Commit

Permalink
Merge pull request #3 from nhsbsa-data-analytics/feature/create_data
Browse files Browse the repository at this point in the history
create tidy dummy data
  • Loading branch information
kygoffe authored Oct 9, 2023
2 parents 8711034 + 74a94a9 commit 9b0a135
Show file tree
Hide file tree
Showing 5 changed files with 279 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ rsconnect
# Excel files (including macros and templates) #
################################################
*.[xX][lL][sS][xXmMtT]?
data/*
9 changes: 7 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ Depends:
R (>= 4.0)
Imports:
config,
dplyr,
golem,
highcharter,
htmltools,
Expand All @@ -21,7 +20,13 @@ Imports:
rlang,
scrollytell,
shiny,
shinyjs
shinyjs,
dplyr (>= 1.1.3),
dbplyr (>= 2.3.3),
forcats (>= 1.0.0),
futile.logger (>= 1.4.3),
stringr,
formatR
Suggests:
pkgload,
testthat (>= 3.0.0),
Expand Down
16 changes: 16 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#' NHSBSA employee headcount
#'
#' A dataset containing NHSBSA employee headcount
#' since financial year 2017/18, split by gender, AFC band
#' and FTE (full time or part time)
#'
#' @format A data frame
#' \describe{
#' \item{FINANCIAL_YEAR}{Financial year (e.g. 2017/18), taken from the
#' 01-MAR record of the year in which it ends, factor}
#' \item{GENDER}{Male or Female, character}
#' \item{PAY_GRADE_NAME}{AFC band, factor}
#' \item{FTE_GROUP}{employee full time or part time info}
#' \item{HEADCOUNT}{Number of employees}
#' ...
#' }
"headcount"
202 changes: 202 additions & 0 deletions R/headcount_data_class.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
#' @title Tidy data set for the first two headcount related graphs.
#'
#' @description \code{headcount_data} is the class used for the creation of
#' the first two headcount related figures.
#'
#' @details The \code{headcount_data} class expects a \code{data.frame} with at
#' least five columns: FINANCIAL_YEAR, GENDER, PAY_GRADE_NAME, FTE_GROUP,
#' HEADCOUNT. Each row represents headcount aggregated by the other four
#' columns. FINANCIAL_YEAR must start with the four digit starting year of the
#' financial year (e.g. \code{"2022/23"}).
#'
#' Once initiated, the class has five slots: \code{df}: the basic
#' \code{data.frame}, \code{colnames}: a character vector containing the column
#' names from the \code{df}, \code{reporting_headcount}: a numeric vector
#' containing the total headcount of the latest (reporting) financial year,
#' \code{diffs}: a numeric vector containing the reporting financial year
#' headcount minus the previous financial year headcount, \code{ending_fy}: a
#' character vector containing the calendar year in which the reporting
#' financial year ends (e.g. \code{"2023"} for 2022/23).
#'
#' While the checks run the \code{futile.logger} threshold is set to
#' \code{log_level}; when the function exits (normally or through an error) the
#' threshold is reset to INFO and the appender to the console.
#'
#' @param x Input data frame.
#' @param log_level The severity level at which log messages are written from
#' least to most serious: TRACE, DEBUG, INFO, WARN, ERROR, FATAL. Default
#' level is WARN. See \code{?flog.threshold()} for additional details.
#' @param eda If TRUE a bar plot of total headcount per financial year is
#' drawn for a human to check.
#'
#' @return A list of class \code{headcount_data} with elements \code{df},
#' \code{colnames}, \code{reporting_headcount}, \code{diffs} and
#' \code{ending_fy}. An error is raised if \code{x} is not a data.frame, has no
#' rows, lacks one of the required columns, contains missing values, or has
#' FINANCIAL_YEAR values that do not start with a four digit year.
#'
#' @examples
#'
#' library(nhsbsaGPG)
#'
#' df <- headcount_data(headcount)
#'
#' @export


headcount_data <- function(x, log_level = futile.logger::WARN,
                           eda = FALSE) {
  # Set logger severity threshold, defaults to WARN
  futile.logger::flog.threshold(log_level)

  # Reset the logger to the package defaults (INFO threshold, console
  # appender) however the function exits, so a failed check does not leave
  # the global logger in a modified state
  on.exit(
    {
      futile.logger::flog.threshold(futile.logger::INFO)
      futile.logger::flog.appender(futile.logger::appender.console())
    },
    add = TRUE
  )

  # Checks
  futile.logger::flog.info("Initiating HEADCOUNT_data class.
\n\nExpects a data.frame with at
least five columns: FINANCIAL_YEAR, gender,
PAY_GRADE_NAME, FTE_GROUP and HEADCOUNT.
Each row represents an aggregated headcount
from four columns.
This class is given by ?headcount_data().")

  # Integrity checks on incoming data ----

  # Check the structure of the data is as expected: data.frame containing no
  # missing values and at least five columns, containing FINANCIAL_YEAR,
  # GENDER, PAY_GRADE_NAME, FTE_GROUP and HEADCOUNT.

  futile.logger::flog.info("\n*** Running integrity checks on input dataframe (x):")
  futile.logger::flog.debug("\nChecking input is properly formatted...")

  futile.logger::flog.debug("Checking x is a data.frame...")
  if (!is.data.frame(x)) {
    futile.logger::flog.error("x must be a data.frame",
      x,
      capture = TRUE
    )
    # Logging alone let execution continue and fail later with a misleading
    # "must contain ... column" error; fail here with the real reason
    stop("x must be a data.frame")
  }

  futile.logger::flog.debug("Checking x has correct columns...")
  if (length(colnames(x)) < 5) {
    # Only logged: with fewer than five columns at least one required column
    # is missing, so the per-column check below raises the specific error
    futile.logger::flog.error("x must have at least five columns:
FINANCIAL_YEAR,
GENDER, PAY_GRADE_NAME,
FTE_GROUP, HEADCOUNT")
  }

  required_cols <- c(
    "FINANCIAL_YEAR", "GENDER", "PAY_GRADE_NAME", "FTE_GROUP", "HEADCOUNT"
  )
  for (col in required_cols) {
    futile.logger::flog.debug("Checking x contains a %s column...", col)
    if (!col %in% colnames(x)) {
      stop("x must contain ", col, " column")
    }
  }

  futile.logger::flog.debug("Checking x does not contain missing values...")
  if (anyNA(x)) stop("x cannot contain any missing values")

  futile.logger::flog.debug("Checking for the correct number of rows...")
  if (nrow(x) == 0) {
    # No latest financial year can be derived from an empty data frame
    stop("x must contain at least one row")
  }
  if (nrow(x) < 260) {
    futile.logger::flog.warn("x does not appear to be well formed. nrow(x) should be
greater than 260 as of 2022/23 report.")
  }

  futile.logger::flog.info("...passed")

  # Check sensible range for year

  futile.logger::flog.debug("Checking beginning financial years in a sensible
range e.g.(2017:2022)...")

  # Starting calendar year of each row's financial year ("2022/23" -> 2022).
  # as.character() makes the factor handling explicit; coercion warnings are
  # silenced because unparseable values are reported by the stop() below
  fy_start <- suppressWarnings(
    as.numeric(stringr::str_sub(as.character(x$FINANCIAL_YEAR), 1, 4))
  )
  if (anyNA(fy_start)) {
    stop("FINANCIAL_YEAR must start with a four digit year, e.g. 2022/23")
  }

  if (any(fy_start < 2017)) {
    futile.logger::flog.warn("The dates should start from
2017/18 financial year. Please check data-raw script.")
  }

  futile.logger::flog.info("...passed")

  # Message required to pass a test
  message("Checks completed successfully:
object of 'headcount_data' class produced!")

  # EDA
  # some people like to eyeball stuff
  # number of HEADCOUNT per financial year
  if (isTRUE(eda)) {
    fy_headcount <- stats::aggregate(HEADCOUNT ~ FINANCIAL_YEAR, x, sum)
    # Vertical bars: financial years run along x, headcount is the bar height
    graphics::barplot(fy_headcount$HEADCOUNT,
      names.arg = fy_headcount$FINANCIAL_YEAR,
      las = 2,
      xlab = "Financial Year",
      ylab = "Headcount"
    )
  }

  # Calculate the latest and previous years
  # These values are required to add to the interactive document
  start_latest_year <- max(fy_start)
  start_prev_year <- start_latest_year - 1

  # Label a financial year from its starting calendar year (2022 -> "2022/23");
  # the two digit suffix is zero padded and wraps at the century, so 2008
  # gives "2008/09" and 2099 gives "2099/00"
  format_fy <- function(start_year) {
    sprintf("%d/%02d", start_year, (start_year + 1) %% 100)
  }

  # Financial year of interest for the report and the one before it
  latest_fy <- format_fy(start_latest_year)
  previous_fy <- format_fy(start_prev_year)

  # First aggregate by financial year. The verbs are namespaced so that
  # stats::filter() is never picked up instead of dplyr::filter()
  fy_totals <- x |>
    dplyr::filter(FINANCIAL_YEAR %in% c(latest_fy, previous_fy)) |>
    dplyr::group_by(FINANCIAL_YEAR) |>
    dplyr::summarise(TOTAL_HEADCOUNT = sum(HEADCOUNT, na.rm = TRUE)) |>
    dplyr::arrange(FINANCIAL_YEAR)

  # Extract the values
  reporting_headcount <-
    fy_totals$TOTAL_HEADCOUNT[fy_totals$FINANCIAL_YEAR == latest_fy]
  previous_reporting_headcount <-
    fy_totals$TOTAL_HEADCOUNT[fy_totals$FINANCIAL_YEAR == previous_fy]

  # Zero length when x holds no rows for the previous financial year
  diffs <- reporting_headcount - previous_reporting_headcount

  # Calendar year in which the reporting financial year ends
  ending_fy <- as.character(start_latest_year + 1)

  # Define the class here ----

  structure(
    list(
      df = x,
      colnames = colnames(x),
      reporting_headcount = reporting_headcount,
      diffs = diffs,
      ending_fy = ending_fy
    ),
    class = "headcount_data"
  )
}



53 changes: 53 additions & 0 deletions data-raw/01_headcount_by_gender_afc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# This is a dummy tidy dataset, but it will include
# mean and median pay per AFC band to create one file

# This dummy data includes maternity leave, sick leave etc.
# Therefore, headcounts are slightly higher than the reported
# figure

# Library

library(dplyr)
library(dbplyr)

# Set up connection to DALP
con <- nhsbsaR::con_nhsbsa(database = "DALP")

# Create a lazy table from cleaned employee table in DALP
data_db <- con |>
  tbl(
    from = in_schema(
      "DALL_REF",
      "EMPLOYEE_DASHBOARD_COMBINED_EMPLOYMENT_DATA"
    )
  )

# Summary headcount table of Financial Year, Gender, AFC band

headcount <- data_db |>
  # Keep the 01-MAR records whose two digit year is 18 to 23
  filter(
    substr(ESR_MONTH, 1, 6) == "01-MAR",
    as.numeric(substr(ESR_MONTH, 8, 9)) %in% c(18, 19, 20, 21, 22, 23)
  ) |>
  # Label each record with the financial year ending in that year
  mutate(
    FINANCIAL_YEAR = case_when(
      as.numeric(substr(ESR_MONTH, 8, 9)) == 18 ~ "2017/18",
      as.numeric(substr(ESR_MONTH, 8, 9)) == 19 ~ "2018/19",
      as.numeric(substr(ESR_MONTH, 8, 9)) == 20 ~ "2019/20",
      as.numeric(substr(ESR_MONTH, 8, 9)) == 21 ~ "2020/21",
      as.numeric(substr(ESR_MONTH, 8, 9)) == 22 ~ "2021/22",
      as.numeric(substr(ESR_MONTH, 8, 9)) == 23 ~ "2022/23",
      TRUE ~ "unknown"
    )
  ) |>
  group_by(FINANCIAL_YEAR, GENDER, PAY_GRADE_NAME, FTE_GROUP) |>
  summarise(HEADCOUNT = sum(HEADCOUNT, na.rm = TRUE)) |>
  ungroup() |>
  arrange(FINANCIAL_YEAR, GENDER, PAY_GRADE_NAME, FTE_GROUP) |>
  collect() |>
  # In case we want to report by year (keep it as factor)
  mutate(
    FINANCIAL_YEAR = factor(FINANCIAL_YEAR,
      levels = unique(FINANCIAL_YEAR)
    ),
    PAY_GRADE_NAME = factor(PAY_GRADE_NAME)
  )


# Add to data
usethis::use_data(headcount, overwrite = TRUE)

DBI::dbDisconnect(con)
# Remove only the objects this script created rather than wiping the whole
# workspace of whoever runs it
rm(con, data_db, headcount)
gc()

0 comments on commit 9b0a135

Please sign in to comment.