0_Load_and_clean.Rmd

---
title: "Load and clean the data"
author: "Leon Di Stefano"
date: "`r Sys.Date()`"
output: 
    html_document:
      keep_md: false
params:
  outcome_min: 28
  outcome_max: 35
---

```{r echo=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# Attach dependencies with library() so that a missing package stops the
# render immediately; require() would only warn and return FALSE.
library(haven) # Reading Stata files
library(here)  # Paths relative to the project root

# Declare where this document lives relative to the project root, then source
# the shared script common.R.
here::i_am(file.path("hcq_pooling_analysis", "0_Load_and_clean.Rmd"))
source(here("hcq_pooling_analysis", "common.R"))

# Outputs go to a directory named after the outcome window parameters,
# "<outcome_min>-<outcome_max>".
out_stub <- paste(params$outcome_min, params$outcome_max, sep = "-")
output_dir <- here("hcq_pooling_analysis", "output", out_stub)

if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}
```

Load the raw data:

```{r message=FALSE}
# Build the path to a file inside the raw Stata data directory.
raw_data_path <- function(file_name) {
  here(
    "hcq_pooling_analysis", "data", "2021-09-29_HCQ_Analysis_Datasets",
    "HCQ_Analysis_Datasets", file_name
  )
}

patients_raw <- read_dta(raw_data_path("8Site_PatientData.dta"))

# The baseline outcomes are derived from the full outcome table below instead
# of being read from a separate file:
# outcomes_baseline_raw <- read_dta(raw_data_path("8Sites_Outcome_BL.dta"))

outcomes_all_raw <- read_dta(raw_data_path("8Sites_Outcome_Data.dta"))

# Baseline = measurements taken on day 0 post enrollment.
outcomes_baseline_raw <- filter(outcomes_all_raw, days_post_enrollment == 0)
```

## Patient data

Load the assignment of study arms to HCQ vs. no HCQ:

```{r}
# NOTE(review): the file has a .csv extension but is parsed as tab-separated;
# confirm that study_arm_tbl.csv really is tab-delimited.
study_arm_path <-
  here("hcq_pooling_analysis", "resources", "study_arm_tbl.csv")
study_arm <- read_tsv(study_arm_path)
```

Join treatment arm classifications:

```{r}
# Prefix each patient id with its site id (separated by "__"), then attach the
# trial arm classification. No `by` is given, so the join uses every column
# the two tables share.
patients_1 <- left_join(
  mutate(patients_raw, patient_id = str_c(siteid, patient_id, sep = "__")),
  study_arm
)
```

Next, create factor versions of several variables. This will be an "everything" table which, for the sake of manageability, we'll subset later for, e.g., model fitting.

```{r}
# Site ids ordered from the largest to the smallest number of patients.
sites_in_desc_n_order <-
  names(sort(table(patients_raw$siteid), decreasing = TRUE))

# Add factor versions of the coded patient-level variables. Values outside
# the listed `levels` become NA in the resulting factor.
patients_2 <-
  patients_1 %>%
  mutate(
    # Nice labels for sites, in order of decreasing n
    siteid = factor(
      siteid,
      levels = sites_in_desc_n_order,
      labels = site_label_vec[sites_in_desc_n_order],
      ordered = TRUE
    ),
    # Missing sex stays NA (no explicit NA level is added).
    sex_fct = factor(
      sex,
      levels = 1:2,
      labels = c("male", "female")
    ),
    race_fct = factor(
      race,
      levels = 1:8,
      labels = c(
        "1 = American Indian/Alaska Native",
        "2 = Asian",
        "3 = Black/African American",
        "4 = Native Hawaiian/Pacific Islander",
        "5 = White",
        "6 = Multiple",
        "7 = Other/declined",
        "8 = Unknown/unavailable"
      )
    ),
    ethnic_fct = factor(
      ethnic,
      levels = 0:2,
      labels = c("not hispanic", "hispanic", "unknown")
    ),
    race_simplified_fct = factor(race_simplified),
    azithro_fct = factor(
      azithro,
      levels = c(FALSE, TRUE),
      labels = c("no azithro", "azithro")
    ),
    # Make sure no_HCQ is the reference level. The result has to be assigned
    # back to `treat`: an unnamed expression inside mutate() creates a new
    # column named after the expression and leaves `treat` untouched.
    treat = factor(treat, levels = c("no_HCQ", "HCQ"))
  )
```

Before writing this out, we'll append the outcome data, which we compute next.

## Outcome data

First, clean the outcome data.

```{r}
# Cleaning steps shared by the baseline and the full outcome tables.
decorate_outcomes <- function(outcomes_raw) {
  mutate(
    outcomes_raw,
    # Make unique patient ids (built from the raw site id, before it is
    # converted to a factor below)
    patient_id = str_c(siteid, patient_id, sep = "__"),
    # Decorate the outcome: codes 1-7 become an ordered factor
    covid_scale = factor(
      covid_scale,
      levels = 1:7,
      labels = niaid_levels,
      ordered = TRUE
    ),
    siteid = factor(siteid, siteid_vec)
  )
}

outcomes_baseline <- decorate_outcomes(outcomes_baseline_raw)
write_rds(outcomes_baseline, file.path(output_dir, "outcomes_baseline.rds"))

outcomes_all <- decorate_outcomes(outcomes_all_raw)
write_rds(outcomes_all, file.path(output_dir, "outcomes_all.rds"))
```

### Defining the outcome variable

We need to

-   "carry forward" death, from two sources: the outcome data, and the patient-level variables
-   define the primary outcome as the closest measurement to d28 within d28-d35 inclusive (i.e., earliest measurement in d28-d35)

Carrying forward death:

```{r}
# Bounds of the primary outcome window, in days post enrollment.
outcome_min <- params$outcome_min # inclusive
outcome_max <- params$outcome_max # inclusive

# Every day from enrollment (day 0) up to the latest day in the outcome data.
all_days_post_enrollment <- 0:max(outcomes_all$days_post_enrollment)

# One row per (patient, day) combination.
patient_time_grid <-
  expand_grid(patient_id           = unique(outcomes_all$patient_id),
              days_post_enrollment = all_days_post_enrollment)

# Put the recorded outcomes on the full patient-by-day grid and, within each
# patient, propagate a recorded death to every later day.
# NOTE(review): the literal "1 - death" is assumed to be one of the labels in
# niaid_levels (defined outside this file) - confirm.
outcomes_death_carried_forward <-
  patient_time_grid %>%
  # No `by` is given, so the join uses every column the two tables share.
  left_join(outcomes_all) %>%
  group_by(patient_id) %>%
  # Days must be in increasing order for the cumulative sum below.
  arrange(patient_id, days_post_enrollment) %>%
  mutate(
    # For diagnostic purposes
    covid_scale_raw = covid_scale,
    # TRUE from the first day with a recorded death onwards; days without a
    # measurement contribute 0 to the running count.
    died = cumsum(ifelse(is.na(covid_scale), 0, covid_scale == "1 - death")) > 0,
    # Overwrite with death once the patient has died; the result is a
    # character vector, not a factor.
    covid_scale = ifelse(died, "1 - death", as.character(covid_scale_raw)),
    # Death filled in on a day with no measurement or a different recorded value.
    imputed = died & (is.na(covid_scale_raw) | (covid_scale_raw != "1 - death")),
    # A non-death value recorded on a day when the patient already counts as dead.
    inconsistent = died & (!is.na(covid_scale_raw)) & (covid_scale_raw != "1 - death")
  ) %>%
  ungroup() %>%
  # Drop grid days that have neither a measurement nor a carried-forward death.
  filter(!is.na(covid_scale)) %>%
  arrange(patient_id, days_post_enrollment)
```

Sanity checks: 

1) Joining the outcomes onto the patient-by-day grid should not lose any recorded outcome:

```{r}
# Number of rows with a recorded outcome after joining onto the grid.
# `%>%` binds more tightly than `==`, so the count is computed first and the
# comparison is made on two plain numbers rather than on a whole data frame.
n_observed_on_grid <-
  patient_time_grid %>%
  left_join(outcomes_all) %>%
  filter(!is.na(covid_scale)) %>%
  nrow()

# Should be TRUE: every recorded outcome has a matching row in the grid.
n_observed_on_grid == sum(!is.na(outcomes_all$covid_scale))
```

2) Number of rows where a non-death outcome was recorded after an earlier recorded death:

```{r}
# Count the rows flagged as inconsistent.
nrow(filter(outcomes_death_carried_forward, inconsistent))
```

3) Number of patient-days on which death was carried forward (no death recorded on that day itself):

```{r}
# Count the rows where death was imputed.
outcomes_death_carried_forward %>%
  pull(imputed) %>%
  sum()
```

4) All patient-days with death as the outcome after carrying forward:

```{r}
# Show every row whose (carried-forward) outcome is death.
filter(outcomes_death_carried_forward, covid_scale == "1 - death")
```

Compare naive and "carried forward" versions of the primary outcome:

```{r}
# Primary outcome without carrying death forward: for each patient, the
# earliest recorded measurement inside the outcome window (bounds inclusive).
# Patients with no measurement in the window get no row.
outcomes_main_naive <-
  outcomes_all %>%
  filter(
    days_post_enrollment >= outcome_min &
      days_post_enrollment <= outcome_max
  ) %>%
  arrange(days_post_enrollment) %>%
  group_by(patient_id) %>%
  summarise(covid_scale = first(covid_scale))
```

```{r}
# Primary outcome with death carried forward: for each patient, the earliest
# available value inside the outcome window (bounds inclusive). Patients with
# no value in the window get no row.
outcomes_main_death_carried_forward <-
  outcomes_death_carried_forward %>%
  filter(
    days_post_enrollment >= outcome_min &
      days_post_enrollment <= outcome_max
  ) %>%
  arrange(days_post_enrollment) %>%
  group_by(patient_id) %>%
  summarise(covid_scale = first(covid_scale))
```

Comparison:

```{r}
# Number of patients with a primary outcome under each definition.
nrow(outcomes_main_naive)
nrow(outcomes_main_death_carried_forward)

# Number of rows that match on every column the two tables share.
outcomes_main_naive %>%
  inner_join(outcomes_main_death_carried_forward) %>%
  nrow()
```

```{r}
# Patients that have a naive primary outcome but no carried-forward one.
ids_only_in_naive <- setdiff(
  outcomes_main_naive$patient_id,
  outcomes_main_death_carried_forward$patient_id
)

# Show the rows for those patients from both tables.
bind_rows(outcomes_main_naive, outcomes_main_death_carried_forward) %>%
  filter(patient_id %in% ids_only_in_naive)
```

Finally, add the baseline outcomes and primary outcome data to the patient data, format properly, and write out:

```{r}
# Combine patient covariates with the baseline and primary outcomes. Neither
# join specifies `by`, so each uses every column the two tables share.
patients_3 <-
  patients_2 %>%
  # Add NIAID scale at baseline
  left_join(
    outcomes_baseline %>%
      select(-days_post_enrollment, -siteid) %>%
      rename(niaid_baseline = covid_scale)
  ) %>%
  # Add NIAID scale value for main outcome
  left_join(
    outcomes_main_death_carried_forward %>%
      rename(niaid_outcome = covid_scale)
  ) %>%
  mutate(
    # Baseline restricted to categories 2-5; other values become NA.
    niaid_baseline_fct = factor(
      as.numeric(niaid_baseline),
      levels = 2:5,
      labels = niaid_levels[2:5],
      ordered = TRUE
    ),
    # niaid_outcome holds the label strings at this point, so the levels are
    # given explicitly. Without `levels`, factor() would derive them from the
    # sorted values actually present, which fails when fewer than
    # length(niaid_levels) categories occur in the outcome window and depends
    # on alphabetical order agreeing with niaid_levels.
    niaid_outcome = factor(
      niaid_outcome,
      levels = niaid_levels,
      ordered = TRUE
    )
  )
```

Write this out:

```{r}
# Save the combined patient-level table to the output directory.
write_rds(patients_3, file.path(output_dir, "patients.rds"))
```


```{r}
# Print the R version and attached packages used for this render.
sessionInfo()
```


```{r}
# Print the time at which this chunk was run.
Sys.time()
```