From 0b969e6566939649d71b1cfa2ecad1920403aedf Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Tue, 9 Jan 2024 23:19:10 -0500 Subject: [PATCH 01/16] adding to summary --- modules/Data_Cleaning/Data_Cleaning.Rmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index e9a484e6b..2475125e9 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -364,7 +364,8 @@ Pay attention to your data and your `NA` values! ## Summary - `is.na()`,`any(is.na())`, `count()`, and functions from `naniar` - like `gg_miss_var()` can help determine if we have `NA` values + like `gg_miss_var()` and `miss_var_summary` can help determine if we have `NA` values +- `miss_var_which` can help you drop columns that have any missing values. - `filter()` automatically removes `NA` values - can't confirm or deny if condition is met (need `| is.na()` to keep them) - `drop_na()` can help you remove `NA` values from a variable or an From 96af35fbb0ebaf957c545ffb2f224ea51419c412 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Tue, 9 Jan 2024 23:22:05 -0500 Subject: [PATCH 02/16] making one of part 1 questions a bonus --- modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd | 4 +++- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd index d257a54aa..921954eec 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd @@ -53,7 +53,9 @@ Hint: You first need to `pull` out the vector version of this variable to use th ``` -4. What percentage of the `subType` variable is complete of `bike` ? Hint: use another `naniar` function. +**Bonus / Extra practice** + +What percentage of the `subType` variable is complete of `bike` ? 
Hint: use another `naniar` function. ```{r} diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 4e344da72..6bd5bbe05 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -53,7 +53,9 @@ gg_miss_var(bike) ``` -4. What percentage of the `subType` variable is complete of `bike` ? Hint: use another `naniar` function. +**Bonus / Extra practice** + +What percentage of the `subType` variable is complete of `bike` ? Hint: use another `naniar` function. ```{r} pull(bike, subType) %>% pct_complete() # this From 2189e265f9014cdb4055a03b0f947557bed58b2e Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Tue, 9 Jan 2024 23:31:48 -0500 Subject: [PATCH 03/16] no more gender --- .../Data_Cleaning/lab/Data_Cleaning_Lab.Rmd | 14 ++++---- .../lab/Data_Cleaning_Lab_Key.Rmd | 33 ++++++++++--------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd index 921954eec..57f35db42 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd @@ -70,11 +70,11 @@ Let's say we the data like so: ```{r} BloodType <- tibble( - gender = + weight_loss = c( - "M", "Male", "Female", "F", "M", - "Male", "Other", "M", "F", "Other", - "F", "Male", NA, "Male", "Female" + "Y", "No", "Yes", "y", "no", + "n", "No", "N", "yes", "Yes", + "No", "N", NA, "N", "Other" ), type = c( "A.-", "AB.+", "O.-", "O.+", "AB.-", @@ -93,13 +93,13 @@ BloodType There are some issues with this data that we need to figure out! -1. Determine how many `NA` values there are for `gender`. +1. Determine how many `NA` values there are for `weight_loss` (assume you know that`N` and `n` is for no). . ```{r} ``` -2. Recode the `gender` variable of the `BloodType` data so that it is consistent. Use `case_when()`. 
Don't forget to use quotes! +2. Recode the `weight_loss` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Don't forget to use quotes! ```{r} @@ -108,7 +108,7 @@ There are some issues with this data that we need to figure out! ``` -Check to see how many values `gender` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process. +Check to see how many values `weight_loss` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process. 3. ```{r} diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 6bd5bbe05..516c6682a 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -71,11 +71,11 @@ Let's say we the data like so: ```{r} BloodType <- tibble( - gender = + weight_loss = c( - "M", "Male", "Female", "F", "M", - "Male", "Other", "M", "F", "Other", - "F", "Male", NA, "Male", "Female" + "Y", "No", "Yes", "y", "no", + "n", "No", "N", "yes", "Yes", + "No", "N", NA, "N", "Other" ), type = c( "A.-", "AB.+", "O.-", "O.+", "AB.-", @@ -94,39 +94,40 @@ BloodType There are some issues with this data that we need to figure out! -1. Determine how many `NA` values there are for `gender`. +1. Determine how many `NA` values there are for `weight_loss` (assume you know that`N` and `n` is for no). ```{r} -count(BloodType, gender) # the simple way -sum(is.na(pull(BloodType, gender))) # another way +count(BloodType, weight_loss) # the simple way +sum(is.na(pull(BloodType, weight_loss))) # another way BloodType %>% # another way - pull(gender) %>% + pull(weight_loss) %>% is.na() %>% sum() ``` -2. Recode the `gender` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Don't forget to use quotes! +2. 
Recode the `weight_loss` variable of the `BloodType` data so that it is consistent. Use `case_when()`. Keep "Other" as "Other". Don't forget to use quotes! ```{r} BloodType <- BloodType %>% - mutate(gender = case_when( - gender %in% c("M", "m", "Male") ~ "Male", - gender %in% c("F", "female", "Female") ~ "Female", - gender %in% c("Other") ~ "Other" + mutate(weight_loss = case_when( + weight_loss %in% c("N", "n", "No", "no") ~ "No", + weight_loss %in% c("Y", "y", "Yes", "yes") ~ "Yes", + weight_loss %in% c("Other") ~ "Other" )) -count(BloodType, gender) +count(BloodType, weight_loss) ``` -Check to see how many values `gender` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process. +Check to see how many values `weight_loss` has for each category (hint: use `count`). It's good practice to regularly check your data throughout the data wrangling process. 3. ```{r} -BloodType %>% count(gender) +BloodType %>% count(wight_loss) ``` 4. Recode the `type` variable of the `BloodType` data to be consistent. Use `recode`. Hint: the inconsistency has to do with lower case `o` and capital `O`. Don't forget to use quotes! + ```{r} BloodType <- BloodType %>% mutate(type = recode(type, From 1c854b1412da37e11e27506c1aee13a32baa0e60 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Tue, 9 Jan 2024 23:33:03 -0500 Subject: [PATCH 04/16] remove word filter from lab --- modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd | 2 +- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd index 57f35db42..c616de34f 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd @@ -39,7 +39,7 @@ Hint: You first need to `pull` out the vector version of this variable to use th -2. 
Filter rows of bike, so that only rows remain that do NOT have missing values for the `route` variable, using `drop_na`. Assign this to the object `have_route.` +2. Clean rows of bike, so that only rows remain that do NOT have missing values for the `route` variable, using `drop_na`. Assign this to the object `have_route.` ```{r} diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 516c6682a..67b11dae3 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -40,7 +40,7 @@ bike %>% -2. Filter rows of bike, so that only rows remain that do NOT have missing values for the `route` variable, using `drop_na`. Assign this to the object `have_route.` +2. Clean rows of bike, so that only rows remain that do NOT have missing values for the `route` variable, using `drop_na`. Assign this to the object `have_route.` ```{r} have_rout <- bike %>% drop_na(route) From 9c1d0744479ea3bd5d7478020ea68c76e8cd6477 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Tue, 9 Jan 2024 23:38:48 -0500 Subject: [PATCH 05/16] fixing typo --- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 67b11dae3..9d3aac516 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -123,7 +123,7 @@ Check to see how many values `weight_loss` has for each category (hint: use `cou 3. ```{r} -BloodType %>% count(wight_loss) +BloodType %>% count(weight_loss) ``` 4. Recode the `type` variable of the `BloodType` data to be consistent. Use `recode`. Hint: the inconsistency has to do with lower case `o` and capital `O`. Don't forget to use quotes! 
From 382c94516a890facead98c26ec7abbe5a45f92da Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 00:02:41 -0500 Subject: [PATCH 06/16] treatment instead of gender --- modules/Data_Cleaning/Data_Cleaning.Rmd | 147 +++++++++++++----------- 1 file changed, 77 insertions(+), 70 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 2475125e9..f31e45839 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -388,17 +388,17 @@ Pay attention to your data and your `NA` values! set.seed(124) data_diet <- tibble(Diet = rep(c("A", "B", "B"), times = 4), - Gender = c("Male", - "m", + Treatment = c("Ginger", + "g", "Other", - "F", - "Female", - "M", - "f", + "peppermint", + "Peppermint", + "G", + "Mint", "O", - "Man", - "f", - "F", + "ginger", + "mint", + "Mint", "O"), Weight_start = sample(100:250, size = 12), Weight_change = sample(-10:20, size = 12)) @@ -418,12 +418,12 @@ This needs lots of recoding. ```{r} data_diet %>% - count(Gender) + count(Treatment) ``` ## `dplyr` can help!{.codesmall} -Using Excel to find all of the different ways `gender` has been coded, +Using Excel to find all of the different ways `Treatment` has been coded, could be hectic! In `dplyr` you can use the `recode` function. ::: {style="color: red;"} @@ -448,13 +448,14 @@ Need quotes for new values! Tolerates quotes for old values. 
```{r, eval = FALSE} data_diet %>% - mutate(Gender = recode(Gender, M = "Male", - m = "Male", - Man = "Male", - O = "Other", - f = "Female", - F = "Female")) %>% - count(Gender, Diet) + mutate(Treatment = recode(Treatment, G = "Ginger", + g = "Ginger", + ginger = "Ginger", + O = "Other", + Mint = "Peppermint", + mint = "Peppermint", + peppermint = "Peppermint")) %>% + count(Treatment, Diet) ``` ## `recode()` @@ -462,13 +463,14 @@ data_diet %>% ```{r, eval = TRUE} data_diet %>% - mutate(Gender = recode(Gender, M = "Male", - m = "Male", - Man = "Male", - O = "Other", - f = "Female", - F = "Female")) %>% - count(Gender) + mutate(Treatment = recode(Treatment, G = "Ginger", + g = "Ginger", + ginger = "Ginger", + O = "Other", + Mint = "Peppermint", + mint = "Peppermint", + peppermint = "Peppermint")) %>% + count(Treatment, Diet) ``` @@ -490,26 +492,28 @@ Need quotes for conditions and new values! ```{r} data_diet %>% - mutate(Gender = case_when(Gender == "M" ~ "Male", - Gender == "m" ~ "Male", - Gender == "Man" ~ "Male", - Gender == "O" ~ "Other", - Gender == "f" ~ "Female", - Gender == "F" ~ "Female")) %>% - count(Gender) + mutate(Treatment = case_when(Treatment == "G" ~ "Ginger", + Treatment == "g" ~ "Ginger", + Treatment == "ginger" ~ "Ginger", + Treatment == "O" ~ "Other", + Treatment == "Mint" ~ "Peppermint", + Treatment == "mint" ~ "Peppermint", + Treatment == "peppermint" ~ "Peppermint")) %>% + count(Treatment) ``` ## What happened?{.codesmall} -```{r} +```{r, eval = FALSE} data_diet %>% - mutate(Gender = case_when(Gender == "M" ~ "Male", - Gender == "m" ~ "Male", - Gender == "Man" ~ "Male", - Gender == "O" ~ "Other", - Gender == "f" ~ "Female", - Gender == "F" ~ "Female")) + mutate(Treatment = case_when(Treatment == "G" ~ "Ginger", + Treatment == "g" ~ "Ginger", + Treatment == "ginger" ~ "Ginger", + Treatment == "O" ~ "Other", + Treatment == "Mint" ~ "Peppermint", + Treatment == "mint" ~ "Peppermint", + Treatment == "peppermint" ~ "Peppermint")) ``` ## Original 
data @@ -540,14 +544,15 @@ or it can be the original values of the column ```{r} data_diet %>% - mutate(Gender = case_when(Gender == "M" ~ "Male", - Gender == "m" ~ "Male", - Gender == "Man" ~ "Male", - Gender == "O" ~ "Other", - Gender == "f" ~ "Female", - Gender == "F" ~ "Female", - TRUE ~ Gender)) %>% - count(Gender) + mutate(Treatment = case_when(Treatment == "G" ~ "Ginger", + Treatment == "g" ~ "Ginger", + Treatment == "ginger" ~ "Ginger", + Treatment == "O" ~ "Other", + Treatment == "Mint" ~ "Peppermint", + Treatment == "mint" ~ "Peppermint", + Treatment == "peppermint" ~ "Peppermint", + TRUE ~ Treatment)) %>% + count(Treatment) ``` @@ -555,10 +560,12 @@ data_diet %>% ```{r} data_diet %>% - mutate(Gender = case_when( - Gender %in% c("M", "male", "Man", "m", "Male") ~ "Male", - Gender %in% c("F", "Female", "f", "female") ~ "Female", - Gender %in% c("O", "Other") ~ "Other")) + mutate(Treatment = case_when( + Treatment %in% c("G", "g", "Ginger", "ginger") ~ "Ginger", + Treatment %in% c("Mint", "mint", "Peppermint", "peppermint") ~ "Peppermint", + Treatment %in% c("O", "Other") ~ "Other")) %>% + + count(Treatment) ``` @@ -682,13 +689,13 @@ str_sub(string = Effect, start = 1, end = 3) ```{r} head(data_diet, n = 4) data_diet %>% - filter(str_detect(string = Gender, - pattern = "M")) + filter(str_detect(string = Treatment, + pattern = "int")) ``` ## OK back to our original problem ```{r} -count(data_diet, Gender) +count(data_diet, Treatment) ``` ## Recode was nice but what if miss something? 
@@ -696,13 +703,13 @@ count(data_diet, Gender) ```{r, eval = FALSE} data_diet %>% - mutate(Gender = recode(Gender, M = "Male", - m = "Male", - Man = "Male", - O = "Other", - f = "Female", - F = "Female")) %>% - count(Gender, Diet) + mutate(Treatment = recode(Treatment, G = "Ginger", + g = "Ginger", + ginger = "Ginger", + O = "Other", + Mint = "Peppermint", + mint = "Peppermint", + peppermint = "Peppermint")) ``` ## `case_when()` was an improvement @@ -711,10 +718,10 @@ But we still might miss a strange value ```{r, eval = FALSE} data_diet %>% - mutate(Gender = case_when( - Gender %in% c("M", "male", "Man", "m", "Male") ~ "Male", - Gender %in% c("F", "Female", "f", "female") ~ "Female", - Gender %in% c("O", "Other") ~ "Other")) + mutate(Treatment = case_when( + Treatment %in% c("G", "g", "Ginger", "ginger") ~ "Ginger", + Treatment %in% c("Mint", "mint", "Peppermint", "peppermint") ~ "Peppermint", + Treatment %in% c("O", "Other") ~ "Other")) ``` ## `case_when()` improved with `stringr` @@ -723,11 +730,11 @@ data_diet %>% ```{r} data_diet %>% - mutate(Gender = case_when( - str_detect(string = Gender, pattern = "^m|^M") ~ "Male", - str_detect(string = Gender, pattern = "^f|^F") ~ "Female", - str_detect(string = Gender, pattern = "^o|^O") ~ "Other")) %>% - count(Gender) + mutate(Treatment = case_when( + str_detect(string = Treatment, pattern = "^g|^G") ~ "Ginger", + str_detect(string = Treatment, pattern = "int") ~ "Peppermint", + str_detect(string = Treatment, pattern = "^o|^O") ~ "Other")) %>% + count(Treatment) ``` This is a more robust solution! It will catch typos as long as first letter is correct. 
From fd0c7095bc3349bece4a65b6080c4c0aeca7f803 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 00:22:14 -0500 Subject: [PATCH 07/16] adding all to summary --- modules/Data_Cleaning/Data_Cleaning.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index f31e45839..6cf344bf6 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -363,8 +363,7 @@ Pay attention to your data and your `NA` values! ## Summary -- `is.na()`,`any(is.na())`, `count()`, and functions from `naniar` - like `gg_miss_var()` and `miss_var_summary` can help determine if we have `NA` values +- `is.na()`,`any(is.na())`, `all(is.na())`,`count()`, and functions from `naniar` like `gg_miss_var()` and `miss_var_summary` can help determine if we have `NA` values - `miss_var_which` can help you drop columns that have any missing values. - `filter()` automatically removes `NA` values - can't confirm or deny if condition is met (need `| is.na()` to keep them) From 41f44bd36cad1116e46af6c21b75a68a53200266 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 00:58:12 -0500 Subject: [PATCH 08/16] adding some emo package stuff --- modules/Data_Cleaning/Data_Cleaning.Rmd | 32 ++++++++----------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 6cf344bf6..dce470929 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -4,19 +4,19 @@ output: ioslides_presentation: css: ../../docs/styles.css widescreen: yes -editor_options: - markdown: - wrap: 72 --- -```{r, echo = FALSE, message = FALSE} +```{r, echo = FALSE, message = FALSE, results='hide'} library(knitr) opts_chunk$set(comment = "") library(readr) suppressPackageStartupMessages(library(dplyr)) library(tidyverse) 
+devtools::install_github("hadley/emo") +library(emo) ``` + ## Recap on summarization - `summary(x)`: quantile information @@ -85,29 +85,17 @@ is.nan(test) is.infinite(test) ``` -## Useful checking functions {.small} -Do we have any `NA`s? (`any` can help) +## Useful checking functions + +`any()` can help you check if there are any `NA` values in a vector ```{r} -A <- c(1, 2, 3, NA) -B <- c(1, 2, 3, 4) -any(is.na(A)) # are there any NAs - YES/TRUE -any(is.na(B)) # are there any NAs- NO/FALSE +test +any(is.na(test)) ``` -## Useful checking functions {.small} - -Are all the values `NA`? (`all` can help) - -```{r isna} -A <- c(1, 2, 3, NA) -B <- c(1, 2, 3, 4) -all(is.na(A)) # are there any NAs - YES/TRUE -all(is.na(B)) # are there any NAs- NO/FALSE -``` - ## Finding `NA` values with `count()`{.codesmall} ::: {style="color: red;"} @@ -123,7 +111,7 @@ Check if rare values make sense. ```{r, message=FALSE} library(jhur) -bike <- read_bike() +bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv") bike %>% count(subType) ``` From a3109a5e1be607c888e71378259e02583693442e Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 01:17:01 -0500 Subject: [PATCH 09/16] adding cleaning data to github --- data/cleaning_diet_data.xlsx | Bin 0 -> 5366 bytes modules/Data_Cleaning/Data_Cleaning.Rmd | 4 ++++ 2 files changed, 4 insertions(+) create mode 100644 data/cleaning_diet_data.xlsx diff --git a/data/cleaning_diet_data.xlsx b/data/cleaning_diet_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..0af7a01603e23a0d2b04322a8e79a194d387399a GIT binary patch literal 5366 zcmZ`-2UL?=)(ug5M@s0u_g<6|dNC9M=_*2i&_eGZ0>YysO+k7?;E;stK5)J@>e;ok0 zf&Iox4ess(b@#b%66gW-wh;+%bFD}laPMTJ&^S6L`hb$)vN18G;f+_ybfewCHw5+Y zfhZ(((pp7PY1$Pdx6<7c24@@A$s#`4&&i=8qGXO!$-BZOcN9RI>)e;mn`y~&*!kh< z#4NfkIIC&9ADvLK;X<;(LV*x!)?#%VHH2Xlfs$Ey3WLUp>_@0cP{A4-1}`$WERuJ} z#Bj5{2jN&cHbNO}vA(6G4UEZAJnYCc^`b%4SQ=FhWXQJib!D=7IlgM2)_ngFC_+v( 
z-MzsbFqr!&2Qm01r601#vM`(992Z&8t7 z&x+J`{Z26w6-S?ySHbFsg6I;<7kCajympfx^BH}~jHYPSjg(MM%pXB*{-Em;^ zbnO zWn0MRXnEGpXMdYH_Hp1CCMVi#tw6vKy>4@Nvgu6m)k|%3K|U(SiPE@Ce|f;N)t70; zhH!m2K~J^rvVkf4Nd%|A1)<%U#0ak}It1`{MZTyrBF3>w2w@dr!0rLAqQ6D)hkLm^ zgu`8arQmOYo({(W#Yti?N*aBkyplv54|@iXi>3!Sy1e`Nj?*Ld{<(E^e2s#C1?*6V z5uWUU?{2Q3U$e%|2EHoj^)g(2Q>8HJ^cmBlM4NA(@cK4}83u~Awb8$8^)0c8*}bVv z0LF?r?UB9iop4ESI4?e4GD0D1isLMc7xZLn zv$5u2W9vvopCza5J}dbwl}<$V=M<;`B)*Wi2wl5Z!vz!;Ozv3`gn`}V-x?SM@FU<{Dl~G z&%zNu1%?phMOUCpS39FN?!1NS>0`%#O9YzIuL(V+znzmlZ4?MTwhjrqIAxeS6DD3K z1@XSKY;yYEL}o%bmnLALn9s@$+W? zma)Fe6+^WzUQv+xG95`pJY}EN1>#;C|jN3%j zwSW@|bQ4 zEuEExuK*agDOezY_I6OLkifDq4rQQfhm@P>2GYICv03|*24^Pw;pXAk!dAdV?}lT^ z_pMjRG$u3p_XbBrY?c`+RO){;W#M1Yi z$^u+EYH%HSkcO1Up%PG5$e{fVhzz}|192F2gGHT=9%Ynmvi_8ox8$73C?w|gYvKCn zM!dUJ`wA}waMnXV47u~Pm$uqL#JZOC0>?Ow{T4`~&-QgN zrGqD&>2+Ru1a*;oPp8-^UStaE3Dgheck_^+F|MaXjH z%ym3t(dR$zqZ`lDwDyvSP9DhDdYvR(n|Q|s*DaHi&W=e919e~G{5`89LC)OT*dD`! z&1~{NS^clWooFoa7j7IgKuAPcTP07|SPl?Sqe`;#Lm>i8di=%5gJ1%-UO)`Xb z=hJN>r}H2}0^6`)vw;b4^_!xOEqRI+x6=+r%hS?u{_V5K6c!CK(^%77x`+>TlUUcwOmI!)@a2JwQFWmAbk& z`q9tz9s8Kf$N7D=reBB2goqKS*`+RD=`_lpG3?d20Dv*$@5YX;t3FOpH|X!@KdQ)< z1tfV?6V$RVsM!*^@_}rM#^PNc54xknS=P`q=ktx=-p_U)Z~OICMA#MUn!?8Qisyq$ zr#T*v5uZ!X+^IEL7CkuBO5Jrp+S|US1(U@(p1ogS4eM#}Is^}**(p=^irnO%In#2) z4qwZTa@!ore^d=+FY};!(0yYBRibWbRwrWJ{z8C59IW!P7cz_DC?0{#WGcDZ1;)PO zdK}Ct^6S8GKo8$G4e| z8`N_-dhqOcPoEl3L4nKhD!eZpnLx$pcQ6%)bfQ&V>zZ3|#<5@jKgjwxX4s#CqY% z_C6`ikT>MyobJ)1Q~sZu8~&oKkD%Rp%$KJIdFMYipE=}@JQi&U+v};}6>T{=8)ju; zT$}-t`yPjPs~1|W5^U{*TqOzE)I4S4z)`R#K&@~oK(IhL01gqCTv90d_Pzfh%l76>D}EuVqgvx3{>k$c#F}mIZxDr&)nQL7i-pQ^ z&`2MF)EQ&A*omM=$ImIQO)2UMhRqeXET$$?H)S=rQ4hGro( z!F0ZIlGO|BF{WJ9*By8=qfBQ5NDZiy5*b#WZ}h$}*WlqMFX^a~vljTvM;oNo9Ig#3 zY!EJ?@r6J5s^0w;gM?XY5NY)AtU?AGl4h4y+@n^;N+ls2%%@M8)Z!k#>&Y-8yjPLp zO6u9x*YB}Vte#u(27?xp;MMkX9ja>TL$O5tL9QXnF!POC>>ZIMt_KyXxvaeWpgc@qjU-!s z_4F5T^&EOQ+FRyCAiXhPv-^RKZQ=l&caT5aMc4aQjTFWNhFXJ^;ziX&wvN7`D`dA5 
z;{>lAMYIbEt+Q~@FNLRzdnH86B}>1T{}e!59FHiw0oY@C`~b)c4!VzIZK*tu%pQBq+IGtZCA+118+-(+6+81^&-`BW+6m` zK&y1HZ>rEiRj~V7%)WRVI&osO@RQetn(+6u0&WE5?s!~o7xH}%Z`@yzd~&L4GL)yF z1Hbt1M5R%VnxQBP!vgoLfbo9O*Ndxy*+>O3Jx=Q%MVYjxz?b)jeS6KagMJXrEuho8 z<+GQYTTAS-i0xcBd&y12@CXu+Om0@zrGKf>>2iKM!X+VV5JD4!E$ju@Gn?Tkoz#lN-OFuIEdkg?Sv-Nlu* zS~Wyv(}``Xcd*ut^mkkJ_6c-_djGm1Hk&NO=ZH~+VeV7z&3n@X&C(q5J$a&PMdnT% zn#dq6ro%-~*W7x+qd8~eB>z?I{@|djKSaLZD=GA3TYp0y5=}ChB7JoRe5dZ#b`?yrGAFRZo;#;-Of}qT=$7yiUlRp72>Na}m3qOD?T;`6iXJ z96tkVgcR4*l1H9X0g34T1jE4_6rw%CQP6PhA|n^q(1PEf|2^sfvs&RR1=@8{Q4C8hg)L~33aUnZm)bh!;>{BAkF6|o`ZI3?EHdHy)JSv15H|<{v zD}*X%$J&OUIN3ciIt7+XTDl9zL!iyR-!iN_S;nsS))N>vq3ZqX4uT}>n4z{`h%O{& z+3L#?3^e%?kDk+e%8Wv-<7qTB=ni>^J*AK|oQp77+B1&JlthNZmF>efL$qd9@n?!c zdO@uw9)YC&vVXrs>Et?jb7P}zh`mHnVxw&jw=?vDdw7f5!M&irBA%Q2(HVP5)~P}YW}j41@33g|C$EgC1k&N}z=Gm@U!kU7O=O_!D~ z#0ArXY>OpcFoX)NpK$i8u4?X0FqD2`0cK14XL#H;Cx_1~>==Q2sd}&TSSO;y6{0;U ztkAcp`)pI|QTO(r>$bKh#lu^0tf?fr7&xBY@(Ka0xw94CxD$u75f(CYt*8whfp_Ct z32ql1{CZWVlUkLvd~IAXK4PLquHGL^+FiLB=RRU(92VC~TZK6qh3sF&geK|E{FO;Uzl0dqcffY6ht;kdi#b-pz5bUXDUytl2&eBD-&+uk(v_Ko|g2sG8-&>0?(5s&4?~uS9u(Y_|-pa7dV$xcZAwgOE z)#=k?9#%;yR+x)-X0t#P%PafbI8FJ7_%j8dccj`zEyu^vq=14&JMR=@jL8dP3zBFq z{lt*#IzOpB?RtSq6M{4wbjuHQZ;8DnTxnbv4BKHg_?g*!;ASWyoRa*lywG}7LD)oVco_WXWH<(2B>qImV8LU(=jGykxJ4My zP!9*2#MbD#6vM%P-__0yuX2j~UEW;ML*8AK+u`FZ|!5imN3$NM+Szg*Q-@KrzX2Rw!U5AaoYaFyU{yZJ-VgzY}q z|L~{zT!mh(U4NiuSU>pxD%n-|)nobxevUo0|Aznb&>HIDV$}cu2(fP^Y?KPAetr5s DFEl6K literal 0 HcmV?d00001 diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index dce470929..4556c4980 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -393,6 +393,10 @@ data_diet <- tibble(Diet = rep(c("A", "B", "B"), ``` +```{r, include = FALSE, echo = FALSE} +#writexl::write_xlsx(data_diet, here::here("data", "cleaning_diet_data.xlsx") +``` + ## Say we have some data about samples in a diet study: ```{r} From 
4e49a64343dc100a2f1358d43cf341b2bde919ff Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 01:24:47 -0500 Subject: [PATCH 10/16] add reading in from excel --- modules/Data_Cleaning/Data_Cleaning.Rmd | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 4556c4980..fee5b0d85 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -396,6 +396,15 @@ data_diet <- tibble(Diet = rep(c("A", "B", "B"), ```{r, include = FALSE, echo = FALSE} #writexl::write_xlsx(data_diet, here::here("data", "cleaning_diet_data.xlsx") ``` +## Reading in the data if it were an excel sheet +Data is also here: + +http://jhudatascience.org/intro_to_r/data/cleaning_diet_data.xlsx + +```{r} +library(readxl) +data_diet<- read_excel(here::here("data", "cleaning_diet_data.xlsx")) +``` ## Say we have some data about samples in a diet study: From 723726e912e52f18fc8e73a606bede56db333c4d Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 01:28:26 -0500 Subject: [PATCH 11/16] adding bike readin of data --- modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd | 2 +- modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd index c616de34f..ae923a7d4 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd @@ -22,7 +22,7 @@ library(tidyverse) library(broom) library(naniar) -bike <- read_bike() +bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv") ``` diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index 9d3aac516..afef625e2 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ 
b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -22,7 +22,7 @@ library(tidyverse) library(broom) library(naniar) -bike <- read_bike() +bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv") ``` # Part 1 From 7e4dd149626ccee4347dbe2411e3619488422e9a Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 01:30:42 -0500 Subject: [PATCH 12/16] remove emojis from emo --- modules/Data_Cleaning/Data_Cleaning.Rmd | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index fee5b0d85..59db4c50c 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -12,8 +12,7 @@ opts_chunk$set(comment = "") library(readr) suppressPackageStartupMessages(library(dplyr)) library(tidyverse) -devtools::install_github("hadley/emo") -library(emo) + ``` @@ -50,7 +49,7 @@ inaccuracies, or recoding it in a way that makes it more manageable. ::: {style="color: red;"} -`r emo::ji("warning")` MOST IMPORTANT RULE - LOOK `r emo::ji("eyes")` AT YOUR DATA! `r emo::ji("warning")` + MOST IMPORTANT RULE - LOOK AT YOUR DATA! ::: @@ -299,9 +298,9 @@ bike %>% ::: -`r emo::ji("warning")` Sometimes removing `NA` values leads to distorted math - be careful! + Sometimes removing `NA` values leads to distorted math - be careful! -`r emo::ji("warning")` Think about what your `NA` means for your data (are you sure ?). + Think about what your `NA` means for your data (are you sure ?). - Is an `NA` for values so low they could not be reported? @@ -317,13 +316,13 @@ survey reports 0 if student has tried cigarettes but did not smoke that week -`r emo::ji("warning")` You might want to keep the `NA` values so that you know the original sample size. +You might want to keep the `NA` values so that you know the original sample size. 
## Word of caution {.codesmall} -`r emo::ji("warning")` Calculating percentages will give you a different result depending on your choice to include NA values.! + Calculating percentages will give you a different result depending on your choice to include NA values.! This is because the denominator changes. From ddb4a73df7d2c7838d7156376d9de8bf6255937c Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 01:35:45 -0500 Subject: [PATCH 13/16] swap out emo for emoji --- modules/Data_Cleaning/Data_Cleaning.Rmd | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 59db4c50c..41825a91b 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -12,7 +12,8 @@ opts_chunk$set(comment = "") library(readr) suppressPackageStartupMessages(library(dplyr)) library(tidyverse) - +install.packages("emoji", repos='http://cran.us.r-project.org') +library(emoji) ``` @@ -49,7 +50,7 @@ inaccuracies, or recoding it in a way that makes it more manageable. ::: {style="color: red;"} - MOST IMPORTANT RULE - LOOK AT YOUR DATA! +`r emoji("warning")` MOST IMPORTANT RULE - LOOK `r emoji("eyes")` AT YOUR DATA! `r emoji("warning")` ::: @@ -298,9 +299,9 @@ bike %>% ::: - Sometimes removing `NA` values leads to distorted math - be careful! +`r emoji("warning")` Sometimes removing `NA` values leads to distorted math - be careful! - Think about what your `NA` means for your data (are you sure ?). +`r emoji("warning")` Think about what your `NA` means for your data (are you sure ?). - Is an `NA` for values so low they could not be reported? @@ -316,13 +317,13 @@ survey reports 0 if student has tried cigarettes but did not smoke that week -You might want to keep the `NA` values so that you know the original sample size. +`r emoji("warning")` You might want to keep the `NA` values so that you know the original sample size. 
## Word of caution {.codesmall} - Calculating percentages will give you a different result depending on your choice to include NA values.! +`r emoji("warning")` Calculating percentages will give you a different result depending on your choice to include NA values.! This is because the denominator changes. From bebd92a1393cc219133d679c13a5f9e41b4ea4c0 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 01:41:16 -0500 Subject: [PATCH 14/16] requiring that they read in the data for the cleaning lab --- .../Data_Cleaning/lab/Data_Cleaning_Lab.Rmd | 15 ++++++++++++--- .../lab/Data_Cleaning_Lab_Key.Rmd | 19 +++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd index ae923a7d4..194ab7032 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab.Rmd @@ -21,13 +21,22 @@ library(jhur) library(tidyverse) library(broom) library(naniar) - -bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv") - ``` # Part 1 +0. Read in the bike data, you can use the URL or download the data. + +Bike Lanes Dataset: BikeBaltimore is the Department of Transportation's bike program. +The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms + +You can Download as a CSV in your current working directory. Note its also available at: http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv + +```{r} + + +``` + 1. Use the `is.na()` and `any()` functions to check if the bike `dateInstalled` variable has any `NA` values. Use the pipe between each step. Hint: You first need to `pull` out the vector version of this variable to use the `is.na()` function. 
diff --git a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd index afef625e2..b9233f5ff 100644 --- a/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd +++ b/modules/Data_Cleaning/lab/Data_Cleaning_Lab_Key.Rmd @@ -5,12 +5,6 @@ editor_options: chunk_output_type: console --- -# Data used - -Bike Lanes Dataset: BikeBaltimore is the Department of Transportation's bike program. -The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms - -You can Download as a CSV in your current working directory. Note its also available at: http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv ```{r} library(readr) @@ -21,12 +15,21 @@ library(jhur) library(tidyverse) library(broom) library(naniar) - -bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv") ``` + # Part 1 +0. Read in the bike data, you can use the URL or download the data. + +Bike Lanes Dataset: BikeBaltimore is the Department of Transportation's bike program. +The data is from http://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms + +You can Download as a CSV in your current working directory. Note its also available at: http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv + +```{r} +bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.csv") +``` 1. Use the `is.na()` and `any()` functions to check if the bike `dateInstalled` variable has any `NA` values. Hint: You first need to `pull` out the vector version of this variable to use the `is.na()` function. 
From 152c3dc9d3a7dfd39a2e59ad50813e883c6646da Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 02:13:35 -0500 Subject: [PATCH 15/16] adding example of why you might want NA values --- data/cleaning_diet_data.xlsx | Bin 5366 -> 5361 bytes modules/Data_Cleaning/Data_Cleaning.Rmd | 89 +++++++++++++----------- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/data/cleaning_diet_data.xlsx b/data/cleaning_diet_data.xlsx index 0af7a01603e23a0d2b04322a8e79a194d387399a..cf1e3058205df6c3b4a9ee60e23ab7a5d5c899d2 100644 GIT binary patch delta 1084 zcmV-C1jGCGDe)<=oC6L@+e%iP0ssJJ3IG5Y0F#gt8h@3RTW^~%6oB9ND-iD~7;>!$ zRlBI$9;!}jw>?e{IRvYKk?l6yzn{UxO{q^*UJyRl?;Kx~tG`w0y?~I6*K40#guVxL z#&@jVul=8ox09vsNk!`&EqM)V{~t*ID*cokxOkEUfbw*JTCV*&b=!+B6qR{JKxrT9n;p64us4bPq{sFevKprlIU%Yrr1 zPm>+fy0#uqK)&{`$>oN`evl?z>R${F^5oxBbo&4$WD0h=0lwE(w&nb(ResZ=uyqaI z`QCO^--Wk>oIaQ82S0oTwl9RM9jM{D0tyF1YNz*|SuOlnY!_&7H>KR4>P`3g_~~LQABY(pfmEx!c8}`bh9yYiy1f2G$1PGUd%Jo zz{L=;m%9n)mT*3%f0*%k!ntYUG#U^Z`N~bWu!M^-gqZOjU4RSIr&TzhGje2mEG^w~ z%nW13cSz@!hK;NSY(_qrx#3n8ZZ)QLn1AuTi)>}cA~GN|^3V-N!q+oD9D{~h=hW{>+L~CT_32ma}AQXpW@-c(ksYte*`CwlZNi$jtaeiq*p?&vxXP?@DS!L(_^2Rc31k8ju14D!9m6hZX}`sDaD9%Kfdh&QebF7gNXstg5t9`g4f|@L$6#owTgQOH?1debnk*1ja}Cj z-K4js{1ksJSzh1!RdZzyrxJjgN(v@C@MwKg;maZ&uH_*v(uvm!y?)e}4c)sqqwKs>h%004MwFLQKxY-MvUcx`M@O927^02BZK z00;m803iS`T}3G*0RR9{0ssIJ0000000001000000Hu?T6g~m?lg$((2AvWB0001u Ca_`ju delta 1077 zcmV-51j_sIDfTI_oC6NUcx&jO0ssJ93IG5Y0F#Ro8h@3RTW^~%6vyB9DG=`o7;+b( zN*A@;L)B{Sw#UgK2e1kl*-o>4`x#7}l-f~wLHJyJe*XI!-TW;J=M5Mqv|77p?7I$B z3Eh(_Tf0AB?nVpO;R08CT+j;E?myt}P5dP~QufMo0O80ED!z7eA?lUq@g#>5^D(WV zlAe^Z5`PQXWSPfn26*2lm4z4h{>&>esa(~=ihZUJnx-UyElmz3R6=#apuj@n^PJS& zjiWu0zGgfQgLLg~&}xf9*NdaJ^e+NOe(~=JynBWM5&?TDfa^3P+fn-3IKNAi-=umU za_?HIC+6%S#fL)tpvP}OvRuecrn=e|3miv`9)BIycFx;xa^QlNs)xo9M@i$~=m2RY zRY59v7EF4HEGptH0TFpZ!Zd9!>9TIGXT3<#MI)9W_3QQ2-|cw*$mwUM?CqzrlN?tW zyifO}AX=3`4?nQ!S^Pxxsqt<_rR3hvMClC 
zY%yeVZNU!~(H4pgEqiRrKAM{0mKtt3M1M6~@S}-rsmKD+9*7hKTSu5*kmn*v?jMz)jPIJeel+m2`}H5yL)gvv1rErjO_1KBPCvx>H8+ww!* z&G1pf&S4`w-x+AorD2UWu0a%bQ`|pOItS|S5AWhhQsWFBFp~?2I|Zb&sy~+J1|?H3 znD$@Ly8V+!@{S7mU^%sN`HTbBv?OmzsW>&7hmw96#{U7cUt*nl*Qoa7xtuIq|!(pyu0 zioX_GukZb;xiW`S2|!IH1rr{4w7#kEb&(F&@(>s4#A^leX`l3wx6>lWK9E@oz>aax z%ID;`&=WiY*$0#H2Njd06A+Wd4G;?e00001000000K1b;6d)W)ooI6< v0RR9{0ssIJ0000000001000000I3g?!4DIYofIko{gcQPBL<=p00000Ib-Oa diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index 41825a91b..aeabb2732 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -376,14 +376,14 @@ set.seed(124) data_diet <- tibble(Diet = rep(c("A", "B", "B"), times = 4), Treatment = c("Ginger", - "g", + "Ginger", "Other", "peppermint", - "Peppermint", - "G", + "peppermint", + "Ginger", "Mint", "O", - "ginger", + "Ginger", "mint", "Mint", "O"), @@ -394,7 +394,7 @@ data_diet <- tibble(Diet = rep(c("A", "B", "B"), ```{r, include = FALSE, echo = FALSE} -#writexl::write_xlsx(data_diet, here::here("data", "cleaning_diet_data.xlsx") +#writexl::write_xlsx(data_diet, here::here("data", "cleaning_diet_data.xlsx")) ``` ## Reading in the data if it were an excel sheet Data is also here: @@ -448,14 +448,12 @@ Need quotes for new values! Tolerates quotes for old values. 
```{r, eval = FALSE} data_diet %>% - mutate(Treatment = recode(Treatment, G = "Ginger", - g = "Ginger", - ginger = "Ginger", + mutate(Treatment = recode(Treatment, O = "Other", Mint = "Peppermint", mint = "Peppermint", peppermint = "Peppermint")) %>% - count(Treatment, Diet) + count(Treatment) ``` ## `recode()` @@ -463,14 +461,12 @@ data_diet %>% ```{r, eval = TRUE} data_diet %>% - mutate(Treatment = recode(Treatment, G = "Ginger", - g = "Ginger", - ginger = "Ginger", + mutate(Treatment = recode(Treatment, O = "Other", Mint = "Peppermint", mint = "Peppermint", peppermint = "Peppermint")) %>% - count(Treatment, Diet) + count(Treatment) ``` @@ -492,9 +488,7 @@ Need quotes for conditions and new values! ```{r} data_diet %>% - mutate(Treatment = case_when(Treatment == "G" ~ "Ginger", - Treatment == "g" ~ "Ginger", - Treatment == "ginger" ~ "Ginger", + mutate(Treatment = case_when( Treatment == "O" ~ "Other", Treatment == "Mint" ~ "Peppermint", Treatment == "mint" ~ "Peppermint", @@ -505,11 +499,13 @@ data_diet %>% ## What happened?{.codesmall} +We seem to have `NA` values! + +We didn't specify what happens to values that were already `Other` or `Ginger`. + ```{r, eval = FALSE} data_diet %>% - mutate(Treatment = case_when(Treatment == "G" ~ "Ginger", - Treatment == "g" ~ "Ginger", - Treatment == "ginger" ~ "Ginger", + mutate(Treatment = case_when( Treatment == "O" ~ "Other", Treatment == "Mint" ~ "Peppermint", Treatment == "mint" ~ "Peppermint", @@ -524,7 +520,7 @@ data_diet ## `case_when()` drops unspecified values Note that automatically values not reassigned explicitly by -`case_when()` will be `NA` unless otherwise specified. +`case_when()` will be `NA` unless otherwise specified. 
::: codeexample ```{r, eval = FALSE} @@ -544,9 +540,20 @@ or it can be the original values of the column ```{r} data_diet %>% - mutate(Treatment = case_when(Treatment == "G" ~ "Ginger", - Treatment == "g" ~ "Ginger", - Treatment == "ginger" ~ "Ginger", + mutate(Treatment = case_when( + Treatment == "O" ~ "Other", + Treatment == "Mint" ~ "Peppermint", + Treatment == "mint" ~ "Peppermint", + Treatment == "peppermint" ~ "Peppermint", + TRUE ~ Treatment)) %>% + count(Treatment) +``` + +## Typically it is good practice to include the TRUE statement + +```{r, eval = FALSE} +data_diet %>% + mutate(Treatment = case_when( Treatment == "O" ~ "Other", Treatment == "Mint" ~ "Peppermint", Treatment == "mint" ~ "Peppermint", @@ -555,13 +562,29 @@ data_diet %>% count(Treatment) ``` +You never know if you might be missing something - and if a value already was an NA it will stay that way. + +## But maybe we want NA? + +Perhaps we want values that are O or Other to actually be NA, then `case_when` can be helpful for this. We simply specify everything else. + +```{r} +data_diet %>% + mutate(Treatment = case_when(Treatment == "Ginger" ~ "Ginger", + Treatment == "Mint" ~ "Peppermint", + Treatment == "mint" ~ "Peppermint", + Treatment == "peppermint" ~ "Peppermint")) %>% + count(Treatment) +``` ## More complicated case_when() +`case_when` can do more complicated statements than `recode` and can match many patterns at a time. + ```{r} data_diet %>% mutate(Treatment = case_when( - Treatment %in% c("G", "g", "Ginger", "ginger") ~ "Ginger", + Treatment == "Ginger" ~ "Ginger", # keep it the same! Treatment %in% c("Mint", "mint", "Peppermint", "peppermint") ~ "Peppermint", Treatment %in% c("O", "Other") ~ "Other")) %>% @@ -573,7 +596,7 @@ data_diet %>% ## Another reason for `case_when()` -`case_when` can do very sophisticated comparisons +`case_when` can do very sophisticated comparisons! 
```{r} @@ -592,22 +615,6 @@ head(data_diet) data_diet %>% count(Diet, Effect) ``` -## Taking a look at the data - -```{r, echo = FALSE} -diet_effect <- data_diet %>% - count(Diet, Effect) - -data_diet %>% - count(Diet, Effect)%>% - ggplot(aes(x = Effect,y = n, fill = Diet)) + - geom_col(position = position_dodge()) + - labs(y = "Individuals", - title = "Effect of diet A & B on participants") - - -``` - # Working with strings From 6db4ad095295573c6dc1d5396560eb65efd62716 Mon Sep 17 00:00:00 2001 From: carriewright11 Date: Wed, 10 Jan 2024 02:27:42 -0500 Subject: [PATCH 16/16] adding showing more of what is happening in recoding and case_when so we dont overwrite only --- modules/Data_Cleaning/Data_Cleaning.Rmd | 78 ++++++++++++++++--------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/modules/Data_Cleaning/Data_Cleaning.Rmd b/modules/Data_Cleaning/Data_Cleaning.Rmd index aeabb2732..f9cf6c12f 100644 --- a/modules/Data_Cleaning/Data_Cleaning.Rmd +++ b/modules/Data_Cleaning/Data_Cleaning.Rmd @@ -433,7 +433,7 @@ could be hectic! In `dplyr` you can use the `recode` function. ```{r, eval = FALSE} # General Format - this is not code! {data_input} %>% - mutate({variable_to_fix} = recode({Variable_fixing}, {old_value} = {new_value}, + mutate({variable_to_fix_or_new} = recode({Variable_fixing}, {old_value} = {new_value}, {another_old_value} = {new_value})) ``` @@ -448,12 +448,12 @@ Need quotes for new values! Tolerates quotes for old values. 
```{r, eval = FALSE} data_diet %>% - mutate(Treatment = recode(Treatment, + mutate(Treatment_recoded = recode(Treatment, O = "Other", Mint = "Peppermint", mint = "Peppermint", peppermint = "Peppermint")) %>% - count(Treatment) + count(Treatment, Treatment_recoded) ``` ## `recode()` @@ -461,14 +461,27 @@ data_diet %>% ```{r, eval = TRUE} data_diet %>% - mutate(Treatment = recode(Treatment, + mutate(Treatment_recoded = recode(Treatment, O = "Other", Mint = "Peppermint", mint = "Peppermint", peppermint = "Peppermint")) %>% - count(Treatment) + count(Treatment, Treatment_recoded) ``` +## Can update or overwrite variables with recode too! + +Just use the same variable name to change the variable within mutate. + +```{r, eval = TRUE} +data_diet %>% + mutate(Treatment= recode(Treatment, + O = "Other", + Mint = "Peppermint", + mint = "Peppermint", + peppermint = "Peppermint")) %>% + count(Treatment) +``` ## Or you can use `case_when()` @@ -488,12 +501,12 @@ Need quotes for conditions and new values! 
```{r} data_diet %>% - mutate(Treatment = case_when( + mutate(Treatment_recoded = case_when( Treatment == "O" ~ "Other", Treatment == "Mint" ~ "Peppermint", Treatment == "mint" ~ "Peppermint", Treatment == "peppermint" ~ "Peppermint")) %>% - count(Treatment) + count(Treatment, Treatment_recoded) ``` @@ -512,10 +525,6 @@ data_diet %>% Treatment == "peppermint" ~ "Peppermint")) ``` -## Original data -```{r} -data_diet -``` ## `case_when()` drops unspecified values @@ -540,34 +549,47 @@ or it can be the original values of the column ```{r} data_diet %>% - mutate(Treatment = case_when( + mutate(Treatment_recoded = case_when( Treatment == "O" ~ "Other", Treatment == "Mint" ~ "Peppermint", Treatment == "mint" ~ "Peppermint", Treatment == "peppermint" ~ "Peppermint", TRUE ~ Treatment)) %>% - count(Treatment) + count(Treatment, Treatment_recoded) ``` ## Typically it is good practice to include the TRUE statement +You never know if you might be missing something - and if a value already was an NA it will stay that way. + ```{r, eval = FALSE} data_diet %>% - mutate(Treatment = case_when( + mutate(Treatment_recoded = case_when( Treatment == "O" ~ "Other", Treatment == "Mint" ~ "Peppermint", Treatment == "mint" ~ "Peppermint", Treatment == "peppermint" ~ "Peppermint", TRUE ~ Treatment)) %>% - count(Treatment) + count(Treatment, Treatment_recoded) ``` -You never know if you might be missing something - and if a value already was an NA it will stay that way. ## But maybe we want NA? Perhaps we want values that are O or Other to actually be NA, then `case_when` can be helpful for this. We simply specify everything else. 
+```{r} +data_diet %>% + mutate(Treatment_recoded = case_when(Treatment == "Ginger" ~ "Ginger", + Treatment == "Mint" ~ "Peppermint", + Treatment == "mint" ~ "Peppermint", + Treatment == "peppermint" ~ "Peppermint")) %>% + count(Treatment, Treatment_recoded) +``` +## case_when() can also overwrite/update a variable + +Just like with `recode()`, we just need to use the existing variable name on the left-hand side inside `mutate()`. + ```{r} data_diet %>% mutate(Treatment = case_when(Treatment == "Ginger" ~ "Ginger", @@ -575,6 +597,7 @@ data_diet %>% Treatment == "mint" ~ "Peppermint", Treatment == "peppermint" ~ "Peppermint")) %>% count(Treatment) + ``` ## More complicated case_when() @@ -583,12 +606,12 @@ data_diet %>% ```{r} data_diet %>% - mutate(Treatment = case_when( + mutate(Treatment_recode = case_when( Treatment == "Ginger" ~ "Ginger", # keep it the same! Treatment %in% c("Mint", "mint", "Peppermint", "peppermint") ~ "Peppermint", Treatment %in% c("O", "Other") ~ "Other")) %>% - count(Treatment) + count(Treatment, Treatment_recode) ``` @@ -598,6 +621,8 @@ data_diet %>% `case_when` can do very sophisticated comparisons! +Here we create a new variable called `Effect`. 
+ ```{r} data_diet <- data_diet %>% @@ -710,7 +735,7 @@ count(data_diet, Treatment) ```{r, eval = FALSE} data_diet %>% - mutate(Treatment = recode(Treatment, G = "Ginger", + mutate(Treatment_recoded = recode(Treatment, G = "Ginger", g = "Ginger", ginger = "Ginger", O = "Other", @@ -725,10 +750,11 @@ But we still might miss a strange value ```{r, eval = FALSE} data_diet %>% - mutate(Treatment = case_when( + mutate(Treatment_recoded = case_when( Treatment %in% c("G", "g", "Ginger", "ginger") ~ "Ginger", Treatment %in% c("Mint", "mint", "Peppermint", "peppermint") ~ "Peppermint", - Treatment %in% c("O", "Other") ~ "Other")) + Treatment %in% c("O", "Other") ~ "Other", + TRUE ~ Treatment)) ``` ## `case_when()` improved with `stringr` @@ -737,14 +763,14 @@ data_diet %>% ```{r} data_diet %>% - mutate(Treatment = case_when( - str_detect(string = Treatment, pattern = "^g|^G") ~ "Ginger", + mutate(Treatment_recoded = case_when( str_detect(string = Treatment, pattern = "int") ~ "Peppermint", - str_detect(string = Treatment, pattern = "^o|^O") ~ "Other")) %>% - count(Treatment) + str_detect(string = Treatment, pattern = "^o|^O") ~ "Other", + TRUE ~ Treatment)) %>% + count(Treatment, Treatment_recoded) ``` -This is a more robust solution! It will catch typos as long as first letter is correct. +This is a more robust solution! It will catch typos as long as the first letter is correct or the value contains "int" (part of the word mint). ## That's better!