wrangling.Rmd

---
title: "Data Wrangling"
output: html_document
---

```{r}
library(tidyverse) # install.packages("tidyverse")

```

```{r}
## Load the gapminder CSV. The readr:: prefix spells out which package
## read_csv() comes from.
gapminder <- readr::read_csv(
  "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv"
)

# First look at the data
gapminder        # print the data frame
head(gapminder)  # first rows (6 when n is not given)
tail(gapminder)  # last rows (6 when n is not given)

head(gapminder, n = 3)
tail(gapminder, n = 10)

# Summary statistics

str(gapminder)
summary(gapminder)

# install.packages("skimr")
skimr::skim(gapminder)

names(gapminder)  # column names
dim(gapminder)    # dimensions
ncol(gapminder)   # number of columns
nrow(gapminder)   # number of rows

# Rebuild what dim() reports from nrow() and ncol()
c(nrow(gapminder), ncol(gapminder))

```

## Dplyr!

```{r}
# filter() keeps the rows that satisfy the condition(s)
filter(gapminder, lifeExp < 29)
filter(gapminder, country == "Mexico")
filter(gapminder, country %in% c("Mexico", "Rwanda"))
filter(gapminder, country == "Mexico" & year == 2002)

# Mean life expectancy of Sweden: subset the rows, then average the column
sweden <- filter(gapminder, country == "Sweden")
mean(sweden[["lifeExp"]])

```

```{r}
# select() picks columns
select(gapminder, year, lifeExp)           # keep just these two columns
select(gapminder, -c(continent, lifeExp))  # keep everything except these

# Combine filter() and select(): rows first, then columns
gap_cambodia <- filter(gapminder, country == "Cambodia")
gap_cambodia2 <- select(gap_cambodia, -c(continent, lifeExp))

```

Give 'Em the Pipe!!!!

```{r}
# Piping an object into a function ...
gapminder %>% head(n = 3)
# ... gives the same result as passing it as the first argument:
head(gapminder, n = 3)

# Rewrite the Cambodia example step by step.
# 1. Without the pipe:
gap_cambodia <- filter(gapminder, country == "Cambodia")
gap_cambodia2 <- select(gap_cambodia, -c(continent, lifeExp))

# 2. Each step piped separately:
gap_cambodia <- gapminder %>% filter(country == "Cambodia")
gap_cambodia2 <- gap_cambodia %>% select(-c(continent, lifeExp))

# 3. One chain with no intermediate object; gap_cambodia now holds the
#    filtered AND column-trimmed result:
gap_cambodia <- gapminder %>%
  filter(country == "Cambodia") %>%
  select(-c(continent, lifeExp))

```

```{r}
# mutate() adds new variables

# row_number() numbers the rows of whatever data reaches mutate(), so the
# index does not depend on the size of the original gapminder object
gapminder %>%
  mutate(index = row_number())

# a single value is recycled to every row
gapminder %>%
  mutate(planet = "Earth")

# new columns can be computed from existing ones
gapminder %>%
  mutate(gdp = pop * gdpPercap)

## Challenge:
# For Egypt and Vietnam, find the maximum and store it in a new column.
# NOTE: the solution below takes the maximum of total gdp (pop * gdpPercap),
# not of gdpPercap itself. No grouping is applied, so max_gdp is a single
# value computed over the rows of both countries together.

gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  mutate(gdp = pop * gdpPercap,
         max_gdp = max(gdp))      # you don't have to repeat "mutate"


```

# "group_by()" groups!

```{r}
## group_by() + mutate(): every row is kept and max_gdp is computed
## within each country
gap_group <- gapminder %>%
  group_by(country) %>%
  mutate(gdp = pop * gdpPercap) %>%
  mutate(max_gdp = max(gdp)) %>%
  ungroup()

## group_by() + summarize() (also spelled summarise()): the rows of each
## country collapse to a single max_gdp value
gap_summarized <- gapminder %>%
  group_by(country) %>%
  summarize(max_gdp = max(pop * gdpPercap)) %>%
  ungroup()

```

# "arrange()" arranges rows

```{r}
# Largest gdp (pop * gdpPercap) per country, sorted from highest to lowest
gap_summarized <- gapminder %>%
  group_by(country) %>%
  summarize(max_gdp = max(pop * gdpPercap)) %>%
  ungroup() %>%
  arrange(desc(max_gdp))

```

# Joining data

```{r}
## Load the CO2 data (same URL as yesterday, with co2.csv instead of gapminder.csv)
co2 <- read_csv(
  "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/co2.csv"
)

# Look at what was read
head(co2)
dim(co2) # 12 observations

# Keep only the 2007 rows of gapminder
gap_2007 <- filter(gapminder, year == 2007)

dim(gap_2007) # 142 observations

# left_join(): keep every row of gap_2007 and attach co2 columns by country
lj <- left_join(gap_2007, co2, by = "country")
# the same call written with the pipe:
lj <- gap_2007 %>%
  left_join(co2, by = "country")

# right_join(): keep every row of co2 and attach gap_2007 columns by country
rj <- right_join(gap_2007, co2, by = "country")
# the same matches as a left join with the two tables swapped:
lj2 <- left_join(co2, gap_2007, by = "country")

```

### TIDYR Session ###

```{r}
## Load the wide-format version of the gapminder data
gap_wide <- readr::read_csv(
  "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder_wide.csv"
)

```

# tidyr::gather()

```{r}
# Stack every column except continent and country into key/value pairs,
# then split each key at "_" into an obstype part and a year part.
# convert = TRUE asks separate() to convert the types of the new columns.
gap_long <- gap_wide %>%
  gather(key = "obstype_year", value = "obs_values", -c(continent, country)) %>%
  separate(obstype_year, into = c("obstype", "year"), sep = "_", convert = TRUE)

```

# plot long data

```{r}
# Life expectancy observations for Canada only
canada_df <- gap_long %>%
  filter(country == "Canada" & obstype == "lifeExp")

# Line plot of the observed value against year
ggplot(canada_df, aes(x = year, y = obs_values)) +
  geom_line()
```

# plot the life expectancy of all countries in the Americas

```{r}
# Life expectancy observations for every country in the Americas
life_df <- gap_long %>%
  filter(obstype == "lifeExp",
         continent == "Americas")

# Plot life_df (not canada_df, which holds Canada alone) so that each
# country in the Americas is drawn as its own colored line
ggplot(data = life_df, aes(x = year, y = obs_values, color = country)) +
  geom_line()
```

7.5 Exercise

```{r}
# Mean life expectancy per continent and year, for years after 1980
continents <- gap_long %>%
  filter(obstype == "lifeExp" & year > 1980) %>%
  group_by(continent, year) %>%
  summarize(mean_le = mean(obs_values)) %>%
  ungroup()

# One line per continent
ggplot(continents, aes(x = year, y = mean_le, color = continent)) +
  geom_line() +
  labs(title = "Mean Life Expectancy", x = "Year", y = "Age") +
  theme_bw()

```

# spread()

```{r}
# Give each obstype value its own column, filled from obs_values
gap_normal <- spread(gap_long, key = obstype, value = obs_values)
```

# Exercise


    7.6 Exercise

        Convert “gap_long” all the way back to gap_wide. Hint: you’ll need to create appropriate labels for all our new variables (time*metric combinations) with the opposite of separate: tidyr::unite().

        Knit the R Markdown file and sync to Github (pull, stage, commit, push)

```{r}
# Glue obstype and year back into one label, then spread the labels
# into columns filled from obs_values
gap_wide_again <- gap_long %>%
  unite(col = "var_names", obstype, year, sep = "_") %>%
  spread(key = var_names, value = obs_values)
  
```