-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwrangling.Rmd
258 lines (186 loc) · 5.47 KB
/
wrangling.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
---
title: "Data Wrangling"
output: html_document
---
```{r}
library(tidyverse) # install.packages("tidyverse")
```
```{r}
## Load the gapminder CSV. The readr:: prefix names the package that provides read_csv()
gapminder <- readr::read_csv(
  "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv"
)

# Inspect the data
gapminder                # print the object
head(gapminder)          # first observations (6 by default)
tail(gapminder)          # last observations (6 by default)
head(gapminder, n = 3)
tail(gapminder, n = 10)

# Summary statistics
str(gapminder)
summary(gapminder)
# install.packages("skimr")
skimr::skim(gapminder)

# Shape of the data
names(gapminder)         # column names
dim(gapminder)           # dimensions
ncol(gapminder)          # number of columns
nrow(gapminder)          # number of rows

# Reverse-engineer dim() from nrow() and ncol()
c(nrow(gapminder), ncol(gapminder))
```
## Dplyr!
```{r}
# filter() keeps the rows that satisfy the given conditions
filter(gapminder, lifeExp < 29)
filter(gapminder, country == "Mexico")
filter(gapminder, country %in% c("Mexico", "Rwanda"))
filter(gapminder, country == "Mexico", year == 2002)  # conditions are combined with AND

# Mean life expectancy of Sweden: subset the rows first, then average the column
sweden <- filter(gapminder, country == "Sweden")
mean(sweden[["lifeExp"]])
```
```{r}
# select() picks columns
select(gapminder, year, lifeExp)         # keep only these columns
select(gapminder, -continent, -lifeExp)  # drop these columns, keep the rest

# Combine filter() and select() through an intermediate object
gap_cambodia <- filter(gapminder, country == "Cambodia")
gap_cambodia2 <- select(gap_cambodia, -continent, -lifeExp)
```
Give 'Em the Pipe!!!!
```{r}
# Piping the data into head() ...
gapminder %>% head(3)
# ... is equivalent to passing it as the first argument
head(gapminder, 3)

# Starting point: two separate calls linked by an intermediate object
gap_cambodia <- filter(gapminder, country == "Cambodia")
gap_cambodia2 <- select(gap_cambodia, -continent, -lifeExp)

# Step 1: the same two calls, each rewritten with the pipe
gap_cambodia <- gapminder %>% filter(country == "Cambodia")
gap_cambodia2 <- gap_cambodia %>% select(-continent, -lifeExp)

# Step 2: a single chain with no intermediate object
# (gap_cambodia now holds the filtered AND column-trimmed data)
gap_cambodia <- gapminder %>%
  filter(country == "Cambodia") %>%
  select(-continent, -lifeExp)
```
```{r}
# mutate() adds new variables

# Row index. seq_len() is used instead of 1:nrow() because 1:0 would give
# c(1, 0) for an empty table, whereas seq_len(0) is empty.
gapminder %>%
  mutate(index = seq_len(nrow(gapminder)))

# A constant is recycled to every row
gapminder %>%
  mutate(planet = "Earth")

# A new column computed from existing columns
gapminder %>%
  mutate(gdp = pop * gdpPercap)

## Challenge:
# Find the maximum gdpPercap of Egypt and Vietnam. Create a new column.
# NOTE(review): the code below takes the maximum of total gdp
# (pop * gdpPercap), not of gdpPercap itself, and because there is no
# group_by() the maximum is taken over the rows of both countries together.
# Confirm this is the intended answer.
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  mutate(
    gdp = pop * gdpPercap,
    max_gdp = max(gdp)  # several columns can be created in one mutate() call
  )
```
# "group_by()" groups!
```{r}
## group_by() + mutate(): max_gdp is computed within each country and
## attached to every row of that country
gap_group <- gapminder %>%
  group_by(country) %>%
  mutate(
    gdp = pop * gdpPercap,
    max_gdp = max(gdp)
  ) %>%
  ungroup()

## group_by() + summarize() (also spelled summarise()): the rows of each
## country are collapsed into a single max_gdp value
gap_summarized <- gapminder %>%
  group_by(country) %>%
  mutate(gdp = pop * gdpPercap) %>%
  summarize(max_gdp = max(gdp)) %>%
  ungroup()
```
# "arrange()" orders rows by column values
```{r}
# Per-country maximum GDP, sorted from the largest max_gdp to the smallest
gap_summarized <- gapminder %>%
  group_by(country) %>%
  mutate(gdp = pop * gdpPercap) %>%
  summarize(max_gdp = max(gdp)) %>%
  ungroup() %>%
  arrange(desc(max_gdp))
```
# Joining data
```{r}
## Read the CO2 data (same URL as yesterday, with co2.csv instead of gapminder.csv)
co2 <- read_csv(
  "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/co2.csv"
)

# Explore
head(co2)
dim(co2)  # 12 observations

# Keep only the 2007 rows of gapminder in a new object
gap_2007 <- filter(gapminder, year == 2007)
dim(gap_2007)  # 142 observations

# left_join: gap_2007 is the left table, co2 is joined onto it by country
lj <- left_join(gap_2007, co2, by = "country")

# The same join written with the pipe
lj <- gap_2007 %>%
  left_join(co2, by = "country")

# right_join: co2 is the right table, gap_2007 is joined onto it by country
rj <- right_join(gap_2007, co2, by = "country")

# The same join expressed as a left_join with the two tables swapped
lj2 <- left_join(co2, gap_2007, by = "country")
```
### TIDYR Session ###
```{r}
## Load the wide-format version of the gapminder data
gap_wide <- readr::read_csv(
  "https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder_wide.csv"
)
```
# tidyr::gather()
```{r}
# Wide -> long: stack every column except continent and country into
# key/value pairs, then split the key on "_" into obstype and year.
gap_long <- gap_wide %>%
  gather(
    key = obstype_year,
    value = obs_values,
    -continent, -country
  ) %>%
  separate(
    col = obstype_year,
    into = c("obstype", "year"),
    sep = "_",
    convert = TRUE  # let separate() convert the new columns to suitable types
  )
```
# plot long data
```{r}
# Life expectancy rows for Canada only
canada_df <- gap_long %>%
  filter(
    country == "Canada",
    obstype == "lifeExp"
  )

# Line chart of the observed values over the years
ggplot(data = canada_df, aes(x = year, y = obs_values)) +
  geom_line()
```
# plot the life expectancy of all countries in the Americas
```{r}
# Life expectancy rows for every country in the Americas
life_df <- gap_long %>%
  filter(
    obstype == "lifeExp",
    continent == "Americas"
  )

# One line per country. The plot must use life_df: plotting canada_df here
# would draw a single Canada line instead of all countries in the Americas.
ggplot(data = life_df, aes(x = year, y = obs_values, color = country)) +
  geom_line()
```
7.5 Exercise
```{r}
# Mean life expectancy per continent and year, for years after 1980
continents <- gap_long %>%
  filter(
    obstype == "lifeExp",
    year > 1980
  ) %>%
  group_by(continent, year) %>%
  summarize(mean_le = mean(obs_values)) %>%
  ungroup()

# Plot using ggplot: one coloured line per continent
ggplot(data = continents, aes(x = year, y = mean_le, color = continent)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Age",
    title = "Mean Life Expectancy"
  ) +
  theme_bw()
```
# spread()
```{r}
# Long -> intermediate format: each distinct obstype becomes its own column,
# filled with the matching obs_values
gap_normal <- gap_long %>%
  spread(
    key = obstype,
    value = obs_values
  )
```
# Exercise
7.6 Exercise
Convert “gap_long” all the way back to gap_wide. Hint: you’ll need to create appropriate labels for all our new variables (time*metric combinations) with the opposite of separate: tidyr::unite().
Knit the R Markdown file and sync to GitHub (pull, stage, commit, push)
```{r}
# Long -> wide: rebuild the combined "obstype_year" labels with unite()
# (the opposite of separate()), then spread them into one column per label
gap_wide_again <- gap_long %>%
  unite(col = var_names, obstype, year, sep = "_") %>%
  spread(
    key = var_names,
    value = obs_values
  )
```