-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathlesson5_student.rmd
407 lines (286 loc) · 8.23 KB
/
lesson5_student.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
Lesson 5
========================================================
### Multivariate Data
Notes:
***
### Moira Perceived Audience Size Colored by Age
Notes:
***
### Third Qualitative Variable
Notes:
```{r Third Qualitative Variable}
# NOTE(review): hard-coded, machine-specific working directory. Consider an
# RStudio project or knitr's `root.dir` option so the document is portable.
setwd("~/Repos//data-analysis-with-r")
library(ggplot2)
library(dplyr)

# Tab-separated pseudo-Facebook user data.
pf <- read.csv("./data/pseudo_facebook.tsv", sep = "\t")

# Age distribution per gender (rows with missing gender dropped); the
# cross marker (shape 4) shows each group's mean.
# `fun` is the current name of the argument formerly called `fun.y`
# (deprecated in ggplot2 3.3.0).
ggplot(
  data = subset(pf, !is.na(gender)),
  aes(x = gender, y = age)
) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 4)

# Median friend count across age, one line per gender.
# When passed through geom_line(stat = "summary"), ggplot2 >= 3.3.0 ignores
# `fun.y` and falls back to mean_se(), so the median was not what got drawn;
# `fun = median` restores the intended summary.
ggplot(
  data = subset(pf, !is.na(gender)),
  aes(x = age, y = friend_count)
) +
  geom_line(aes(color = gender), stat = "summary", fun = median)

# Write code to create a new data frame,
# called 'pf.fc_by_age_gender', that contains
# information on each age AND gender group.
# The data frame should contain the following variables:
# mean_friend_count,
# median_friend_count,
# n (the number of users in each age and gender grouping)
pf.fc_by_age_gender <- subset(pf, !is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    # as.numeric() keeps the median a double even for integer counts.
    median_friend_count = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # `age` grouping so later steps work on a plain table.
  ungroup()

head(pf.fc_by_age_gender)
```
***
### Plotting Conditional Summaries
Notes:
```{r Plotting Conditional Summaries}
# Median friend count against age, drawn as one coloured line per gender,
# using the per-age/per-gender summary table.
ggplot(
  data = pf.fc_by_age_gender,
  mapping = aes(x = age, y = median_friend_count)
) +
  geom_line(mapping = aes(colour = gender))
```
***
### Thinking in Ratios
Notes:
***
### Wide and Long Format
Notes:
***
### Reshaping Data
Notes:
```{r}
# Install reshape2 only when it is missing: an unconditional
# install.packages() re-installs the package on every run and can abort a
# knit when no CRAN mirror is configured.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}
library(reshape2)

# Long -> wide: one row per age, one column per gender level, each cell
# holding that group's median friend count.
pf.fc_by_age_gender.wider <- dcast(
  pf.fc_by_age_gender,
  age ~ gender,
  value.var = "median_friend_count"
)
```
***
### Ratio Plot
Notes:
```{r Ratio Plot}
# Exercise: plot the ratio of the female to male median friend counts
# using the wide data frame (pf.fc_by_age_gender.wider).
# Think about which geom to use, then add a dashed horizontal base line
# at y = 1 with geom_hline (see its `linetype` parameter).

# Female-to-male ratio of the median friend counts, one value per age.
pf.fc_by_age_gender.wider[["ratio"]] <- with(
  pf.fc_by_age_gender.wider,
  female / male
)

# Ratio across age; the dashed line at 1 marks equal medians.
ggplot(
  data = pf.fc_by_age_gender.wider,
  mapping = aes(x = age, y = ratio)
) +
  geom_line() +
  geom_hline(mapping = aes(yintercept = 1), linetype = "dashed")
```
***
### Third Quantitative Variable
Notes:
```{r Third Quantitative Variable}
# Exercise: create a variable called year_joined in the pf data frame,
# using the variable tenure and 2014 as the reference year. It should
# hold the year in which each user joined facebook.

# tenure is divided by 365 to convert it to years before subtracting from
# the 2014 reference year; floor() truncates to a whole year.
pf[["year_joined"]] <- with(pf, floor(2014 - tenure / 365))

# Five-number summary and per-year user counts of the new variable.
summary(pf[["year_joined"]])
table(pf[["year_joined"]])
```
***
### Cut a Variable
Notes:
```{r Cut a Variable}
# Exercise: create year_joined.bucket in the data frame by applying cut()
# to year_joined, producing these buckets:
#   (2004, 2009]
#   (2009, 2011]
#   (2011, 2012]
#   (2012, 2014]

# cut() builds right-closed intervals from consecutive break points.
pf[["year_joined.bucket"]] <- cut(
  x = pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# Number of users falling into each bucket.
table(pf[["year_joined.bucket"]])
```
***
### Plotting it All Together
Notes:
```{r Plotting it All Together}
# Create a line graph of friend_count vs. age
# so that each year_joined.bucket is a line
# tracking the median user friend_count across
# age. This means you should have four different
# lines on your plot.
# You should subset the data to exclude the users
# whose year_joined.bucket is NA.

# The subset now also drops rows with a missing year_joined.bucket, as the
# exercise requires (previously only missing gender was filtered, leaving an
# extra NA line on the plot).
# `fun = median` replaces `fun.y`, which ggplot2 >= 3.3.0 ignores when it is
# passed through geom_line(stat = "summary"), silently plotting the mean.
ggplot(
  data = subset(pf, !is.na(gender) & !is.na(year_joined.bucket)),
  aes(x = age, y = friend_count)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)
```
***
### Plot the Grand Mean
Notes:
```{r Plot the Grand Mean}
# Write code to do the following:
# (1) Add another geom_line to code below
# to plot the grand mean of the friend count vs age.
# (2) Exclude any users whose year_joined.bucket is NA.
# (3) Use a different line type for the grand mean.

# `fun` replaces `fun.y`: ggplot2 >= 3.3.0 ignores `fun.y` when it is passed
# through geom_line(stat = "summary") and warns about an unknown parameter,
# so the summary function is named explicitly here.
ggplot(
  data = subset(pf, !is.na(gender) & !is.na(year_joined.bucket)),
  aes(x = age, y = friend_count)
) +
  # Mean friend count per age within each join-year bucket.
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  # Grand mean over all buckets, drawn dashed (linetype 2).
  geom_line(stat = "summary", fun = mean, linetype = 2)
```
***
### Friending Rate
Notes:
```{r Friending Rate}
# Friending rate = friends per day of tenure. Only users with at least one
# day of tenure are kept so the division never hits zero.
pf.tenure <- subset(pf, tenure > 0)

# The rate column was never created before being summarised (the original
# summarised a non-existent pf$friending_rate); compute it on the subset.
pf.tenure$friending_rate <- pf.tenure$friend_count / pf.tenure$tenure
summary(pf.tenure$friending_rate)
```
***
### Friendships Initiated
Notes:
What is the median friend rate?
0.2205
What is the maximum friend rate?
417
```{r Friendships Initiated}
# Create a line graph of mean of friendships_initiated per day (of tenure)
# vs. tenure colored by year_joined.bucket.

# The layers are now chained with `+`; previously geom_line() sat on its own
# line after a missing `+`, so it was never added to the plot.
# `fun = mean` replaces `fun.y`, which ggplot2 >= 3.3.0 ignores when passed
# through geom_line(stat = "summary").
ggplot(
  data = subset(pf, tenure >= 1),
  aes(x = tenure, y = friendships_initiated / tenure)
) +
  # Smoothed trend per join-year bucket.
  geom_smooth(aes(color = year_joined.bucket)) +
  # Mean friendships initiated per day at each tenure value.
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
```
***
### Bias-Variance Tradeoff Revisited
Notes:
```{r Bias-Variance Tradeoff Revisited}
# Mean friendships initiated per day of tenure, per join-year bucket, at
# increasingly coarse tenure resolutions. Rounding tenure to wider bins
# averages over more users per x value.
# `fun = mean` replaces `fun.y`, which ggplot2 >= 3.3.0 ignores when passed
# through geom_line(stat = "summary").

# Daily resolution (no binning).
ggplot(
  data = subset(pf, tenure >= 1),
  aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Tenure rounded to the nearest multiple of 7 days.
ggplot(
  data = subset(pf, tenure > 0),
  aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Tenure rounded to the nearest multiple of 30 days.
ggplot(
  data = subset(pf, tenure > 0),
  aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Tenure rounded to the nearest multiple of 90 days.
ggplot(
  data = subset(pf, tenure > 0),
  aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
```
***
### Sean's NFL Fan Sentiment Study
Notes:
***
### Introducing the Yogurt Data Set
Notes:
***
### Histograms Revisited
Notes:
```{r Histograms Revisited}
# Load the yogurt purchase data and print a per-column summary.
yo <- read.csv(file = "./data/yogurt.csv")
summary(yo)

# Treat the id column as a categorical label rather than a number.
yo[["id"]] <- as.factor(yo[["id"]])

# Price distribution, first with ggplot2's default binning ...
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram()

# ... then with base graphics for comparison.
hist(yo$price)
```
***
### Number of Purchases
Notes:
```{r Number of Purchases}
# Exercise: create a new variable called all.purchases giving the total
# count of yogurt for each observation or household.
# transform() returns a data frame, so its result must be saved back
# to 'yo'.

# Open the documentation for transform().
help("transform")

# Row-wise total across the five flavour columns.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)
```
***
### Prices over Time
Notes:
```{r Prices over Time}
# Price against time; points are drawn at 10% opacity so that overlapping
# observations show up as darker regions.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_point(alpha = 0.1)
```
***
### Sampling Observations
Notes:
***
### Looking at Samples of Households
```{r Looking at Sample of Households}
# Fix the RNG so the same 16 household ids are drawn on every run.
set.seed(4230)
sample.ids <- sample(levels(yo$id), size = 16)

# One panel per sampled household: price over time, with open circles
# (shape 1) sized by the number of items bought.
ggplot(
  data = yo[yo$id %in% sample.ids, ],
  mapping = aes(x = time, y = price)
) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), shape = 1) +
  facet_wrap(~id)
# Plot price over time for a random sample of 16 households.
#
# yo:   data frame with a factor column `id` plus `time`, `price` and
#       `all.purchases` columns.
# seed: value passed to set.seed() before sampling; it is also shown in
#       the plot title.
#
# Returns the ggplot object: one facet per sampled household.
plotSampleOfHouseholds <- function(yo, seed) {
  set.seed(seed)
  chosen_ids <- sample(levels(yo$id), size = 16)
  sampled_rows <- yo[yo$id %in% chosen_ids, ]

  ggplot(data = sampled_rows, mapping = aes(x = time, y = price)) +
    geom_line() +
    # Open circles (shape 1) sized by the number of items bought.
    geom_point(mapping = aes(size = all.purchases), shape = 1) +
    facet_wrap(~id) +
    ggtitle(paste("Seed: ", seed))
}
# Draw the household sample plot for seeds 1, 2 and 3. print() is needed
# because plots are not auto-printed inside a function body.
invisible(lapply(1:3, function(s) print(plotSampleOfHouseholds(yo, s))))
```
***
### The Limits of Cross Sectional Data
Notes:
***
### Many Variables
Notes:
***
### Scatterplot Matrix
Notes:
***
```{r}
# Keep columns 2 through 15 of pf for the scatterplot matrix.
pf_sample <- subset(pf, select = 2:15)

# Install psych only when it is missing: an unconditional
# install.packages() re-installs the package on every run and can abort a
# knit when no CRAN mirror is configured.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
library(dplyr)

# NOTE(review): no explicit sampling follows this seed and pf_sample keeps
# every row of pf -- confirm whether a row sample was intended here.
set.seed(1836)

# Pairwise scatterplot matrix; pch = "." plots each point as a small dot.
pairs.panels(pf_sample, pch = ".")
```
### Even More Variables
Notes:
***
### Heat Maps
Notes:
```{r}
# Read the nci table and label its columns 1 through 64.
nci <- read.table(file = "nci.tsv")
colnames(nci) <- seq_len(64)
```
```{r}
# Melt the first 200 rows of nci into long format: one row per
# (row index, column label) pair with its value.
nci.long.samp <- setNames(
  melt(as.matrix(nci[seq_len(200), ])),
  c("gene", "case", "value")
)
head(nci.long.samp)

# Heat map: one tile per gene/case pair, filled on a 100-step
# blue-to-red colour ramp.
ggplot(
  data = nci.long.samp,
  mapping = aes(y = gene, x = case, fill = value)
) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
```
***
### Analyzing Three or More Variables
Reflection:
***
Click **KnitHTML** to see all of your hard work and to have an html
page of this lesson, your answers, and your notes!