-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathsharp_bikehelms.R
136 lines (120 loc) · 5.49 KB
/
sharp_bikehelms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
library("ggplot2")
library("dplyr")
require("grid")
require("MASS")
require("stats")
library("xtable")
helms <- read.table("../data/helmet_data.tsv", sep="\t")
colnames(helms) <- c("make", "model", "type", "price", "rating", "sizes")
## 1) Get price ranges per manufacturer
summary <- group_by(helms, make) %>% summarise(count=n(), middle=median(price))
summary <- summary[order(summary$middle),]
summary <- summary[summary$count > 5,]
h2 <- helms[helms$make %in% summary$make,]
h2$make <- factor(h2$make, levels=summary$make)
pdf("../figures/helmet_price_ranges.pdf", 5.5, 5.5)
ggplot(h2, aes(x=make, y=price)) +
geom_boxplot(fill=I("grey80"), width=I(.75),
colour=I("grey80"), outlier.size=0) +
theme_bw() +
coord_flip() + labs(x="", y="Price (GBP)") +
stat_summary(geom = "crossbar", width=0.65, fatten=1.2,
color="white", fun.y=median, fun.ymin=median, fun.ymax=median) +
stat_summary(geom="text", aes(label=make), hjust=1, size=I(4), colour=I("grey20"),
fun.y=function(d){
#boxplot.stats(d)$stats[2]}
# apparently done diff, see https://groups.google.com/forum/#!topic/ggplot2/_CJRfU0Cx6w
min(d[d > (quantile(d, .25) - 1.5*IQR(d))]) }) +
theme(axis.text.y=element_blank(), panel.grid.major.y=element_blank(),
axis.ticks.y=element_blank()) +
ggtitle("Price ranges of motorcycle helmet brands") +
scale_y_continuous(breaks=seq(0,600, by=100), limits=c(0,600))# + ylim(0, 600)
dev.off()
## 2) Look at helmet type vs. safety rating
h3 <- group_by(helms, type, rating) %>% summarise(n=n())
n <- group_by(helms, type) %>% summarise(totals=n())
# Somehow can't figure out how to do this properly, oh well..
h3[h3$type == "System","n"] <- h3[h3$type == "System","n"] / n[n$type=="System",]$totals
h3[h3$type == "Full face","n"] <- h3[h3$type == "Full face","n"] / n[n$type=="Full face",]$totals
h3$type <- ifelse(h3$type == "System", "Flip-front", "Full face")
offset <- position_dodge(width=.6)
pdf("../figures/helmet_types.pdf", 6, 6)
ggplot(h3, aes(x=paste0(rating, "*"), y=100*n,
col=type, fill=type, group=type)) +
theme_bw() + coord_flip() +
geom_bar(width=.5, stat="identity", col=NA,
position=offset) +
geom_text(aes(label=100*round(n, 3)), hjust=-0.2,
position=offset, col=I("black")) +
labs(x="SHARP rating", y="Percent of helmets tested", fill="Helmet") + ylim(0,50) +
theme(legend.position=c(.8,.2), panel.grid.major.y=element_blank()) +
scale_fill_manual(values=c("grey80", "grey60")) +
ggtitle("Safety ratings by helmet type")
dev.off()
## 3) Helmet brands -- does expensive == better?
many <- names(which(table(helms$make) > 8))
# simplify plot, rm those I don't know off-hand
many <- many[!many %in% c("Box", "KBC", "Lazer")]
pdf("../figures/helmets_acrossbrands.pdf", 8, 5)
ggplot(subset(helms, make %in% many), aes(x=price, y=rating, col=make)) +
geom_jitter(aes(shape=type)) +
geom_smooth(method="rlm", se=F) +
theme_bw() +
labs(x="Price (GBP)", y="SHARP rating", col="Brand", shape="Type") +
ggtitle("Trends across price ranges per brand") +
theme_bw() +
scale_color_brewer(type="qual", palette="Paired")
dev.off()
## Are more epensive helmets more protective overall?
lm_eqn = function(df){
## from: http://stackoverflow.com/a/7549819/1274516
m = lm(rating ~ price, df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(coef(m)[1], digits = 2),
b = format(coef(m)[2], digits = 2),
r2 = format(summary(m)$r.squared, digits = 2)))
as.character(as.expression(eq));
}
pdf("../figures/helmets_overall_lm.pdf", 7, 6)
ggplot(helms, aes(x=price, y=rating)) +
geom_jitter(aes(shape=type, col=type)) +
geom_smooth(method="lm", se=T, col="#ffa775",alpha=I(.2)) +
labs(x="Price (GBP)", y="SHARP rating", col="Type", shape="Type") +
ggtitle("Are more expensive helmets more protective?") +
theme_bw() + theme(legend.position=c(.8,.2)) + xlim(0,600) +
scale_color_manual(values=c("black", "grey40")) +
annotate("text", x=400, y=.5, label=lm_eqn(helms), parse=T)
dev.off()
# (Should really be using ordered logit model or probit)
## 10 best / worst helmets in terms of value for money
top <- helms[helms$rating == 5,]
top <- head(top[order(top$price, decreasing=F),-6], 10)
rownames(top) <- NULL
print(xtable(top), type="html")
bottom <- helms[helms$rating %in% c(1,2),]
bottom <- head(bottom[order(bottom$price, decreasing=T),-6], 10)
rownames(bottom) <- NULL
print(xtable(bottom), type="html")
# ranked of median price vs. rank of median safety:
h5 <- group_by(helms, make) %>%
summarise(n=n(), median.price=median(price),
median.rating=mean(rating)) %>%
filter(n >5)
h5$median.price <- rank(h5$median.price)
pdf("../figures/helmets_brand_summary.pdf", 6, 6)
ggplot(h5, aes(x=median.rating, y=median.price,
label=make)) +
labs(x="Mean safety rating",
y="Median price (rank / 22)") +
annotate("rect", xmin=1, xmax=5, ymin=0, ymax=7.33,
fill=I("grey80"), alpha=I(.4)) +
annotate("rect", xmin=quantile(h5$median.rating, .66),
xmax=5, ymin=0, ymax=23,
fill=I("grey80"), alpha=I(.4)) +
geom_text() + theme_bw() +
annotate("text", x=1.5, y=6, label="Cheapest",
col=I("#ffa775"), face="bold", size=6) +
annotate("text", x=4.5, y=22, label="Safest",
col=I("#ffa775"), face="bold", size=6) +
ggtitle("Brand summary over all tested helmets")
dev.off()