-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlab5.R
136 lines (102 loc) · 4.1 KB
/
lab5.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
####Stat 10 - Lab #5
## Name: Callista Wu
## SID: 904886275
## TA: Gabriel Ruiz
## Lab/Discussion Time: 1C/10AM
######################################################
#Lab 5 - Hypothesis Testing Bonanza!
######################################################
# Exercise 1: Hypothesis testing with one proportion
#Set working directory
flint <- read.csv("flint_2015.csv", header=TRUE)
#a) null vs alternative hypotheses
# one-sided vs two-sided tests
#b) Sample proportion and sample standard deviation for households with dangerous lead levels.
n <- nrow(flint)
dangerous_lead_indicator <- (flint$Pb >= 15)
p_hat <- mean(dangerous_lead_indicator)
sd_sample <- sqrt(p_hat*(1-p_hat)/n)
print(p_hat)
print(sd_sample)
#c) Standard Error and z-statistic under the Null: p=0.10.
p_null <- 0.10
se_null <- sqrt(p_null*(1-p_null)/n)
z_stat <- (p_hat-p_null)/se_null
print(z_stat)
print(se_null)
#d) p-value for above z-statistic and alternative hypothesis:
# H1: p > 0.10.
# Note: pnorm function gives probability to the left of input value
# in standard normal distribution.
p_value <- 1-pnorm(z_stat,sd=1,mean=0)
print(p_value)
# Review question 1: If we actually had the alternative hypothesis
# "H1: p < 0.10", would we take the probability to the right or left
# of our z_stat?
# Review question 2: If we actually had the alternative hypothesis
# "H1: p is not equal 0.10", how would we calculate the p value?
#e) Reject null hypothesis?
#f) See lab manual.
#g) Same hypothesis test with ’prop.test’ function in Mosaic.
library(mosaic)
prop.test(x=sum(dangerous_lead_indicator),n=n,p=0.10,alternative="greater")
#h) 99% Confidence interval with "greater than" alternative hypothesis.
prop.test(x=sum(dangerous_lead_indicator),n=n,p=0.10,
alternative="greater",conf.level = 0.99)
######################################################
# Exercise 2: Hypothesis testing with two proportions
flint <- read.csv("flint_2015.csv", header=TRUE)
#a) null and alternative hypotheses
# one-sided or two-sided test?
#b) Computing necessary values for z statistic.
flint_north <- flint[flint$Region=="North" , ]
n_north <- nrow(flint_north) #sample size for north
flint_south <- flint[flint$Region=="South" , ]
n_south <- nrow(flint_south) #sample size for south
#Sample proporition of households with dangerous Pb levels.
p_hat_north <- mean(flint_north$Pb>=15)
p_hat_south <- mean(flint_south$Pb>=15)
#Pooled Proportion of households with dangerous Pb levels.
#Note: same p hat as earlier.
p_hat_pooled <- mean(flint$Pb >= 15)
#Standard deviation for two proportion test (see lab manual).
SE <- sqrt( p_hat_pooled*(1-p_hat_pooled) * (1/n_north + 1/n_south) )
#Finally, the z-statistic for a 2 proportion test:
z_stat <- (p_hat_north-p_hat_south-0)/SE
print(z_stat)
#c) P-value for this test.
# Why do we multiply by 2?
p_value <- (1-pnorm(z_stat,sd=1,mean=0) )*2
print(p_value)
#d) Reject null hypothesis?
#e) 2 sample proportion hypothesis test using 'prop.test' in mosaic package.
library(mosaic)
x_north <- sum(flint_north$Pb>=15)
x_south <- sum(flint_south$Pb>=15)
prop.test(x=c(x_north,x_south),n=c(n_north,n_south),
alternative="two.sided")
######################################################
# Exercise 3: Hypothesis testing with means
flint <- read.csv("flint_2015.csv", header=TRUE)
#a) null and alternative hypotheses
#one-sided or two-sided test?
#b) Mean and SD of Cu levels in Flint, MI.
( xbar <- mean(flint$Cu) )
( s <- sd(flint$Cu) )
#c) Standard error of xbar. Refer to lecture notes.
n <- nrow(flint)
(SE <- s/sqrt(n))
#d) t statistic and P value for this test using t distribution with n-1 df.
(t_stat <- (xbar-40)/SE)
p_value <- (1-pt(t_stat,df=n-1) )*2
print(p_value)
#e) Reject null hypothesis?
#f) Same test with 't.test' function in mosaic package.
library(mosaic)
t.test(flint$Cu, mu = 40, alt = "two.sided", conf.level = 0.99)
######################################################
# Extra Credit: Hypothesis testing in regression
soil <- read.table("http://www.stat.ucla.edu/~nchristo/statistics_c173_c273/soil_complete.txt",
header=TRUE)
linear_model <- lm(soil$lead ~ soil$zinc)
summary(linear_model)