-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandom_forest_and_boosting.R
71 lines (53 loc) · 1.9 KB
/
random_forest_and_boosting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
###########################
#      Random Forest      #
###########################
# Regression on the `Boston` data set (shipped with MASS) using random forests.
# library() is used rather than require(): a missing package then stops the
# script right here instead of returning FALSE with a warning and failing
# later with a less obvious error.
library(randomForest)
library(MASS)

# Define train and test sets: 300 randomly drawn row indices form the training
# set; the remaining rows, Boston[-train, ], are used as the test set.
# The seed is set once, before any random draw, so the split is reproducible.
set.seed(101)
dim(Boston)
train <- sample(seq_len(nrow(Boston)), 300)

# Check details of the dataset (shows its help page).
?Boston
# The response is "medv"; the formula `medv ~ .` uses every other column
# as a predictor.

# Baseline random forest with the package's default settings, fitted on the
# training rows only.
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train)
rf.boston  # print the fitted object
# Tuning parameter: "mtry", the number of variables tried at each split.
# One forest is fitted per candidate value and two error estimates are kept:
#   oob.err  - out-of-bag MSE of the full forest (last entry of fit$mse)
#   test.err - MSE of the predictions on the held-out rows Boston[-train, ]
rf.ntree <- 400                           # trees per forest; also indexes fit$mse
mtry.grid <- seq_len(ncol(Boston) - 1)    # one candidate per predictor of medv ~ .
oob.err <- double(length(mtry.grid))
test.err <- double(length(mtry.grid))
for (mtry in mtry.grid) {
  fit <- randomForest(medv ~ ., data = Boston, subset = train,
                      mtry = mtry, ntree = rf.ntree)
  # Index with rf.ntree (not a literal) so it always refers to the last tree.
  oob.err[mtry] <- fit$mse[rf.ntree]
  pred <- predict(fit, Boston[-train, ])
  test.err[mtry] <- with(Boston[-train, ], mean((medv - pred)^2))
}

# Plot both error curves against mtry. The x values come from mtry.grid rather
# than from whatever value the loop variable happens to hold after the loop.
matplot(mtry.grid, cbind(test.err, oob.err),
        pch = 19, col = c("red", "blue"),
        type = "b",
        xlab = "mtry", ylab = "Mean Squared Error")
# Legend order must follow the column order passed to matplot():
# first column (red) is the test error, second column (blue) is the OOB error.
legend("topright", legend = c("Test", "OOB"), pch = 19, col = c("red", "blue"))
##################################
#            Boosting            #
##################################
# library() instead of require(): stop immediately if gbm is not installed
# rather than continuing after a warning.
library(gbm)

# Boosted regression trees for medv, fitted on the training rows only.
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian",  # squared error loss
                    n.trees = 10000,            # total number of trees to fit
                    shrinkage = 0.01,           # how much we shrink each tree back
                    interaction.depth = 4)      # number of splits per tree
summary(boost.boston)

# Plot the relation of individual variables with the response.
# The argument is spelled out as `i.var`; the abbreviated `i=` only worked
# through partial argument matching.
plot(boost.boston, i.var = "lstat")
plot(boost.boston, i.var = "rm")
# Shrinkage can be tuned with CV...
# Predict on the held-out rows for a grid of ensemble sizes (100, 200, ...,
# 10000 trees); predmat holds one column of predictions per grid value.
n.trees <- seq(100, 10000, by = 100)
predmat <- predict(boost.boston, n.trees = n.trees, newdata = Boston[-train, ])

# Test MSE for each ensemble size: squared residuals against the observed
# medv of the test rows, averaged column by column.
berr <- apply((predmat - Boston[-train, "medv"])^2, MARGIN = 2, FUN = mean)

plot(x = n.trees, y = berr,
     main = "Boosting Test Error",
     xlab = "# trees", ylab = "Mean Squared Error",
     pch = 19)
# Red reference line: the lowest test error reached by the random forests.
abline(col = "red", h = min(test.err))