# hyperopt-rf.py
# Random forest hyperparameter tuning with Hyperopt (TPE).
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import os
import logging
# Cap OpenMP at 4 threads to avoid possible subprocess call hangs
os.environ['OMP_NUM_THREADS'] = '4'
# Hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
# Set up logging for random forest param tuning.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(filename="logs/rf_hyperopt.log", level=logging.INFO)
train = pd.read_csv('data/train_final.csv')
test = pd.read_csv('data/test_final.csv')
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)
# Divide dataset into X and y
y = train.Y
X = train.drop(["Y"], axis=1)
X_test = test
# Impute missing features
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train_xform = imp.fit_transform(X)
# TODO: Impute so that F5 uses median and F19 uses mean. For now, we
# impute via mean for both (see the per-column sketch below).
X = pd.DataFrame(train_xform, columns=X.columns)
test_xform = imp.transform(X_test)
X_test = pd.DataFrame(test_xform, columns=X_test.columns)
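
# One way to tackle the TODO above (a sketch only, not wired into the run
# below): impute per column with a ColumnTransformer, median for F5 and
# mean for the remaining columns (including F19). Column names are taken
# from the TODO and assumed to exist in the data.
def impute_per_column(df):
    from sklearn.compose import ColumnTransformer
    median_cols = ['F5']
    mean_cols = [c for c in df.columns if c not in median_cols]
    ct = ColumnTransformer([
        ('median', SimpleImputer(strategy='median'), median_cols),
        ('mean', SimpleImputer(strategy='mean'), mean_cols),
    ])
    # ColumnTransformer emits columns in transformer order: median first.
    return pd.DataFrame(ct.fit_transform(df), columns=median_cols + mean_cols)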
# Hold out 10% for a final validation check (the Hyperopt objective below
# scores with 5-fold CV on the full training data).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.10, random_state=42)
# Scoring and optimization functions
def score(params):
    logging.info("Training with params: ")
    logging.info(params)
    rf = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features']
    )
    scores = cross_val_score(
        rf, X, y, scoring='roc_auc', cv=5, n_jobs=-1)
    score = scores.mean()
    std = scores.std()  # Standard deviation across folds
    logging.info("\tScore {0}\n\n".format(score))
    logging.info("\tStd Dev {0}\n\n".format(std))
    # Return the loss (1 - score), since fmin minimizes the objective.
    loss = 1 - score
    return {'loss': loss, 'status': STATUS_OK}
SEED = 42
def optimize(
        # trials,
        random_state=SEED):
    """
    Given a search space of hyperparameters and a scoring function
    (score above), find the best hyperparameters via TPE.
    """
    space = {
        'n_estimators': hp.choice('n_estimators', [120, 300, 500, 800, 1200]),
        'max_depth': hp.choice('max_depth', np.arange(5, 30, 5, dtype=int)),
        # min_samples_split must be >= 2 for RandomForestClassifier.
        'min_samples_split': hp.choice('min_samples_split', [2, 5, 10, 15, 100]),
        'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 5, 10]),
        'max_features': hp.choice('max_features', ['log2', 'sqrt', None]),
    }
    # Use fmin from Hyperopt to find the best hyperparameters.
    best = fmin(score, space, algo=tpe.suggest,
                # trials=trials,
                max_evals=250)
    # fmin returns hp.choice *indices*; map them back to actual values.
    return space_eval(space, best)
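
# If per-evaluation diagnostics are wanted, a Trials object can be passed
# through the commented-out hooks in optimize above; a minimal sketch:
#     trials = Trials()
#     best = fmin(score, space, algo=tpe.suggest, trials=trials,
#                 max_evals=250)
#     losses = trials.losses()  # one loss per evaluation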
best_hyperparams = optimize(
    # trials
)
print("The best hyperparameters are:\n")
print(best_hyperparams)
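
# Sanity check (a sketch): refit with the tuned hyperparameters on the 90%
# training split and score ROC AUC on the 10% held-out validation split.
# Assumes best_hyperparams holds actual parameter values (see the
# space_eval call in optimize above), not hp.choice indices.
best_rf = RandomForestClassifier(random_state=SEED, **best_hyperparams)
best_rf.fit(X_train, y_train)
val_auc = roc_auc_score(y_val, best_rf.predict_proba(X_val)[:, 1])
logging.info("Validation ROC AUC: {0}".format(val_auc))
print("Validation ROC AUC:", val_auc)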