xgb.py
import pandas as pd
from sklearn_pandas import DataFrameMapper
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation as cv
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from util import *
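# note: sklearn.grid_search and sklearn.cross_validation are the pre-0.18 module
# paths; on sklearn >= 0.20 both APIs live in sklearn.model_selection instead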
# the __main__ guard is necessary for multiprocessing (GridSearchCV with n_jobs > 1)
if __name__ == '__main__':
    train_file = "data/train_feat.csv"
    test_file = "data/test_feat.csv"
    ensem_pred_file = "data/ensem_xgb_pred.csv"
    test_pred_file = "data/xgb_blend/test_xgb_88.csv"
    do_grid_search = False
    pred_ensem = False
    pred_test = True
    # load data
    train = load_train(train_file)
    # list of features:
    # ['AgeuponOutcome', 'Breed', 'Color', 'Mix', 'Multicolor', 'Name', 'OutcomeType', 'SexuponOutcome',
    #  'Hour', 'Weekday', 'Month', 'Day', 'Workday', 'Year']
    # set up a data frame mapper: binarize the three categorical variables and pass the rest through;
    # standardization is skipped since it is not needed for tree-based classifiers
    mapper = DataFrameMapper([('Breed', LabelBinarizer()),
                              ('Color', LabelBinarizer()),
                              ('SexuponOutcome', LabelBinarizer())],
                             default=None, sparse=True)
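    # illustration (hypothetical values): a fitted LabelBinarizer expands a column such as
    #   SexuponOutcome = ['Neutered Male', 'Spayed Female', 'Intact Male', ...]
    # into one 0/1 indicator column per category; default=None passes the remaining
    # columns through untouched, and sparse=True keeps the combined matrix sparse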
    # XGB classifier instance
    xgb = XGBClassifier(max_depth=8, learning_rate=0.02, n_estimators=500, objective='multi:softprob',
                        subsample=0.8, colsample_bytree=0.8, nthread=1)
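    # objective='multi:softprob' makes predict_proba return one probability per outcome
    # class, which is what the multi-class log loss below expects; nthread=1 so that
    # parallelism comes from GridSearchCV's n_jobs rather than from each booster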
    # pipeline used for grid search and cross-validation
    pipeline = Pipeline([
        ('mapper', mapper),
        ('xgb', xgb)])
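    # putting the mapper inside the pipeline means the binarizers are re-fit on each
    # cross-validation training fold, so the encoding never sees the validation fold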
    # grid search parameters
    param_grid = {'xgb__max_depth': [10],
                  'xgb__learning_rate': [0.02],
                  'xgb__n_estimators': [500],
                  'xgb__subsample': [0.8],
                  'xgb__colsample_bytree': [0.7],
                  'xgb__reg_lambda': [1]}
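    # every list above holds a single value, so this grid is one combination (a plain
    # CV run); to actually search, list several candidates per key, e.g. (hypothetical
    # values) 'xgb__max_depth': [8, 10, 12]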
    # data set to use for grid search
    data_set = 'dog'
    X, y = train[data_set]
    # cross-validation strategy
    skf = cv.StratifiedKFold(y, n_folds=10, shuffle=True)
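    # stratified folds keep the outcome-class proportions roughly constant across folds,
    # which steadies the per-fold log loss for the rarer outcome classes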
    # grid search: verbose=0 is silent, 1 prints some progress, higher values give more detail
    gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='log_loss', cv=skf,
                      n_jobs=5, refit=False, verbose=1)
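    # refit=False skips refitting on the full data since only the CV scores are used here;
    # the 'log_loss' scorer is negated internally (greater is better), hence -best_score_ below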
    # comment added to the first line of the log file
    note_str = "{} subset using XGB classifier, binarize breed, color, and sex".format(data_set)
    if do_grid_search:
        gs.fit(X, y)
        print("best score: {s:.4f}".format(s=-gs.best_score_))
        write_search_log(gs, note_str)
    # parameters found by grid search, binarizing breed, color, and sex
    dog_params = {'xgb__colsample_bytree': 0.8, 'xgb__n_estimators': 500, 'xgb__learning_rate': 0.02,
                  'xgb__subsample': 0.8, 'xgb__max_depth': 10, 'xgb__nthread': 5}
    cat_params = {'xgb__colsample_bytree': 0.8, 'xgb__n_estimators': 500, 'xgb__learning_rate': 0.02,
                  'xgb__subsample': 0.8, 'xgb__max_depth': 10, 'xgb__nthread': 5}
    # train the pipeline separately on each animal type
    param_dict = {'cat': cat_params, 'dog': dog_params}
    fitted = fit_animals(pipeline, param_dict, train)
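    # fit_animals comes from util; judging by its use below, it returns a dict mapping
    # animal type ('cat'/'dog') to a pipeline fitted with that animal's parameters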
    # load the held-out ensemble subset (training rows not used above)
    ensem = load_train(train_file, subset='ensem')
    ensem_pred_lst = []
    for animal, clf in fitted.items():
        # log loss on the data the model was trained on (optimistic; for reference only)
        X, y = train[animal]
        y_pred = clf.predict_proba(X)
        print("log loss on the training {a:} subset: {l:.4f}".format(a=animal, l=log_loss(y, y_pred)))
        # log loss on the held-out ensemble subset
        X, y = ensem[animal]
        y_pred = clf.predict_proba(X)
        print("log loss on the ensemble {a:} subset: {l:.4f}".format(a=animal, l=log_loss(y, y_pred)))
        # clf.steps[1][1] is the XGBClassifier; its classes_ give predict_proba's column order
        ensem_df = pd.DataFrame(index=y.index)
        ensem_df['animal'] = animal
        ensem_df['outcome'] = y
        ensem_df = ensem_df.reindex(columns=list(ensem_df.columns) + list(clf.steps[1][1].classes_))
        ensem_df[list(clf.steps[1][1].classes_)] = y_pred
        ensem_pred_lst.append(ensem_df)
    if pred_ensem:
        print("saving ensemble subset predictions")
        ensem_pred_df = pd.concat(ensem_pred_lst, axis=0)
        ensem_pred_df.sort_index(inplace=True)
        ensem_pred_df.to_csv(ensem_pred_file)
    if pred_test:
        print("training on the full set and saving test predictions")
        # load test data and the full training data, then refit on the full set
        test = load_test(test_file)
        full = load_train(train_file, subset='all')
        full_fit = fit_animals(pipeline, param_dict, full)
        test_pred_lst = []
        for animal, clf in full_fit.items():
            X, id_df = test[animal]
            y_pred = clf.predict_proba(X)
            test_df = id_df.reindex(columns=list(id_df.columns) + list(clf.steps[1][1].classes_))
            test_df[list(clf.steps[1][1].classes_)] = y_pred
            test_pred_lst.append(test_df)
        test_df = pd.concat(test_pred_lst, axis=0)
        test_df.sort_index(inplace=True)
        test_df.to_csv(test_pred_file, index=False)