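"""
experiments.py

Runs one repetition of the HPO experiment on an OpenML dataset: Random Search,
Bayesian Optimization, and Bayesian Optimization with Thresholdout are fitted on
the same train/validation/test split, and their scores are stored as a pickle.
"""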
import os
import pickle
import random
import shutil
import warnings

import numpy as np
import openml
from sklearn.model_selection import train_test_split

from bo import BayesianOptimization
from random_search import RandomSearch

warnings.filterwarnings('ignore')

if not os.path.exists('results'):
    os.mkdir('results')
class Config:
    """
    Class to store configurations
    """
    def __init__(self):
        self.out_dir = 'results/openml'
        self.validation_samples = 500
        self.iterations = 250
        self.debugging = False
        self.noise_rate = 0.25
        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)
        if os.path.exists('smac3_output'):
            shutil.rmtree('smac3_output')


config = Config()
def custom_split(X, y, seed=0):
    """
    Shuffle a dataset and split it into train, validation, and test sets.
    """
    # Negative regression labels cause errors, so shift all labels up by the minimum
    if min(y) < 0:
        min_label = min(y)
        y = [x - min_label for x in y]
    # Add the label to the dataframe and shuffle
    X['label'] = y
    df = X.sample(frac=1.0)
    df = df.reset_index(drop=True)  # reset_index returns a copy; the result must be reassigned
    # Separate the label and drop columns that contain only None values
    y = df['label']
    cols_to_drop = [c for c in X.columns if list(set(X[c])) == [None]] + ['label']
    X = df.drop(columns=cols_to_drop, inplace=False)
    if X.shape[0] < 5 * config.validation_samples:
        # Small dataset: simple 0.4/0.2/0.4 train/validation/test split
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.6, random_state=seed, shuffle=True)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2 / 3, random_state=seed,
                                                        shuffle=True)
    else:
        # Large dataset: 2 * validation_samples for training, validation_samples for validation, rest for testing
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=2 * config.validation_samples,
                                                            random_state=seed, shuffle=True)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=config.validation_samples,
                                                        random_state=seed, shuffle=True)
    # For extremely large datasets, keep only the first 100,000 instances for the test set
    if X_test.shape[0] > 100000:
        X_test, y_test = X_test[:100000], y_test[:100000]
    return X_train, X_val, X_test, y_train, y_val, y_test
def seed_everything(seed: int) -> None:
    """
    Function to seed random and numpy
    :param seed: Seed to be used
    """
    np.random.seed(seed)
    random.seed(seed)
def draw_seed() -> int:
    """
    Function to draw a random seed that is valid for numpy (seeds must fit in 32 bits)
    :return: Seed
    """
    return random.randint(1000000, 2 ** 32 - 1)
def reset_smac() -> None:
    """
    Function to remove smac3 history to ensure separate runs
    """
    if os.path.exists('smac3_output'):
        shutil.rmtree('smac3_output')
def run_experiment_on_dataset(dataset_id, cv=None, problem_type='binary'):
    """
    Main function that performs the experiment on a single dataset
    :param dataset_id: ID of the OpenML dataset to be used
    :param cv: Number of folds for cross-validation. If None, the holdout method is used
    :param problem_type: binary/multiclass/regression
    """
    seed = draw_seed()
    seed_everything(seed)
    # Retrieve the dataset by its ID
    dataset = openml.datasets.get_dataset(dataset_id)
    # Get the dataset's data (features and labels)
    X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
    # Map classes to integers for classification problems
    if problem_type != 'regression':
        classes = list(set(y))
        class_map = dict(zip(classes, range(len(classes))))
        y = [class_map[x] for x in y]
    # Split the data
    X_train, X_val, X_test, y_train, y_val, y_test = custom_split(X, y, seed=seed)
    print(f"\nRunning seed {seed} on ds {dataset_id} with shapes ({len(X_train)}, {len(X_val)}, {len(X_test)}) "
          f"and {X_train.shape[1]} features for cv={cv}")
    # Initialize the HPO algorithms
    rs = RandomSearch(iterations=config.iterations, verbose=config.debugging, problem_type=problem_type, cv=cv)
    bo = BayesianOptimization(iterations=config.iterations, problem_type=problem_type, verbose=config.debugging,
                              prevention=None, cv=cv, seed=seed)
    bo_thresholdout = BayesianOptimization(iterations=config.iterations, verbose=config.debugging,
                                           prevention='thresholdout', tho_noise=config.noise_rate,
                                           seed=seed, problem_type=problem_type, cv=cv)
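    # Note: the 'thresholdout' prevention appears to implement the reusable-holdout
    # (Thresholdout) mechanism of Dwork et al. (2015), which answers validation-set
    # queries with added noise (tho_noise) to limit overfitting from adaptive reuse
    # of the validation set.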
    # Fit all HPO algorithms
    rs.fit(X_train, y_train, X_val, y_val, X_test, y_test)
    bo.fit(X_train, y_train, X_val, y_val, X_test, y_test)
    bo_thresholdout.fit(X_train, y_train, X_val, y_val, X_test, y_test)
    # Store results in a dictionary
    results = {'dataset_id': dataset_id,
               'seed': seed,
               'iterations': config.iterations,
               'rs_public': rs.public_scores,
               'rs_private': rs.private_scores,
               'rs_configs': rs.configs,
               'bo_public': bo.public_scores,
               'bo_private': bo.private_scores,
               'bo_configs': bo.configs,
               'tho_public': bo_thresholdout.public_scores,
               'tho_private': bo_thresholdout.private_scores,
               'tho_configs': bo_thresholdout.configs,
               'train_val_test': (len(X_train), len(X_val), len(X_test)),
               'n_features': X_train.shape[1]}
    # Store the results as a pickle
    with open(f'{config.out_dir}/{dataset_id}_{seed}_val-{X_val.shape[0]}_cv-{cv}.pkl', 'wb') as f:
        pickle.dump(results, f)
    reset_smac()
if __name__ == '__main__':
    # Example usage: perform one repetition of the experiment on the adult dataset
    run_experiment_on_dataset(1590, cv=None, problem_type='binary')
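    # Illustrative variations (not part of the original example), using the cv and
    # problem_type options documented in run_experiment_on_dataset:
    # run_experiment_on_dataset(1590, cv=5, problem_type='binary')  # 5-fold CV instead of holdout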