make_rscorer.py
# Fix the global random seeds before importing the rest of the stack.
fixed_seed = 1
import random
import numpy as np
random.seed(fixed_seed)
np.random.seed(fixed_seed)
import os
import logging
import argparse
import pandas as pd
from helpers.utils import init_logger
from helpers.data import get_scaler
from models.data import IHDP, JOBS, TWINS, NEWS
from models.scorers import RScorerWrapper


def get_parser():
    parser = argparse.ArgumentParser()
    # General
    parser.add_argument('--data_path', type=str)
    parser.add_argument('--dtype', type=str, choices=['ihdp', 'jobs', 'news', 'twins'])
    parser.add_argument('--sf', dest='splits_file', type=str)
    parser.add_argument('--iters', type=int, default=-1)
    parser.add_argument('-o', type=str, dest='output_path', default='./')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--scaler', type=str, choices=['minmax', 'std'], default='std')
    parser.add_argument('--cv', type=int, default=5)
    parser.add_argument('--n_jobs', type=int, default=-1)
    # Estimation
    parser.add_argument('--bm', dest='base_model', type=str,
                        choices=['l1', 'l2', 'tr', 'dt', 'rf', 'et', 'kr', 'cb', 'lgbm'], default='l1')
    return parser
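
# Example invocation (illustrative only; the dataset and splits paths below are
# placeholders, not taken from the repository):
#   python make_rscorer.py --data_path ./datasets/ihdp --dtype ihdp \
#       --sf ./splits/ihdp_splits.npz --iters 10 -o ./results --bm l1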


def get_dataset(name, path, iters):
    result = None
    if name == 'ihdp':
        result = IHDP(path, iters)
    elif name == 'jobs':
        result = JOBS(path, iters)
    elif name == 'twins':
        result = TWINS(path, iters, static_splits=True)
    elif name == 'news':
        result = NEWS(path, iters, static_splits=True)
    else:
        raise ValueError('Unknown dataset type selected.')
    return result


def scale_x(X_train, X_test, opt, cont_vars):
    # Fit the scaler on the training split only, then apply it to the held-out split.
    scaler_x = get_scaler(opt.scaler)
    if cont_vars:
        # Scale only continuous features.
        X_train[:, cont_vars] = scaler_x.fit_transform(X_train[:, cont_vars])
        X_test[:, cont_vars] = scaler_x.transform(X_test[:, cont_vars])
    else:
        X_train = scaler_x.fit_transform(X_train)
        X_test = scaler_x.transform(X_test)
    return X_train, X_test


if __name__ == "__main__":
    parser = get_parser()
    options = parser.parse_args()
    # Check if the output folder exists and create it if necessary.
    if not os.path.isdir(options.output_path):
        os.mkdir(options.output_path)
    # Initialise the logger (writes simultaneously to a file and the console).
    init_logger(options)
    logging.debug(options)
    # Pre-computed CV splits with shape (iters, folds, idx).
    splits = np.load(options.splits_file, allow_pickle=True)
    n_iters = options.iters if options.iters > 0 else splits.shape[0]
    dataset = get_dataset(options.dtype, options.data_path, n_iters)
    scorer = RScorerWrapper(options)
    base_scores_val = []
    base_scores_test = []
    # Data iterations
    for i in range(n_iters):
        train, test = dataset._get_train_test(i)
        X_tr, t_tr, y_tr = dataset.get_xty(train)
        X_test, t_test, y_test = dataset.get_xty(test)
        # CV iterations
        for k, (train_idx, valid_idx) in enumerate(zip(splits['train'][i], splits['valid'][i])):
            train_idx = train_idx.astype(int)
            valid_idx = valid_idx.astype(int)
            X_tr_fold = X_tr[train_idx]
            X_val_fold, t_val_fold, y_val_fold = X_tr[valid_idx], t_tr[valid_idx], y_tr[valid_idx]
            # Scale train/val AFTER the split.
            X_tr_fold, X_val_fold = scale_x(X_tr_fold, X_val_fold, options, dataset.contfeats)
            (Y_res, T_res), base_score_val = scorer.run(X_val_fold, t_val_fold, y_val_fold)
            np.savez_compressed(os.path.join(options.output_path, f'rs_{options.base_model}_iter{i+1}_fold{k+1}'),
                                y_res=Y_res, t_res=T_res)
            base_scores_val.append([i+1, k+1, base_score_val])
        # Scale train/test.
        X_tr_scaled, X_test_scaled = scale_x(X_tr, X_test, options, dataset.contfeats)
        (Y_res, T_res), base_score_test = scorer.run(X_test_scaled, t_test, y_test)
        np.savez_compressed(os.path.join(options.output_path, f'rs_{options.base_model}_iter{i+1}'),
                            y_res=Y_res, t_res=T_res)
        base_scores_test.append([i+1, base_score_test])
    # Persist the validation and test base scores as CSV files.
    pd.DataFrame(base_scores_val, columns=['iter_id', 'fold_id', 'base_score']).to_csv(
        os.path.join(options.output_path, f'rs_{options.base_model}_base_scores.csv'), index=False)
    pd.DataFrame(base_scores_test, columns=['iter_id', 'base_score']).to_csv(
        os.path.join(options.output_path, f'rs_{options.base_model}_base_scores_test.csv'), index=False)
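
# Example (sketch, not part of the original script): loading the artifacts written above
# for inspection. File names assume the default output path './', the default base model
# 'l1', iteration 1 and fold 1; adjust to your run.
#
#   res = np.load('rs_l1_iter1_fold1.npz')
#   y_res, t_res = res['y_res'], res['t_res']           # outcome / treatment residuals
#   val = pd.read_csv('rs_l1_base_scores.csv')          # columns: iter_id, fold_id, base_score
#   test = pd.read_csv('rs_l1_base_scores_test.csv')    # columns: iter_id, base_score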