""" Alpha version of a version of ELLA that plays nicely with sklearn
@author: Paul Ruvolo
"""
import numpy as np
from scipy.special import logsumexp
from scipy.linalg import sqrtm, inv
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression, Lasso
from sklearn.metrics import accuracy_score, explained_variance_score
class ELLA(object):
""" The ELLA model """
    def __init__(self, d, k, base_learner, base_learner_kwargs = {}, mu = 1, lam = 1, k_init = False):
        """ Initializes a new model for the given base_learner.
        d: the number of parameters for the base learner
        k: the number of latent model components
        base_learner: the base learner to use (currently can only be
                      LinearRegression, Ridge, or LogisticRegression).
        base_learner_kwargs: keyword arguments to the base learner (for instance to
                             specify regularization strength)
        mu: the L_1 penalty applied to the sparse task-specific coefficients
        lam: the L_2 penalty applied to the latent components L
        k_init: if True, each of the first k tasks is assigned exactly one
                latent component rather than being sparse coded
        NOTE: currently only binary logistic regression is supported
        """
self.d = d
self.k = k
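        # L: d x k matrix of latent model components (the shared basis)
        # A, b: running sufficient statistics used for the closed-form update of L
        # S: k x T matrix whose columns are the sparse task-specific coefficients
        # T: number of tasks seen so far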
self.L = np.random.randn(d,k)
self.A = np.zeros((d * k, d * k))
self.b = np.zeros((d * k, 1))
self.S = np.zeros((k, 0))
self.T = 0
self.mu = mu
self.lam = lam
self.k_init = k_init
if base_learner in [LinearRegression, Ridge]:
self.perf_metric = explained_variance_score
elif base_learner in [LogisticRegression]:
self.perf_metric = accuracy_score
else:
raise Exception("Unsupported Base Learner")
self.base_learner = base_learner
self.base_learner_kwargs = base_learner_kwargs
    def fit(self, X, y, task_id):
        """ Fit the model to a new batch of training data.  The task_id must
        start at 0 and increase by one each time this function is called.
        Currently you cannot add new data to old tasks.
        X: the training data
        y: the training labels
        task_id: the id of the task
        """
self.T += 1
single_task_model = self.base_learner(fit_intercept = False, **self.base_learner_kwargs).fit(X, y)
D_t = self.get_hessian(single_task_model, X, y)
D_t_sqrt = sqrtm(D_t)
theta_t = single_task_model.coef_
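        # Sparse-code the single-task parameters theta_t against the current
        # basis: reweighting the dictionary L and the target theta_t by D_t^(1/2)
        # turns ELLA's second-order approximation of the task objective into a
        # standard Lasso problem in the task coefficients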
sparse_encode = Lasso(alpha = self.mu / (X.shape[0] * 2.0),
fit_intercept = False).fit(D_t_sqrt.dot(self.L),
D_t_sqrt.dot(theta_t.T))
if self.k_init and task_id < self.k:
sparse_coeffs = np.zeros((self.k,))
sparse_coeffs[task_id] = 1.0
else:
sparse_coeffs = sparse_encode.coef_
self.S = np.hstack((self.S, np.matrix(sparse_coeffs).T))
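        # Accumulate the running sufficient statistics A and b, then refit the
        # shared basis in closed form: vec(L) = (A / T + lam * I)^(-1) (b / T)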
self.A += np.kron(self.S[:,task_id].dot(self.S[:,task_id].T), D_t)
self.b += np.kron(self.S[:,task_id].T, np.mat(theta_t).dot(D_t)).T
L_vectorized = inv(self.A / self.T + self.lam * np.eye(self.d * self.k, self.d * self.k)).dot(self.b) / self.T
self.L = L_vectorized.reshape((self.k, self.d)).T
self.revive_dead_components()
def revive_dead_components(self):
""" re-initailizes any components that have decayed to 0 """
for i,val in enumerate(np.sum(self.L, axis = 0)):
if abs(val) < 10 ** -8:
self.L[:, i] = np.random.randn(self.d,)
    def predict(self, X, task_id):
        """ Output ELLA's predictions for the specified data on the specified
            task_id.  If using a continuous model (Ridge and LinearRegression)
            the result is the predicted value.  If using a classification model
            (LogisticRegression) the output is the predicted class label
            (i.e., whether the predicted probability exceeds 0.5).
        """
if self.base_learner == LinearRegression or self.base_learner == Ridge:
return X.dot(self.L.dot(self.S[:, task_id]))
elif self.base_learner == LogisticRegression:
return 1. / (1.0 + np.exp(-X.dot(self.L.dot(self.S[:, task_id])))) > 0.5
    def predict_probs(self, X, task_id):
        """ Output ELLA's predicted probabilities for the specified data on the
            specified task_id.  Only classification models (LogisticRegression)
            are supported; continuous models raise an exception.
        """
if self.base_learner == LinearRegression or self.base_learner == Ridge:
raise Exception("This base learner does not support predicting probabilities")
elif self.base_learner == LogisticRegression:
return np.exp(self.predict_logprobs(X, task_id))
    def predict_logprobs(self, X, task_id):
        """ Output ELLA's predicted log probabilities for the specified data on
            the specified task_id.  Only classification models (LogisticRegression)
            are supported; continuous models raise an exception.
        """
if self.base_learner == LinearRegression or self.base_learner == Ridge:
raise Exception("This base learner does not support predicting probabilities")
elif self.base_learner == LogisticRegression:
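            # log P(y = 1 | x) = -log(1 + exp(-x . w)) with w = L s_t, computed
            # stably via logsumexp over [0, -x . w]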
return -logsumexp(np.hstack((np.zeros((X.shape[0], 1)), -X.dot(self.L.dot(self.S[:, task_id])))), axis = 1)
def score(self, X, y, task_id):
""" Output the score for ELLA's model on the specified testing data.
If using a continuous model (Ridge and LinearRegression)
the score is explained variance. If using a classification model
(LogisticRegression) the score is accuracy.
"""
        return self.perf_metric(y, self.predict(X, task_id))
    def get_hessian(self, model, X, y):
        """ ELLA requires that each single task learner provide the Hessian
            of the loss function evaluated around the optimal single task
            parameters.  This function implements this for the base learners
            that are currently supported """
theta_t = model.coef_
if self.base_learner == LinearRegression:
return X.T.dot(X)/(2.0 * X.shape[0])
elif self.base_learner == Ridge:
return X.T.dot(X)/(2.0 * X.shape[0]) + model.alpha * np.eye(self.d, self.d)
elif self.base_learner == LogisticRegression:
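            # Hessian of the logistic loss, X^T diag(p * (1 - p)) X, scaled by
            # 1 / (2n), plus an L2 term from sklearn's regularization strength C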
preds = 1. / (1.0 + np.exp(-X.dot(theta_t.T)))
base = np.tile(preds * (1 - preds), (1, X.shape[1]))
hessian = (np.multiply(X, base)).T.dot(X) / (2.0 * X.shape[0])
return hessian + np.eye(self.d,self.d) / (2.0 * model.C)
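

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original module):
    # fit ELLA with a LinearRegression base learner on a few synthetic regression
    # tasks whose true parameter vectors share a low-dimensional structure.
    # The dimensions and hyperparameter values below are arbitrary demo choices.
    np.random.seed(0)
    d, k, n_tasks, n_train, n_test = 10, 3, 5, 80, 20
    L_true = np.random.randn(d, k)

    model = ELLA(d, k, LinearRegression, mu = 1e-3, lam = 1e-3)
    test_sets = []
    for t in range(n_tasks):
        s_true = np.random.randn(k)
        X = np.random.randn(n_train + n_test, d)
        y = X.dot(L_true.dot(s_true)) + 0.1 * np.random.randn(n_train + n_test)
        model.fit(X[:n_train], y[:n_train], t)
        test_sets.append((X[n_train:], y[n_train:], t))

    for X_test, y_test, t in test_sets:
        # predict may return an np.matrix (the internals use np.matrix), so
        # flatten to a plain array before scoring
        y_hat = np.asarray(model.predict(X_test, t)).ravel()
        print("task %d explained variance: %.3f" % (t, explained_variance_score(y_test, y_hat)))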