-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRasch_Model
226 lines (190 loc) · 8.98 KB
/
Rasch_Model
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from tqdm import tqdm
from rasch import Rasch
import pickle
def main():
    """Fit a Rasch (1PL IRT) model on the ASSISTment training file and
    pickle the fitted (student ability, question difficulty) parameters.

    The training file stores each student as three consecutive lines:
    student id, comma-separated question ids, comma-separated responses.
    """
    with open('./assist_train_for_individual_model_1111.txt', 'rt', encoding='utf8') as f:
        raw_data = f.readlines()
    student_list, whole_question_list, triplet_data = _parse_triplets(raw_data)
    model = Rasch(
        student_list=student_list,
        question_list=whole_question_list,
        score_triplet_list=triplet_data)
    model.fit()
    params = model.get_params()
    print(f"num student {len(student_list)} / num questions {len(whole_question_list)}")
    with open('irt_result.pkl', 'wb') as f:
        pickle.dump(params, f)


def _parse_triplets(raw_data):
    """Parse the 3-line-per-student format into
    (unique student ids, unique question ids, (student, question, response) triplets)."""
    container = []
    triplet_data = []
    student_list = []
    whole_question_list = []
    for idx, line in tqdm(enumerate(raw_data), desc="preprocess data"):
        if idx % 3 == 0:
            # line 1 of 3: the student id
            student = line.strip()
            container.append(student)
            student_list.append(student)
        elif idx % 3 == 1:
            # line 2 of 3: the question ids this student answered
            question_list = line.strip().split(',')
            container.append(question_list)
            whole_question_list.extend(question_list)
        else:
            # line 3 of 3: the responses, aligned index-by-index with the
            # question line; emit one triplet per (question, response) pair
            response_list = line.strip().split(',')
            container.append(response_list)
            for ques, resp in zip(container[1], container[2]):
                triplet_data.append((container[0], ques, resp))
            container.clear()
    # deduplicate ids; NOTE set() makes the ordering non-deterministic,
    # so parameter indices differ between runs (as in the original)
    return list(set(student_list)), list(set(whole_question_list)), triplet_data


if __name__ == "__main__":
    main()
import torch.nn as nn
import torch
from tqdm import tqdm
import sys
import visdom as vis
import numpy as np
# Training hyperparameters for the Rasch fit below (Adam + mini-batches).
NUM_EPOCH = 10
LEARNING_RATE = 0.0007
BATCH_SIZE = 64
def parse_resp(resp):
    """Convert a raw response token (e.g. '1', '0.0') to a float score."""
    score = float(resp)
    return score
class Loader:
    """Maps raw (student, question, response) triplets to integer-id
    triplets and exposes them through a shuffled torch DataLoader."""

    def __init__(self, student_list, question_list, score_triplet_list):
        self.student_list = student_list
        self.question_list = question_list
        # position of each id in the input list defines its integer index
        self.stud2id = {stud: i for i, stud in enumerate(student_list)}
        self.ques2id = {ques: i for i, ques in enumerate(question_list)}
        # convert string ids / responses to (int, int, float) triplets
        converted = []
        for stud, ques, resp in tqdm(score_triplet_list, desc="loading batch loader"):
            converted.append((self.stud2id[stud], self.ques2id[ques], parse_resp(resp)))
        self.score_triplet_list = converted
        self.loader = torch.utils.data.DataLoader(
            self.score_triplet_list, batch_size=BATCH_SIZE, shuffle=True)
class Rasch(nn.Module):
def __init__(
self,
student_list: list, # list of student_i
question_list: list, # list of item_j
score_triplet_list: list # list of (student_i,student_j,0 or 1)
):
super(Rasch, self).__init__()
self.score_triplet_list = score_triplet_list
self.student_list = student_list
self.question_list = question_list
self.stud2id = {item: idx for idx, item in enumerate(student_list)}
self.ques2id = {item: idx for idx, item in enumerate(question_list)}
self.num_epoch = NUM_EPOCH
self.params = nn.Embedding(len(student_list) + len(question_list), 1)
self.loader = Loader(student_list=student_list, question_list=question_list,
score_triplet_list=score_triplet_list)
# self.params = nn.Parameter(torch.zeros(len(student_list) + len(question_list)))
self.optimizer = torch.optim.Adam(
self.parameters(),
lr=LEARNING_RATE,
weight_decay=.0)
self.loss_monitor = {}
self.vis = vis.Visdom()
self.plot = None
self.last_batch_num = 0
def fit(self):
num_batch = int(len(self.score_triplet_list) / BATCH_SIZE) + 1
for epoch in range(self.num_epoch):
self.loss_monitor[epoch + 1] = []
for idx, batch in enumerate(self.loader.loader):
loss = self.forward(batch)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.loss_monitor[epoch + 1].append(loss.item())
sys.stdout.write(
'EPOCH {} | {} / {} | nll loss : {:.4f}\r'.format(epoch + 1, idx + 1, num_batch, loss.item()))
print("EPOCH {} | {} / {} | nll loss : {:.4f}\r".format(epoch + 1, idx + 1, num_batch, loss.item()))
self.plot_loss_progress(epoch + 1)
print('Finished Training with epoch {}'.format(epoch + 1))
def forward(self, batch):
stud_param = self.params(batch[0])
ques_param = self.params(len(self.student_list) + batch[1])
loglike = batch[2] * (stud_param - ques_param)
loglike -= torch.log(1 + torch.exp(stud_param - ques_param))
return -loglike.mean() # negative log likelihood
def plot_loss_progress(self, epoch):
# for epoch in range(1,len(self.loss_monitor)+1):
xrange = np.array(range(self.last_batch_num, self.last_batch_num + len(self.loss_monitor[epoch])))
if self.plot is None:
self.plot = self.vis.line(Y=self.loss_monitor[epoch], X=xrange)
else:
self.vis.line(Y=self.loss_monitor[epoch], X=xrange, update='append', win=self.plot)
self.last_batch_num += len(self.loss_monitor[epoch])
def get_params(self):
params = self.params.weight.detach().numpy()
student_params = {stud: params[idx][0] for idx, stud in enumerate(self.student_list)}
question_params = {ques: params[len(self.student_list) + idx][0] for idx, ques in
enumerate(self.question_list)}
return student_params, question_params
from tqdm import tqdm
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
def parse_resp_as_label(resp):
    """Binarize a raw response token: 1 if its float value exceeds 0.5, else 0."""
    return 1 if float(resp) > 0.5 else 0
def parse_data(data_type="skill_builder_only"):
    """Load a test split and parse it into students, questions, triplets.

    The file stores each student as three consecutive lines: student id,
    comma-separated question ids, comma-separated responses.

    Args:
        data_type: 'skill_builder_only' for the skill-builder test file,
            or 'all' for the combined test file.

    Returns:
        (unique student ids, unique question ids,
         list of (student, question, response) triplets).

    Raises:
        ValueError: if data_type is not one of the two supported values.
    """
    paths = {
        'skill_builder_only': './ASSISTment_skill_builder_only_test_1123.txt',
        'all': './assist_test_for_individual_model_1111.txt',
    }
    if data_type not in paths:
        # raise instead of assert: asserts are stripped under `python -O`
        raise ValueError("data_type should be either 'skill_builder_only' or 'all'")
    with open(paths[data_type], 'rt', encoding='utf8') as f:
        raw_data = f.readlines()
    container = []
    triplet_data = []
    student_list = []
    whole_question_list = []
    for idx, line in tqdm(enumerate(raw_data), desc="preprocess data"):
        if idx % 3 == 0:
            # line 1 of 3: the student id
            student = line.strip()
            container.append(student)
            student_list.append(student)
        elif idx % 3 == 1:
            # line 2 of 3: the question ids this student answered
            question_list = line.strip().split(',')
            container.append(question_list)
            whole_question_list.extend(question_list)
        elif idx % 3 == 2:
            # line 3 of 3: responses aligned with the question line
            response_list = line.strip().split(',')
            container.append(response_list)
            if len(container) >= 3:
                for ques, resp in zip(container[1], container[2]):
                    triplet_data.append((container[0], ques, resp))
                container.clear()
    student_list = list(set(student_list))
    whole_question_list = list(set(whole_question_list))
    return student_list, whole_question_list, triplet_data
def test(dataset, irt_result):
    ''' Evaluate fitted Rasch parameters on held-out triplets.

    dataset is a list of (student, question, response); irt_result is the
    (student ability dict, question difficulty dict) pair produced by
    Rasch.get_params(). Returns (accuracy, roc auc) over the triplets whose
    student AND question were both seen at training time; the rest are
    skipped and counted.
    '''
    stud_param, ques_param = irt_result
    pred = []
    true = []
    pass_cnt = 0
    for stud, ques, resp in dataset:
        # only KeyError (unseen student/question) is expected here; the
        # original bare `except:` silently hid every other failure too
        try:
            logit = stud_param[stud] - ques_param[ques]
        except KeyError:
            pass_cnt += 1
            continue
        # Rasch probability of a correct response: sigmoid(theta - b)
        pred.append(np.exp(logit) / (1 + np.exp(logit)))
        true.append(parse_resp_as_label(resp))
    pred, true = np.array(pred), np.array(true)
    # np.int was removed in NumPy 1.24; the builtin int is the intended dtype
    acc = accuracy_score(true, (pred > 0.5).astype(int))
    auc = roc_auc_score(true, pred)
    print(f"missed {pass_cnt} items due to key errors!")
    return acc, auc
def main():
    """Score the pickled IRT parameters on the combined test split."""
    _, _, triplet_data = parse_data('all')
    with open('irt_result.pkl', 'rb') as f:
        irt_result = pickle.load(f)
    acc, auc = test(triplet_data, irt_result)
    print("Skill Builder and Non Skill Builder combined \nACC : {:.6f} | AUC : {:.6f}".format(acc, auc))


if __name__ == "__main__":
    main()