-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.py
140 lines (111 loc) · 5.13 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Data loader for per-record time series.
# Each record's matrices are stored in dicts keyed by the record's row position in `outcomes`.
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
import utils
class PhysioNET(Dataset):
    """Per-record time-series dataset with missingness masks and time gaps.

    For every row of ``outcomes`` the constructor builds four arrays of shape
    ``(timesteps, features)`` and stores them in dicts keyed by the row
    position (0, 1, 2, ...):

    * ``x``     -- feature values, standardised; missing entries stay NaN.
    * ``delta`` -- accumulated time gaps between consecutive rows, in the
      unit returned by ``utils.timestamp2minute`` (presumably minutes).
    * ``m``     -- missingness indicator: 1 where the value is NaN, else 0.
    * ``x_obs`` -- at missing positions, the last value seen for that
      feature (or the feature mean if none was seen yet); at observed
      positions 0. The array is standardised afterwards.

    ``labels`` holds the ``'In-hospital_death'`` column of ``outcomes``.
    Only the records listed in ``active_indices`` are exposed through
    ``len()`` and indexing.
    """

    # Added to the standard deviation before dividing so that constant
    # features do not cause a division by zero.
    _EPS = 1e-3

    def __init__(self, data, outcomes, num_features=33, means=None, stds=None):
        """
        :param data: DataFrame that can be looked up with ``.loc`` by the
            values of ``outcomes['RecordID']`` and has a ``'time'`` column.
            Column 0 is skipped when computing statistics (presumably the
            ``'time'`` column -- confirm against the CSV layout).
        :param outcomes: DataFrame with ``'RecordID'`` and
            ``'In-hospital_death'`` columns, one row per record.
        :param num_features: number of leading feature columns for which
            ``delta`` and ``x_obs`` are computed.
        :param means: per-feature means; computed from the data when None.
        :param stds: per-feature standard deviations; computed when None.
        """
        self.outcomes = outcomes
        self.x = {}
        self.delta = {}
        self.m = {}
        self.x_obs = {}
        self.active_indices = list(range(len(outcomes)))

        # get only data in records
        self.data = data.loc[outcomes['RecordID']]

        # if not given, compute empirical mean and std
        self.means, self.stds = self._resolve_stats(means, stds)

        for i, record_id in enumerate(outcomes['RecordID']):
            partial_df = self.data.loc[[record_id]]
            # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
            raw = partial_df.drop(['time'], axis=1).values
            missing = np.isnan(raw).astype(int)
            timestamps = np.array(
                [utils.timestamp2minute(s) for s in partial_df['time']]
            )
            filled = self._carry_forward(raw, missing, self.means, num_features)

            self.m[i] = missing
            self.delta[i] = self._time_gaps(timestamps, missing, num_features)
            # normalize
            self.x[i] = self._standardize(raw)
            self.x_obs[i] = self._standardize(filled)

        self.labels = outcomes['In-hospital_death'].values

    def _resolve_stats(self, means, stds):
        """Return ``(means, stds)`` as float arrays, computing any that is None.

        Missing statistics are taken over the rows of ``self.data`` that
        belong to the currently active records, skipping column 0.
        """
        if means is None or stds is None:
            record_ids = self.outcomes.iloc[self.active_indices]['RecordID']
            feature_frame = self.data.loc[record_ids].iloc[:, 1:]
            if means is None:
                means = feature_frame.mean().values
            if stds is None:
                stds = feature_frame.std().values
        return np.asarray(means, dtype=float), np.asarray(stds, dtype=float)

    def _standardize(self, values):
        """Standardise ``values`` with the dataset's current mean and std."""
        return (values - self.means) / (self.stds + self._EPS)

    @staticmethod
    def _time_gaps(timestamps, missing, num_features):
        """Compute the ``delta`` matrix for one record.

        Row 0 is all zeros. For ``t > 0`` each entry is the time elapsed
        since the previous row, plus the previous row's delta when the
        previous row's mask entry is 0 (value present).
        """
        # NOTE(review): the GRU-D formulation accumulates the gap while the
        # previous value was *missing*; with m == 1 meaning "missing" this
        # condition looks inverted. Behaviour is kept as-is -- confirm
        # against the consuming model before changing it.
        gaps = np.zeros(missing.shape)
        for t in range(1, len(missing)):
            step = timestamps[t] - timestamps[t - 1]
            prev_present = missing[t - 1, :num_features] == 0
            gaps[t, :num_features] = step + np.where(
                prev_present, gaps[t - 1, :num_features], 0.0
            )
        return gaps

    @staticmethod
    def _carry_forward(values, missing, means, num_features):
        """Compute the un-normalised ``x_obs`` matrix for one record.

        Missing entries receive the last value seen for that feature, or
        the feature mean when nothing has been seen yet. Observed entries
        are left at 0.
        """
        filled = np.zeros(missing.shape)
        present = missing[:, :num_features] == 0
        fallback = np.asarray(means, dtype=float)[:num_features]
        # NaN marks "never observed". Observed values are never NaN because
        # the mask is derived from np.isnan, so an observed 0.0 is carried
        # forward instead of being mistaken for "no observation".
        last_seen = np.full(present.shape[1], np.nan)
        for t in range(len(missing)):
            last_seen = np.where(present[t], values[t, :num_features], last_seen)
            candidate = np.where(np.isnan(last_seen), fallback, last_seen)
            filled[t, :num_features] = np.where(present[t], 0.0, candidate)
        return filled

    def __len__(self):
        """Return the number of currently active records."""
        return len(self.active_indices)

    def set_active_indices(self, new_indices, new_means=None, new_stds=None):
        """Select the records exposed by the dataset and re-normalise.

        :param new_indices: row positions (into ``outcomes``) to expose.
        :param new_means: per-feature means to normalise with; computed from
            the newly active records when None.
        :param new_stds: per-feature standard deviations; computed from the
            newly active records when None.
        :return: None
        """
        self.active_indices = new_indices
        # recompute mean, std
        new_means, new_stds = self._resolve_stats(new_means, new_stds)

        # Undo the old normalisation with the same epsilon-padded scale that
        # was used to apply it, then apply the new one.
        old_means = np.asarray(self.means, dtype=float)
        old_scale = np.asarray(self.stds, dtype=float) + self._EPS
        new_scale = new_stds + self._EPS

        # Re-normalise every stored record, not only the active ones, so all
        # arrays stay consistent with self.means / self.stds across repeated
        # calls and duplicate indices are not transformed twice.
        for store in (self.x, self.x_obs):
            for key in list(store):
                store[key] = (store[key] * old_scale + old_means - new_means) / new_scale

        self.means = new_means
        self.stds = new_stds
        return None

    def __getitem__(self, key):
        """
        :param key: position within ``active_indices``
        :return: ``(x, delta, m, x_obs), label`` for the selected record
        """
        real_key = self.active_indices[key]
        matrices = (
            self.x[real_key],
            self.delta[real_key],
            self.m[real_key],
            self.x_obs[real_key],
        )
        return matrices, self.labels[real_key]
# Batch collation: pads every sample in a batch with zeros up to the longest sequence.
def collate_batch(batch):
    """Zero-pad a list of samples to a common length and stack them.

    :param batch: sequence of ``((x, delta, m, x_obs), label)`` items, where
        the four matrices of one sample share the same number of rows
    :return: ``([x, delta, m, x_obs, lengths], labels)``; the four matrices
        are float tensors of shape ``(batch, max_len, features)``,
        ``lengths`` holds the unpadded row count of every sample and
        ``labels`` is a long tensor
    """
    # Row counts are taken from the first matrix of every sample.
    seq_lens = [len(sample[0][0]) for sample in batch]
    longest = max(seq_lens)
    feature_count = batch[0][0][0].shape[1]

    stacked = []
    for slot in range(4):
        padded_rows = []
        for sample, seq_len in zip(batch, seq_lens):
            # Append all-zero rows so that every sample reaches `longest`.
            filler = torch.zeros(longest - seq_len, feature_count, dtype=torch.float32)
            padded_rows.append(torch.cat([torch.FloatTensor(sample[0][slot]), filler]))
        stacked.append(torch.stack(padded_rows))

    stacked.append(torch.LongTensor(np.array(seq_lens)))
    targets = torch.LongTensor([int(sample[1]) for sample in batch])
    return stacked, targets
if __name__ == "__main__":
    # Manual smoke test: load the measurements and the outcome table from
    # their hard-coded relative paths and build the dataset once.
    # NOTE(review): PhysioNET looks rows up with `.loc` using RecordID
    # values, while this frame is read with the default index -- confirm
    # whether the CSV needs to be loaded with RecordID as its index.
    df = pd.read_csv('./all_data.csv')
    outcomes = pd.read_csv('./set-a/Outcomes-a.txt')
    dataset = PhysioNET(data=df, outcomes=outcomes)