forked from chl8856/Dynamic-DeepHit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: import_data.py
executable file
·157 lines (122 loc) · 6.19 KB
/
import_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import pandas as pd
import numpy as np
##### USER-DEFINED FUNCTIONS
def f_get_Normalization(X, norm_mode):
    '''
    Normalize each feature column of X in place (NaN-aware statistics).

    Args:
        X         : 2-D float array, shape (num_Patient, num_Feature);
                    modified in place and also returned.
        norm_mode : 'standard' -> zero mean, unit variance per column
                    'normal'   -> min-max scaling to [0, 1] per column

    Returns:
        The normalized array X (same object, mutated in place).

    Raises:
        ValueError: if norm_mode is not a supported mode (previously this
                    only printed an error and returned the data unnormalized).
    '''
    num_Patient, num_Feature = np.shape(X)
    if norm_mode == 'standard':  # zero mean unit variance
        for j in range(num_Feature):
            col_std = np.nanstd(X[:, j])
            X[:, j] = X[:, j] - np.nanmean(X[:, j])
            if col_std != 0:  # constant columns are only centered
                X[:, j] = X[:, j] / col_std
    elif norm_mode == 'normal':  # min-max normalization
        for j in range(num_Feature):
            col_min = np.nanmin(X[:, j])
            col_range = np.nanmax(X[:, j]) - col_min
            X[:, j] = X[:, j] - col_min
            if col_range != 0:  # guard against 0/0 -> NaN for constant columns
                X[:, j] = X[:, j] / col_range
    else:
        # fail loudly instead of silently returning unnormalized data
        raise ValueError("INPUT MODE ERROR! unsupported norm_mode: {}".format(norm_mode))
    return X
def f_get_fc_mask1(meas_time, num_Event, num_Category):
    '''
    Build the denominator mask ("mask3") for the conditional probability.

    Shape [N, num_Event, num_Category]: entries are 1 up to and including
    each subject's last measurement time, 0 afterwards.
    '''
    num_subjects = np.shape(meas_time)[0]
    mask = np.zeros([num_subjects, num_Event, num_Category])
    for idx in range(num_subjects):
        last_t = meas_time[idx, 0]  # last measurement time for this subject
        mask[idx, :, :int(last_t + 1)] = 1
    return mask
def f_get_fc_mask2(time, label, num_Event, num_Category):
    '''
    Build the log-likelihood mask ("mask4"), shape [N, num_Event, num_Category].

      - uncensored subject: a single 1 at (event type - 1, event time)
      - censored subject:   1's for all events strictly after the censoring
                            time (used to compute 1 - sum F)
    '''
    num_subjects = np.shape(time)[0]
    mask = np.zeros([num_subjects, num_Event, num_Category])
    for idx in range(num_subjects):
        event_type = int(label[idx, 0])
        t = time[idx, 0]
        if event_type != 0:  # event observed
            mask[idx, event_type - 1, int(t)] = 1
        else:  # censored: fill 1 from just after the censoring time onward
            mask[idx, :, int(t + 1):] = 1
    return mask
def f_get_fc_mask3(time, meas_time, num_Category):
    '''
    Build the ranking-loss mask ("mask5"), shape [N, num_Category],
    for pair-wise comparison.

      - longitudinal (meas_time is an array): 1's on the half-open window
        (last measurement time, event time] — excludes the last measurement,
        includes the event time; a shared denominator is unnecessary here.
      - single measurement (meas_time is a scalar sentinel, e.g. -1):
        1's from the start through the event time (inclusive).
    '''
    num_subjects = np.shape(time)[0]
    mask = np.zeros([num_subjects, num_Category])
    longitudinal = bool(np.shape(meas_time))  # scalar sentinel -> shape () -> False
    for idx in range(num_subjects):
        t_event = int(time[idx, 0])  # censoring/event time
        if longitudinal:
            t_last = int(meas_time[idx, 0])  # last measurement time
            mask[idx, (t_last + 1):(t_event + 1)] = 1
        else:
            mask[idx, :(t_event + 1)] = 1
    return mask
##### TRANSFORMING DATA
def f_construct_dataset(df, feat_list):
    '''
    Reshape a long-format dataframe into fixed-size per-patient arrays.

    Expected columns in df:
        id    : patient indicator
        tte   : time-to-event or time-to-censoring
                - must be synchronized based on the reference time
        times : time at which observations are measured
                - must be synchronized based on the reference time (i.e., times start from 0)
        label : event/censoring information
                - 0: censoring, 1: event type 1, 2: event type 2, ...

    Returns:
        pat_info : [num_patients, 5] array with columns
                   (id, tte, label, last measurement time, number of measurements)
        data     : [num_patients, max_meas, 1 + len(feat_list)] array;
                   channel 0 holds the time gaps between consecutive
                   measurements (delta), remaining channels hold feat_list.
    '''
    # groupby with a scalar key so get_group(scalar) stays valid in pandas >= 2.2
    # (groupby(['id']) + get_group(scalar) is deprecated/removed there)
    grouped = df.groupby('id')
    id_list = pd.unique(df['id'])
    # robust replacement for np.max(grouped.count())[0], which relied on
    # deprecated positional Series[0] indexing
    max_meas = int(grouped.size().max())
    data = np.zeros([len(id_list), max_meas, len(feat_list) + 1])
    pat_info = np.zeros([len(id_list), 5])
    for i, tmp_id in enumerate(id_list):
        tmp = grouped.get_group(tmp_id).reset_index(drop=True)
        n_meas = tmp.shape[0]
        pat_info[i, 4] = n_meas                      # number of measurements
        pat_info[i, 3] = np.max(tmp['times'])        # last measurement time
        pat_info[i, 2] = tmp['label'].iloc[0]        # cause (explicitly positional)
        pat_info[i, 1] = tmp['tte'].iloc[0]          # time-to-event
        pat_info[i, 0] = tmp['id'].iloc[0]
        data[i, :n_meas, 1:] = tmp[feat_list]
        # delta channel: gaps between consecutive measurement times
        data[i, :n_meas - 1, 0] = np.diff(tmp['times'])
    return pat_info, data
def import_dataset(norm_mode='standard'):
    '''
    Load the cleaned PBC2 dataset, normalize the continuous features, and
    build the network inputs, masks, and missingness indicator.

    Args:
        norm_mode: normalization mode forwarded to f_get_Normalization
                   ('standard' or 'normal').

    Returns:
        DIM    : (x_dim, x_dim_cont, x_dim_bin)
        DATA   : (data, time, label)
        MASK   : (mask1, mask2, mask3)
        data_mi: array matching data's shape, 1 where data was NaN
    '''
    df = pd.read_csv('./data/pbc2_cleaned.csv')

    bin_list = ['drug', 'sex', 'ascites', 'hepatomegaly', 'spiders']
    cont_list = ['age', 'edema', 'serBilir', 'serChol', 'albumin', 'alkaline', 'SGOT', 'platelets', 'prothrombin', 'histologic']
    feat_list = cont_list + bin_list

    df = df[['id', 'tte', 'times', 'label'] + feat_list]
    df_unnorm = df.copy(deep=True)  # keep an unnormalized copy
    df[cont_list] = f_get_Normalization(np.asarray(df[cont_list]).astype(float), norm_mode)

    pat_info, data = f_construct_dataset(df, feat_list)
    _, data_org = f_construct_dataset(df_unnorm, feat_list)

    # missingness indicator: 1 where a measurement was NaN, then impute 0
    data_mi = np.zeros(np.shape(data))
    nan_pos = np.isnan(data)
    data_mi[nan_pos] = 1
    data_org[nan_pos] = 0
    data[nan_pos] = 0

    x_dim = np.shape(data)[2]  # 1 + x_dim_cont + x_dim_bin (including delta)
    x_dim_cont = len(cont_list)
    x_dim_bin = len(bin_list)

    last_meas = pat_info[:, [3]]  # pat_info[:, 3] contains age at the last measurement
    label = pat_info[:, [2]]      # two competing risks
    time = pat_info[:, [1]]       # age when event occurred

    # or specifically define larger than the max tte
    num_Category = int(np.max(pat_info[:, 1]) * 1.2)
    num_Event = len(np.unique(label)) - 1  # number of distinct non-censoring labels

    if num_Event == 1:
        label[np.where(label != 0)] = 1  # make single risk

    mask1 = f_get_fc_mask1(last_meas, num_Event, num_Category)
    mask2 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask3 = f_get_fc_mask3(time, -1, num_Category)

    DIM = (x_dim, x_dim_cont, x_dim_bin)
    DATA = (data, time, label)
    # DATA = (data, data_org, time, label)
    MASK = (mask1, mask2, mask3)

    return DIM, DATA, MASK, data_mi