forked from chl8856/DeepIMV
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_data.py
executable file
·63 lines (42 loc) · 1.83 KB
/
import_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import numpy as np
import pandas as pd
import random
## all samples MUST include at least one view.
def import_incomplete_handwritten(filename='./data/Handwritten_Missing/data_with_missingviews.npz'):
    """Load the incomplete multi-view Handwritten dataset and impute missing views.

    The archive must provide the entries ``'X_set'`` (the per-view feature
    matrices, indexable by ``0 .. M-1``) and ``'Y_onehot'`` (the labels).
    A sample counts as missing in a view when *every* feature of its row in
    that view is NaN; such rows are overwritten in place with the feature-wise
    mean of the rows that are available in the same view.

    Args:
        filename: Path of the ``.npz`` archive. Defaults to the location the
            loader has always used, so existing callers are unaffected.

    Returns:
        A tuple ``(X_set, Y_onehot, Mask)`` where ``X_set`` is the imputed
        view collection, ``Y_onehot`` is returned exactly as stored, and
        ``Mask`` is a float array of shape ``(num_samples, M)`` holding 1 for
        available and 0 for missing views.
    """
    # NOTE: allow_pickle=True unpickles the stored object array and can run
    # arbitrary code -- only load archives from a trusted source.
    # The context manager closes the underlying file handle once the arrays
    # have been materialised (the original left it open).
    with np.load(filename, allow_pickle=True) as npz:
        # presumably a dict {view_idx: ndarray} or a list of ndarrays -- the
        # code below only needs len() and integer indexing; verify against data.
        X_set = npz['X_set'].tolist()
        Y_onehot = npz['Y_onehot']

    M = len(X_set)

    ### Construct Mask Vector to indicate available (m=1) or missing (m=0) values
    Mask = np.ones([np.shape(X_set[0])[0], M])
    for m_idx in range(M):
        view = X_set[m_idx]

        # A view is missing for a sample only if the whole row is NaN.
        Mask[np.isnan(view).all(axis=1), m_idx] = 0

        # Mean-impute the missing rows from the observed rows of this view.
        observed = Mask[:, m_idx] == 1
        view[~observed] = np.mean(view[observed], axis=0)

    return X_set, Y_onehot, Mask
def _binary_onehot(labels):
    """Return an (N, 2) one-hot array: label 0 -> column 0, label 1 -> column 1."""
    onehot = np.zeros([np.shape(labels)[0], 2])
    onehot[labels == 0, 0] = 1
    onehot[labels == 1, 1] = 1
    return onehot


def import_dataset_TCGA(year=1, filename=None):
    """Load the incomplete multi-view TCGA dataset, split by view completeness.

    The archive must provide ``'m'`` (the sample-by-view availability mask),
    ``'y'`` (binary labels) and one entry ``'x1' .. 'xM'`` per view, where
    ``M`` is the number of mask columns. Within each view, rows whose first
    feature is NaN are overwritten with the feature-wise NaN-ignoring mean of
    that view. Samples are then split into those with every view available
    ("complete") and the rest ("incomplete").

    Args:
        year: Selects the default archive
            (``incomplete_multi_view_pca_{year}yr.npz``); ignored when
            ``filename`` is given.
        filename: Optional explicit path of the ``.npz`` archive. When
            ``None`` (default) the original hard-coded location is used.

    Returns:
        A tuple ``(X_set_comp, Y_onehot_comp, Mask_comp, X_set_incomp,
        Y_onehot_incomp, Mask_incomp)``. The ``X_set_*`` values are dicts
        keyed by view index ``0 .. M-1``; the ``Y_onehot_*`` values are
        two-column one-hot encodings of the labels.
    """
    if filename is None:
        filename = '/media/vdslab/Genomics/TCGA/dataset/FINAL/cleaned/incomplete_multi_view_pca_{}yr.npz'.format(int(year))

    # Use the archive as a context manager so its file handle is released.
    with np.load(filename) as npz:
        Mask = npz['m']
        Y = npz['y']
        M = np.shape(Mask)[1]

        X_set = {}
        for m in range(M):
            tmp = npz['x{}'.format(m + 1)]
            # A NaN in the first column marks the whole row as missing.
            tmp[np.isnan(tmp[:, 0]), :] = np.nanmean(tmp, axis=0)
            X_set[m] = tmp

    # A sample is complete when all M views are available. The original
    # compared against a literal 4, which is only right for four views.
    is_complete = np.sum(Mask, axis=1) == M
    is_incomplete = ~is_complete

    X_set_comp = {}
    X_set_incomp = {}
    for m in range(M):
        X_set_comp[m] = X_set[m][is_complete]
        X_set_incomp[m] = X_set[m][is_incomplete]

    Y_onehot_comp = _binary_onehot(Y[is_complete])
    Y_onehot_incomp = _binary_onehot(Y[is_incomplete])

    Mask_comp = Mask[is_complete]
    Mask_incomp = Mask[is_incomplete]

    return X_set_comp, Y_onehot_comp, Mask_comp, X_set_incomp, Y_onehot_incomp, Mask_incomp