forked from NodLabs/tensorflow-dlrm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataloader.py
127 lines (91 loc) · 4.78 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import tensorflow as tf
import numpy as np
from io import BytesIO
from tensorflow.python.lib.io import file_io
def load_amazon_book(dataset_folder='dataset/'):
    """Load the Amazon Books recommendation dataset.

    Args:
        dataset_folder: Root folder containing the ``amazon/`` subdirectory.

    Returns:
        dict with keys:
            total_users (int), total_items (int),
            train_data / val_data / test_data (ndarrays of user interactions),
            item_features (float32 ndarray of shape (total_items, 4096)),
            user_features (ndarray of per-user category features).
    """
    raw_data = dict()
    raw_data['total_users'] = 99473
    raw_data['total_items'] = 450166
    raw_data['train_data'] = np.load(dataset_folder + 'amazon/user_data_train.npy')
    raw_data['val_data'] = np.load(dataset_folder + 'amazon/user_data_val.npy')
    raw_data['test_data'] = np.load(dataset_folder + 'amazon/user_data_test.npy')
    # BUG FIX: the original indexed raw_data['max_item'], a key that is never
    # assigned in this function, so this line always raised KeyError.  The
    # memmap stores one 4096-dim feature row per item, so the row count is
    # total_items.  np.array(...) copies the read-only memmap into RAM.
    raw_data['item_features'] = np.array(np.memmap(dataset_folder + 'amazon/book_features_update.mem',
                                                   dtype=np.float32, mode='r',
                                                   shape=(raw_data['total_items'], 4096)))
    raw_data['user_features'] = np.load(dataset_folder + 'amazon/user_features_categories.npy')
    return raw_data
def load_citeulike(dataset_folder='dataset/'):
    """Load the CiteULike dataset splits.

    Args:
        dataset_folder: Root folder containing the ``citeulike/`` subdirectory.

    Returns:
        dict with total_users, total_items, and the train/val/test
        interaction arrays loaded from disk.
    """
    split_files = {
        'train_data': 'citeulike/user_data_train.npy',
        'val_data': 'citeulike/user_data_val.npy',
        'test_data': 'citeulike/user_data_test.npy',
    }
    raw_data = {'total_users': 5551, 'total_items': 16980}
    for key, rel_path in split_files.items():
        raw_data[key] = np.load(dataset_folder + rel_path)
    return raw_data
def load_tradesy(dataset_folder='dataset/'):
    """Load the Tradesy dataset splits and normalized item features.

    Args:
        dataset_folder: Root folder containing the ``tradesy/`` subdirectory.

    Returns:
        dict with total_users, total_items, the train/val/test interaction
        arrays, and item_features scaled by a fixed normalization constant.
    """
    split_files = {
        'train_data': 'tradesy/user_data_train.npy',
        'val_data': 'tradesy/user_data_val.npy',
        'test_data': 'tradesy/user_data_test.npy',
    }
    raw_data = {'total_users': 19243, 'total_items': 165906}
    for key, rel_path in split_files.items():
        raw_data[key] = np.load(dataset_folder + rel_path)
    # Fixed scaling constant carried over from the original pipeline
    # (presumably the max feature magnitude — TODO confirm upstream).
    raw_data['item_features'] = np.load(dataset_folder + 'tradesy/item_features.npy') / 32.671101
    return raw_data
def load_criteo_google_cloud(dataset_folder='dataset/'):
    """Load preprocessed Criteo Kaggle data via TensorFlow's file_io (GCS-capable).

    Data processing code adapted from https://github.com/facebookresearch/dlrm
    Follow steps in https://github.com/ylongqi/dlrm/blob/master/data_utils.py
    to generate kaggle_processed.npz, or use `./download_dataset.sh criteo`
    to download the processed data.

    Returns:
        dict with embedding-table ``counts`` plus X_cat/X_int/y arrays for the
        train, val, and test splits (roughly 6/0.5/0.5 of the data).
    """
    # read_file_to_string + BytesIO lets np.load work on gs:// paths too.
    buf = BytesIO(file_io.read_file_to_string(dataset_folder + 'criteo/kaggle_processed.npz',
                                              binary_mode=True))
    with np.load(buf) as data:
        X_int = data["X_int"]
        X_cat = data["X_cat"]
        y = data["y"]
        counts = data["counts"]
    # Split all rows into 7 chunks; shuffle each chunk independently.  The
    # comprehension consumes the RNG in the same order as the original loop.
    chunks = [np.random.permutation(c) for c in np.array_split(np.arange(len(y)), 7)]
    val_idx, test_idx = np.array_split(chunks[-1], 2)
    train_idx = np.random.permutation(np.concatenate(chunks[:-1]))
    # NOTE(review): only the train split is cast (int32/float32) for X_cat and
    # y — val/test keep the on-disk dtypes.  Preserved as-is; confirm whether
    # downstream expects this asymmetry.
    return {
        'counts': counts,
        'X_cat_train': X_cat[train_idx].astype(np.int32),
        'X_int_train': np.log(X_int[train_idx] + 1).astype(np.float32),
        'y_train': y[train_idx].astype(np.float32),
        'X_cat_val': X_cat[val_idx],
        'X_int_val': np.log(X_int[val_idx] + 1).astype(np.float32),
        'y_val': y[val_idx],
        'X_cat_test': X_cat[test_idx],
        'X_int_test': np.log(X_int[test_idx] + 1).astype(np.float32),
        'y_test': y[test_idx],
    }
def load_criteo(dataset_folder='dataset/'):
    """Load preprocessed Criteo Kaggle data from the local filesystem.

    Data processing code adapted from https://github.com/facebookresearch/dlrm
    Follow steps in https://github.com/ylongqi/dlrm/blob/master/data_utils.py
    to generate kaggle_processed.npz, or use `./download_dataset.sh criteo`
    to download the processed data.

    Returns:
        dict with embedding-table ``counts`` plus X_cat/X_int/y arrays for the
        train, val, and test splits (roughly 6/0.5/0.5 of the data).
    """
    with np.load(dataset_folder + 'criteo/kaggle_processed.npz') as data:
        X_int = data["X_int"]
        X_cat = data["X_cat"]
        y = data["y"]
        counts = data["counts"]
    # Split all rows into 7 chunks; shuffle each chunk independently.  The
    # comprehension consumes the RNG in the same order as the original loop.
    chunks = [np.random.permutation(c) for c in np.array_split(np.arange(len(y)), 7)]
    val_idx, test_idx = np.array_split(chunks[-1], 2)
    train_idx = np.random.permutation(np.concatenate(chunks[:-1]))
    # NOTE(review): only the train split is cast (int32/float32) for X_cat and
    # y — val/test keep the on-disk dtypes.  Preserved as-is; confirm whether
    # downstream expects this asymmetry.
    return {
        'counts': counts,
        'X_cat_train': X_cat[train_idx].astype(np.int32),
        'X_int_train': np.log(X_int[train_idx] + 1).astype(np.float32),
        'y_train': y[train_idx].astype(np.float32),
        'X_cat_val': X_cat[val_idx],
        'X_int_val': np.log(X_int[val_idx] + 1).astype(np.float32),
        'y_val': y[val_idx],
        'X_cat_test': X_cat[test_idx],
        'X_int_test': np.log(X_int[test_idx] + 1).astype(np.float32),
        'y_test': y[test_idx],
    }