-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocessing.py
124 lines (98 loc) · 4.62 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 8 09:15:20 2020
@author: rfuchs
"""
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import autograd.numpy as np
from autograd.numpy.random import uniform
from autograd.numpy import newaxis as n_axis
def gen_categ_as_bin_dataset(y, var_distrib):
    ''' One-hot encode every categorical column of y into binary columns
    (the first level of each variable is dropped and acts as the reference)

    y (numobs x p DataFrame): The observations containing categorical variables
    var_distrib (p 1darray): An array containing the types of the variables in y
    ----------------------------------------------------------------------------
    returns (tuple): The new (numobs, p_new) dataset, where each categorical
                    column has been replaced by binary columns appended at the
                    end of the frame, and the matching array of variable types
                    (the new binary columns are tagged 'bernoulli')
    '''
    # Work on a copy with a fresh 0..numobs-1 index so that the positional
    # index of the encoded columns lines up when they are joined back
    encoded_y = deepcopy(y).reset_index(drop = True)
    encoded_distrib = deepcopy(var_distrib[var_distrib != 'categorical'])

    encoder = OneHotEncoder(drop = 'first')
    categ_positions = np.where(var_distrib == 'categorical')[0]

    for col_pos in categ_positions:
        column = y.iloc[:, col_pos]
        col_name = column.name

        dummies = encoder.fit_transform(pd.DataFrame(column)).toarray()
        binary_cols = pd.DataFrame(dummies)
        nb_new_cols = len(binary_cols.columns)

        # New columns are named <original name>_<position of the kept level>
        binary_cols.columns = [str(col_name) + '_' + str(level)
                               for level in binary_cols.columns]

        # Swap the categorical column for its binary counterparts
        del encoded_y[col_name]
        encoded_y = encoded_y.join(binary_cols.astype(int))

        bernoulli_tags = ['bernoulli'] * nb_new_cols
        encoded_distrib = np.concatenate([encoded_distrib, bernoulli_tags])

    return encoded_y, encoded_distrib
def ordinal_encoding(sequence, ord_labels, codes):
    ''' Perform label encoding, replacing ord_labels with codes

    sequence (numobs pandas Series): The sequence to encode (its .values are read)
    ord_labels (nj_ord_j 1darray): The labels existing in sequences
    codes (nj_ord_j 1darray): The codes used to replace ord_labels,
                                codes[i] being the code of ord_labels[i]
    -----------------------------------------------------------------
    returns (numobs 1darray): The encoded sequence. Values that match none of
                                the labels are left unchanged. The input
                                sequence is not modified.
    '''
    raw_values = sequence.values
    new_sequence = deepcopy(raw_values)

    for i, lab in enumerate(ord_labels):
        # Compare against the untouched input rather than the partially
        # encoded output: otherwise a code written during an earlier pass
        # would be re-encoded whenever it is equal to a later label
        # (e.g. labels [1, 2, 3] -> codes [2, 3, 4] would collapse to all 4)
        new_sequence = np.where(raw_values == lab, codes[i], new_sequence)

    return new_sequence
def compute_nj(y, var_distrib):
    ''' Compute nj for each variable y_j

    y (numobs x p DataFrame): The original data
    var_distrib (p 1darray): The type of the variables in the data. Legal types
                            are 'bernoulli', 'binomial', 'ordinal',
                            'categorical' and 'continuous'
    -------------------------------------------------------------------
    returns (tuple (p 1d array, nb_bin 1d array, nb_ord 1d array,
            nb_categ 1d array)): The number of categories of all the variables,
            for bernoulli/binomial variables only, for ordinal variables only
            and for categorical variables only.
            For bernoulli/binomial variables nj is the maximum observed value,
            for ordinal/categorical variables it is the number of distinct
            observed values and for continuous variables it is np.inf.
    raises ValueError: If a variable type is not one of the legal types
    '''
    nj = []
    nj_bin = []
    nj_ord = []
    nj_categ = []

    for i in range(len(y.columns)):
        var_type = var_distrib[i]
        column = y.iloc[:, i]

        if var_type in ('bernoulli', 'binomial'):
            # The upper bound of the support is taken as the largest observed value
            max_nj = np.max(column, axis = 0)
            nj.append(max_nj)
            nj_bin.append(max_nj)
        elif var_type == 'ordinal':
            card_nj = len(np.unique(column))
            nj.append(card_nj)
            nj_ord.append(card_nj)
        elif var_type == 'categorical':
            card_nj = len(np.unique(column))
            nj.append(card_nj)
            nj_categ.append(card_nj)
        elif var_type == 'continuous':
            nj.append(np.inf)
        else:
            # Single formatted message (passing several positional arguments
            # to ValueError would render as a tuple)
            raise ValueError(f'Data type {var_type} is illegal')

    return np.array(nj), np.array(nj_bin), np.array(nj_ord), np.array(nj_categ)
def bin_to_bern(Nj, yj_binom, zM_binom):
    ''' Split the binomial variable into Bernoulli. Then just recopy the corresponding zM.
    It is necessary to fit binary logistic regression
    Example: yj has support in [0,10]: Then if y_ij = 3, generate 10 Bernoulli draws
    with success probability 3/10 (3 successes among 10 on average).

    Nj (int): The upper bound of the support of yj_binom
    yj_binom (numobs 1darray): The Binomial variable considered
    zM_binom (numobs x r nd-array): The continuous representation of the data
    -----------------------------------------------------------------------------------
    returns (tuple): The "Bernoullied" Binomial variable as a flat
                    (numobs * Nj) array and zM_binom with each row repeated
                    Nj times, so that both outputs are aligned row by row
    '''
    n_yk = len(yj_binom) # Number of observations
    # Use the same integer number of trials for the draws and for the
    # repetition of zM_binom (np.repeat rejects a float repeat count)
    nb_trials = int(Nj)

    # Generate Nj Bernoullis from each binomial and get a (numobs x Nj) table
    u = uniform(size = (n_yk, nb_trials))
    p = (yj_binom / Nj)[..., n_axis]

    # A draw is a success (1) with probability p = y_ij / Nj, hence u < p.
    # Flattening row by row keeps the Nj draws of an observation contiguous,
    # which matches the row order produced by np.repeat below
    yk_bern = (u < p).astype(int).flatten()

    return yk_bern, np.repeat(zM_binom, nb_trials, 0)