#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
The manager responsible for processing the datasets
"""
import random

__author__ = 'Filipe Wanderley Lima (fwl), Juliana Medeiros de Lucena (jml) and Tiago Ferreira Lima (tfl2)'


def get_data(path):
    """
    Reads a dataset file and groups its instances by class

    @type path: str
    @param path: The path of the dataset file
    @rtype: dict
    @returns: A mapping from each class label to its list of instances
    """
    instances = dict()
    with open(path) as f:
        for line in f:
            inst = line.split(',')
            klass = inst[-1].rstrip('\n')
            # Attributes are numeric; the class label stays a string
            inst = [float(attr) for attr in inst[:-1]]
            inst.append(klass)
            instances.setdefault(klass, []).append(inst)
    return instances
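

# A minimal usage sketch, assuming a comma-separated file whose last column
# is the class label (the sample values mirror the commented-out data in the
# __main__ block below):
#
#   instances = get_data('datasets/sub_training_0')
#   # e.g. {'1': [[92.0, 80.0, 10.0, 26.0, 20.0, 6.0, '1'], ...],
#   #       '2': [[91.0, 69.0, 25.0, 25.0, 66.0, 8.0, '2'], ...]}

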
def push_data(data, path):
    """
    Writes all the instances to a file in random order, one
    comma-separated instance per line

    @type data: dict
    @param data: A mapping from each class label to its list of instances
    @type path: str
    @param path: The path of the output file
    """
    data_compressed = list()
    for klass in data.keys():
        data_compressed.extend(data[klass])
    # Mixes the classes so the file is written in random order
    random.shuffle(data_compressed)
    with open(path, 'w') as f:
        for inst in data_compressed:
            f.write(','.join(str(attr) for attr in inst) + '\n')
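

# Usage sketch with the sample data from the __main__ block below
# ('datasets/data' is the same illustrative output path used there):
#
#   data = {'1': [[92.0, 80.0, 10.0, 26.0, 20.0, 6.0, '1']],
#           '2': [[91.0, 69.0, 25.0, 25.0, 66.0, 8.0, '2']]}
#   push_data(data, 'datasets/data')
#   # The file then holds one comma-separated instance per line,
#   # in random order.

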
def process_data(processed_data, training_percentage):
    """
    Divides the dataset into a training set and a test set based on the
    given percentage

    @type processed_data: dict
    @param processed_data: The data extracted from the dataset
    @type training_percentage: int
    @param training_percentage: The percentage of instances used for training
    @rtype: tuple
    @returns: The training set and the test set
    """
    print(training_percentage, '%', 'training and',
          100 - training_percentage, '%', 'test')
    training = dict()
    test = dict()
    for c, l in processed_data.items():
        # Integer division keeps the per-class quantity whole
        training_quantity = (training_percentage * len(l)) // 100
        for _ in range(training_quantity):
            # Move a random instance from the class pool to the training set
            training.setdefault(c, []).append(l.pop(random.randint(0, len(l) - 1)))
        # The remaining instances become the test set of this class
        test[c] = l
    return (training, test)
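

# Usage sketch of an 80/20 per-class split, mirroring the commented-out
# cross-validation loop in the __main__ block below:
#
#   data = get_data('datasets/liver/folds/oss/sub_training_0')
#   train, test = process_data(data, 80)
#   # train and test are dicts keyed by the same class labels as data.

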
def get_infos(path):
    """
    Prints the number of instances of each class in every fold's
    sub-training, training and test files

    @type path: str
    @param path: The directory that holds the folds
    """
    for i in range(5):
        dataset_path = path + "/sub_training_%i" % i
        training_path = path + "/training_%i" % i
        test_path = path + "/test_%i" % i
        print('\nfold %i' % i)
        for label, file_path in (('database', dataset_path),
                                 ('training', training_path),
                                 ('test', test_path)):
            print('\n%s:' % label)
            data = get_data(file_path)
            # A class absent from the file simply counts as zero
            print('Class 1:', len(data.get('1', [])))
            print('Class 2:', len(data.get('2', [])))
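

# Usage sketch (the path is the one used in the commented-out call in
# the __main__ block below):
#
#   get_infos('datasets/liver/selected/ib2')
#   # Prints the per-class instance counts of every fold's files.

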
"""
Generates the training and test sets through k-fold cross validation
@type path: str
@param path: The path where the folds are
@type n_folds: int
@param n_folds: The number of folds
"""
def generate_sets(path, n_folds):
folds = [open(path + 'sub_training_%i' % i).readlines() for i in range(n_folds)]
for i, fold in enumerate(folds):
# The current file is the test set
test = open(path + 'test_%i' % i, 'w')
for l in fold:
test.write(l)
# The other files are the training set
train = open(path + 'training_%i' % i, 'w')
# Flattening files lines
lines = sum([f for f in folds if folds.index(f) != i], [])
for l in lines:
train.write(l)
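

# Usage sketch, matching the loop in the __main__ block below:
#
#   generate_sets('datasets/liver/folds/original/', 5)
#   # For each fold i, writes test_i (fold i itself) and training_i
#   # (all the other folds concatenated).

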
if __name__ == '__main__':
    #print(get_data('datasets/sub_training_0'))
    #data = {'1': [[92.0, 80.0, 10.0, 26.0, 20.0, 6.0, '1'],
    #              [91.0, 68.0, 27.0, 26.0, 14.0, 16.0, '1']],
    #        '2': [[91.0, 69.0, 25.0, 25.0, 66.0, 8.0, '2'],
    #              [92.0, 108.0, 53.0, 33.0, 94.0, 12.0, '2'],
    #              [89.0, 63.0, 24.0, 20.0, 38.0, 0.5, '2']]}
    #push_data(data, 'datasets/data')
    #folds_data = [get_data('datasets/liver/folds/oss/sub_training_%i' % i)
    #              for i in range(0, 5)]
    #for data in folds_data:
    #    train, test = process_data(data, 80)
    #    push_data(train, 'datasets/liver/folds/oss/training_%i' %
    #              folds_data.index(data))
    #    push_data(test, 'datasets/liver/folds/oss/test_%i' %
    #              folds_data.index(data))
    #get_infos('datasets/liver/selected/ib2')
    for path in ['liver/selected/drop3/', 'liver/selected/hmn_ei/',
                 'liver/selected/ib2/', 'liver/selected/icf/',
                 'liver/selected/oss/', 'liver/folds/original/']:
        path = 'datasets/' + path
        generate_sets(path, 5)