# homogenize_signals.py
import os
import bz2
import multiprocessing as mp
from collections import defaultdict

import numpy as np
import pyedflib.highlevel as hl
import _pickle as cPickle
# Pickle an object and compress it into a file with extension .pbz2
def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2', 'w') as f:
        cPickle.dump(data, f)
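# Counterpart to compressed_pickle, a minimal sketch for loading the .pbz2
# outputs back; this helper is not part of the original pipeline.
def decompress_pickle(title):
    with bz2.BZ2File(title + '.pbz2', 'rb') as f:
        return cPickle.load(f)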
# Process metadata: look up `filename` in the summary file and extract the
# number of seizures and their start/end times
def process_metadata(summary, filename):
    metadata = {}
    times = []
    with open(summary, 'r') as f:
        lines = f.readlines()
    for i in range(len(lines)):
        line = lines[i].split()
        if len(line) == 3 and line[2] == filename:
            # Find the number of seizures for this file
            j = i + 1
            processed = False
            while not processed:
                if lines[j].split()[0] == 'Number':
                    seizures = int(lines[j].split()[-1])
                    processed = True
                j = j + 1
            # If the file has seizures, get the start and end time of each one
            if seizures > 0:
                j = i + 1
                for s in range(seizures):
                    processed = False
                    while not processed:
                        l = lines[j].split()
                        if l[0] == 'Seizure' and 'Start' in l:
                            # Convert seconds to 0-based sample indices (256 Hz sampling rate)
                            start = int(l[-2]) * 256 - 1
                            end = int(lines[j + 1].split()[-2]) * 256 - 1
                            processed = True
                        j = j + 1
                    times.append((start, end))
            metadata['seizures'] = seizures
            metadata['times'] = times
    return metadata
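# For reference, the block this parser expects in a CHB-MIT summary file looks
# roughly like:
#   File Name: chb01_03.edf
#   Number of Seizures in File: 1
#   Seizure Start Time: 2996 seconds
#   Seizure End Time: 3036 seconds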
# Keep some channels from an EDF file and ignore the others
def drop_channels(edf_source, edf_target=None, to_keep=None, to_drop=None):
    # edf_target and to_drop are legacy parameters and are not used here; the
    # function returns a dict of label -> signal instead of writing a new EDF
    signals, signal_headers, header = hl.read_edf(edf_source,
                                                  ch_nrs=to_keep,
                                                  digital=False)
    clean_file = {}
    for signal, header in zip(signals, signal_headers):
        channel = header.get('label')
        if channel in clean_file.keys():
            channel = channel + '-2'  # disambiguate a repeated label
        clean_file[channel] = signal
    return clean_file
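# Usage sketch (hypothetical file and channel indices):
#   clean = drop_channels('physionet.org/files/chbmit/1.0.0/chb01/chb01_01.edf',
#                         to_keep=[0, 2, 3])
#   clean['FP1-F7']  # -> np.ndarray with one value per sample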
# Originally this permuted the channels of an EDF signal; now it only keeps
# the valid channels and compresses + saves the result as a pickle
def move_channels(clean_dict, channels, target):
    # Keep only valid channels
    keys_to_delete = []
    for key in clean_dict:
        if key != 'metadata' and key not in channels.keys():
            keys_to_delete.append(key)
    for key in keys_to_delete:
        del clean_dict[key]
    # Get the length of the signals (any non-metadata entry will do)
    size = 0
    for item in clean_dict.keys():
        if item != 'metadata':
            size = len(clean_dict.get(item))
            break
    # Channels missing from this file are zero-filled so every output has the same set
    for k in channels.keys():
        if k not in clean_dict.keys():
            clean_dict[k] = np.zeros(size, dtype=float)
    compressed_pickle(target + '.pkl', clean_dict)
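# Shape of the saved dict (illustrative values, not real output): a 'metadata'
# entry plus one equal-length array per channel in `channels`; channels absent
# from the file are all-zero arrays.
#   {'metadata': {'seizures': 1, 'times': [(766975, 777215)], 'channels': [...]},
#    'FP1-F7': array([...]), 'F7-T7': array([...]), ...}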
# Process the EDF files of a patient from file number `start` to `end`
def process_files(pacient, valid_channels, channels, start, end):
    for num in range(start, end + 1):
        to_keep = []
        num = ('0' + str(num))[-2:]  # zero-pad the file number to two digits
        filename = '{path}/chb{p}/chb{p}_{n}.edf'.format(path=signals_path, p=pacient, n=num)
        # Check against the reference channels whether more channels must be removed
        try:
            signals, signal_headers, header = hl.read_edf(filename, digital=False)
            n = 0
            for h in signal_headers:
                if h.get('label') in valid_channels:
                    if n not in to_keep:
                        to_keep.append(n)
                n = n + 1
        except OSError:
            print('****************************************')
            print("WARNING - Do not worry")
            print("File", filename, "does not exist.\nProcessing next file.")
            print('****************************************')
            continue
        if len(to_keep) > 0:
            try:
                print("Removing", len(signal_headers) - len(to_keep), "channels from file",
                      'chb{p}_{n}.edf'.format(p=pacient, n=num))
                clean_dict = drop_channels(filename,
                                           edf_target='{path}/chb{p}/chb{p}_{n}.edf'.format(path=clean_path, p=pacient, n=num),
                                           to_keep=to_keep)
                print("Processing file", filename)
            except AssertionError:
                print('****************************************')
                print("WARNING - Do not worry")
                print("File", filename, "could not be read.\nProcessing next file.")
                print('****************************************')
                continue
            metadata = process_metadata('{path}/chb{p}/chb{p}-summary.txt'.format(path=signals_path, p=pacient),
                                        'chb{p}_{n}.edf'.format(p=pacient, n=num))
            metadata['channels'] = valid_channels
            clean_dict['metadata'] = metadata
            target = '{path}/chb{p}/chb{p}_{n}.edf'.format(path=clean_path, p=pacient, n=num)
            move_channels(clean_dict, channels, target)
# Process one patient: clean the reference file `num`, then files `start`..`end`
def start_process(pacient, num, start, end, sum_ind):
    channels = defaultdict(list)  # Dict of channels and indices
    valid_channels = []           # Valid channels
    to_keep = []                  # Indices of channels we want to keep
    channel_index = 1             # Index for each channel
    summary_index = 0             # Index of the channel listing currently being read
    # Process the summary file
    with open('{path}/chb{p}/chb{p}-summary.txt'.format(path=signals_path, p=pacient), 'r') as f:
        for line in f:
            line = line.split()
            if len(line) == 0:
                continue
            if line[0] == 'Channels' and line[1] == 'changed:':
                summary_index += 1
            # Only read the channel listing selected by sum_ind; '-' marks a void channel
            if line[0] == 'Channel' and summary_index == sum_ind and (line[2] != '-' and line[2] != '.'):
                if line[2] in channels.keys():  # For a repeated channel just append '-2' to the label
                    name = line[2] + '-2'
                else:
                    name = line[2]
                # Add the channel to the dict and update the lists
                channels[name].append(str(channel_index))
                channel_index += 1
                valid_channels.append(name)
                to_keep.append(int(line[1][:-1]) - 1)
    # Clean the reference file
    filename = '{path}/chb{p}/chb{p}_{n}.edf'.format(path=signals_path, p=pacient, n=num)
    target = '{path}/chb{p}/chb{p}_{n}.edf'.format(path=clean_path, p=pacient, n=num)
    if not os.path.exists('{path}/chb{p}'.format(p=pacient, path=clean_path)):
        os.makedirs('{path}/chb{p}'.format(p=pacient, path=clean_path))
    clean_dict = drop_channels(filename, edf_target=target, to_keep=to_keep)
    # Process metadata: number of seizures and start/end times
    metadata = process_metadata('{path}/chb{p}/chb{p}-summary.txt'.format(path=signals_path, p=pacient),
                                'chb{p}_{n}.edf'.format(p=pacient, n=num))
    metadata['channels'] = valid_channels
    clean_dict['metadata'] = metadata
    compressed_pickle(target + '.pkl', clean_dict)
    # Process the rest of the files to get the same channels as the reference file
    process_files(pacient, valid_channels, channels, start, end)
# PARAMETERS
signals_path = 'physionet.org/files/chbmit/1.0.0'  # Path to the main data directory
clean_path = 'clean_signals'                       # Path where the clean data is stored
if not os.path.exists(clean_path):
    os.makedirs(clean_path)
# Clean patients one by one manually with these parameters
pacient = '04'
num = '01'         # Reference file
summary_index = 0  # Index of the channel summary reference
start = 28         # Number of the first file to process
end = 28           # Number of the last file to process
# Start the process for a single patient
#start_process(pacient, num, start, end, summary_index)
# FULL DATA PROCESS
# Each tuple is (pacient, reference file, first file, last file, summary index)
parameters = [('01', '01', 2, 46, 0),
              ('02', '01', 2, 35, 0),
              ('03', '01', 2, 38, 0),
              ('05', '01', 2, 39, 0),
              ('06', '01', 2, 24, 0),
              ('07', '01', 2, 19, 0),
              ('08', '02', 3, 29, 0),
              ('10', '01', 2, 89, 0),
              ('11', '01', 2, 99, 0),
              ('14', '01', 2, 42, 0),
              ('20', '01', 2, 68, 0),
              ('21', '01', 2, 33, 0),
              ('22', '01', 2, 77, 0),
              ('23', '06', 7, 20, 0),
              ('24', '01', 3, 21, 0),
              ('04', '07', 1, 43, 1),
              ('09', '02', 1, 19, 1),
              ('15', '02', 1, 63, 1),
              ('16', '01', 2, 19, 0),
              ('18', '02', 1, 36, 1),
              ('19', '02', 1, 30, 1)]
# Guard the pool so the script is safe under spawn-based multiprocessing
if __name__ == '__main__':
    with mp.Pool(mp.cpu_count()) as pool:
        res = pool.starmap(start_process, parameters)
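# Sanity check (hypothetical): load one processed recording back with the
# decompress_pickle sketch above and inspect its contents.
#   data = decompress_pickle('clean_signals/chb01/chb01_01.edf.pkl')
#   print(data['metadata']['seizures'], [k for k in data if k != 'metadata'])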