import os
import numpy as np
import re
import pandas as pd
import fastparquet as fp
import scipy.integrate as it
from copy import deepcopy
from sklearn.cluster import KMeans
from scipy.interpolate import interp1d
from scipy.spatial.distance import cdist
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from imblearn.under_sampling import RandomUnderSampler
def gen_train_test_valid(source, cluster_classes, nb_files_tvt = [5, 4, 1],\
train_umbal_margin = 100, return_files = False, seed = None,\
flrs = [6,25]):
    ''' Generate a balanced train dataset, plus valid and test sets with all observations
    source (str): The location of the extracted (and formatted) Pulse files on disk
    cluster_classes (list of str): The classes used in the prediction task
    nb_files_tvt (list of int): The number of files used to generate the train/valid/test sets respectively
    train_umbal_margin (int): The number of observations to extract per group and per file for the train set
    return_files (Bool): Whether to also return the file names and particle ids of each set
    flrs (list of size 2): The lowest and highest FLR thresholds
    seed (all type): The seed to use to generate random samples
    ------------------------------------------------------------------------------------
    '''
assert (np.array(nb_files_tvt) != 0).all()
    # Set the seed for the train/valid/test split
np.random.seed(seed)
# Encode the names of functional groups into integers
le = LabelEncoder()
le.fit(cluster_classes)
    # If the validation set does not contain all the classes, or if the valid and
    # test sets do not each contain at least one FLR6 and one FLR25 file, then redraw them
not_all_classes_in_valid = True
no_FLR6_in_valid = True
no_FLR25_in_valid = True
no_FLR6_in_test = True
no_FLR25_in_test = True
while not_all_classes_in_valid or no_FLR6_in_test or no_FLR25_in_test or no_FLR6_in_valid or no_FLR25_in_valid:
# Fetching the formatted files
files = os.listdir(source)
files = [f for f in files if re.search("Labelled",f) and not(re.search('lock', f))]
np.random.shuffle(files)
# Assigning them between train, valid and test
bounds = np.concatenate([[0], np.cumsum(nb_files_tvt)])
train_files, valid_files, test_files = [files[bounds[i]:bounds[i + 1]] for i in range(3)]
print('train', train_files)
print('valid',valid_files)
print('test', test_files)
# Check that all classes are present in the validation set
valid_classes = []
for vfile in valid_files:
pfile = fp.ParquetFile(source + '/' + vfile)
vc = np.unique(pfile.to_pandas(columns=['cluster'])['cluster'])
            vc = homogeneous_cluster_names_swings(vc) # To uncomment for SSLAMM
            valid_classes = valid_classes + vc # vc is a list here; wrap it in list() if the line above is commented out
valid_classes = np.unique(valid_classes)
if (len(valid_classes) == len(cluster_classes)):
not_all_classes_in_valid = False
        # Check that the valid files contain at least one FLR6 and one FLR25 file (FLR6 files are more diverse than FLR25 files)
for vfile in valid_files:
if re.search('Pulse' + str(flrs[0]), vfile):
no_FLR6_in_valid = False
if re.search('Pulse' + str(flrs[1]), vfile):
no_FLR25_in_valid = False
        # Check that the test files contain at least one FLR6 and one FLR25 file
for tfile in test_files:
if re.search('Pulse' + str(flrs[0]), tfile):
no_FLR6_in_test = False
if re.search('Pulse' + str(flrs[1]), tfile):
no_FLR25_in_test = False
    # Extract a balanced train dataset
print('Generating train set')
X_train, seq_len_list_train, y_train, pid_list_train, file_name_train, le_train = gen_dataset(source, \
cluster_classes, train_files, le, nb_obs_to_extract_per_group = train_umbal_margin, \
to_balance = True, seed = None)
    # BE CAREFUL: no undersampling is performed here anymore
# Extract the valid dataset from full files
print('Generating valid set')
X_valid, seq_len_list_valid, y_valid, pid_list_valid, file_name_valid, le_valid = gen_dataset(source, \
cluster_classes, valid_files, le, nb_obs_to_extract_per_group = 10000000, \
to_balance = False, to_undersample = False, seed = None)
# Extract the test dataset from full files
print('Generating test set')
X_test, seq_len_list_test, y_test, pid_list_test, file_name_test, le_test = gen_dataset(source, \
cluster_classes, test_files, le, nb_obs_to_extract_per_group = 10000000, \
to_balance = False, to_undersample = False, seed = None)
    # Write the nomenclature (e.g. synechococcus = 0, prochlorococcus = 1, ...) used to generate the dataset
pd.DataFrame(zip(le.classes_,le.transform(le.classes_)),\
columns = ['cluster', 'labels']).to_csv(source + '/train_test_nomenclature.csv',\
index = False)
    # Write the names of the test files for later visualisation
    log_file = source + "/test_files_name.txt" # Create the file if it does not exist
    if not os.path.isfile(log_file):
        open(log_file, 'w+').close()
    # Append the names of the test files
with open(log_file, "a") as file:
for tfile in test_files:
file.write(tfile + '\n')
if return_files:
return [X_train, y_train, X_valid, y_valid, X_test, y_test], \
[file_name_train, file_name_valid, file_name_test],\
[pid_list_train, pid_list_valid, pid_list_test]
else:
return X_train, y_train, X_valid, y_valid, X_test, y_test
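# Usage sketch (illustrative only: the path and the class list below are
# placeholders, not values taken from this codebase):
#
#   CLASSES = ['airbubble', 'cryptophyte', 'microphytoplancton',
#              'nanoeucaryote', 'picoeucaryote', 'prochlorococcus',
#              'synechococcus', 'inf1microm_unidentified_particle',
#              'sup1microm_unidentified_particle']
#   X_tr, y_tr, X_v, y_v, X_te, y_te = gen_train_test_valid(
#       'path/to/formatted_pulse_files', CLASSES,
#       nb_files_tvt=[5, 4, 1], train_umbal_margin=100, seed=42)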
def gen_dataset(source, cluster_classes, files = [], le = None, nb_obs_to_extract_per_group = 1E7, \
to_balance = True, to_undersample = False, seed = None):
    ''' Generate a (possibly balanced) dataset from the cleaned Pulse files
    source (str): The location of the extracted (and formatted) Pulse files on disk
    cluster_classes (list of str): The classes used in the prediction task
    files (list of str): If empty, extract the observations from all the files in source; otherwise extract only from the files specified
    le (LabelEncoder object): The encoder of the group names, or None to fit a new one
    nb_obs_to_extract_per_group (int): The number of cells to extract for each group in each file
    to_balance (Bool): Whether to balance the dataset across classes
    to_undersample (Bool): Whether to undersample the data even when not balancing
    seed (all type): The seed to use to generate random samples
    ------------------------------------------------------------------------------
    returns (6 values): The dataset (X, y), the sequence lengths, the particle ids (pid), the source file names and the encoder of the group names
    '''
    CURVES_DEFAULT_LEN = 120 # Standard curve length expected by the network
    if le is None:
le = LabelEncoder()
le.fit(cluster_classes)
# Fetching the formatted files if not provided
if len(files) == 0:
files = os.listdir(source)
files = [f for f in files if re.search("Labelled",f) and not(re.search('lock', f))]
X = []
y = []
pid_list = []
seq_len_list = []
file_name = []
    # Initialise the per-class extraction quotas (ignored if to_balance == False)
balancing_dict = dict(zip(cluster_classes, np.full(len(cluster_classes), nb_obs_to_extract_per_group)))
for idx, file in enumerate(files):
print('File:', idx + 1, '/', len(files), '(', file, ')')
pfile = fp.ParquetFile(source + '/' + file)
df = pfile.to_pandas()
        if 'Particle ID' not in df.columns:
df = df.reset_index()
if len(df) == 0:
continue
X_file, seq_len_file, y_file, pid_list_file = data_preprocessing(df, balancing_dict, CURVES_DEFAULT_LEN, \
to_balance = to_balance, to_undersample = to_undersample, seed = seed)
if pd.isna(X_file).any():
            print('File skipped: it contained NaNs')
continue
if len(X_file) == 0:
continue
X.append(X_file)
y.append(y_file)
pid_list.append(pid_list_file)
seq_len_list.append(seq_len_file)
file_name.append(np.repeat(file, len(X_file)))
print('Nb of obs',len(np.concatenate(y)))
if to_balance:
            # Define the groups to sample in priority from the next file: those with the fewest observations so far
balancing_dict = pd.Series(np.concatenate(y)).value_counts().to_dict()
balancing_dict = {k: balancing_dict[k] if k in balancing_dict else 0 for k in cluster_classes}
balancing_dict = {k: max(balancing_dict.values()) - balancing_dict[k] for k in cluster_classes}
# Give the final form to the dataset
X = np.vstack(X)
y = np.concatenate(y)
    # Encode y, taking care of missing classes
y = le.transform(y)
y = to_categorical(y, num_classes = len(cluster_classes))
pid_list = np.concatenate(pid_list)
seq_len_list = np.concatenate(seq_len_list)
file_name = np.concatenate(file_name)
# Sanity checks:
assert y.shape[1] == len(cluster_classes)
assert len(X) == len(y) == len(pid_list) == len(seq_len_list) == len(file_name)
return X, seq_len_list, y, pid_list, file_name, le
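# Usage sketch (illustrative; the file name below is hypothetical and CLASSES
# is the class list from the sketch above): extract a balanced sample of at
# most 100 cells per group from a single labelled file.
#
#   X, seq_lens, y, pids, fnames, le = gen_dataset(
#       'path/to/formatted_pulse_files', CLASSES,
#       files=['Labelled_Pulse6_2021-01-01.parq'],
#       nb_obs_to_extract_per_group=100, to_balance=True, seed=0)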
def interp_sequences(sequences, max_len):
    ''' Interpolate sequences so that they all have length max_len
    sequences (list of ndarrays): The sequences to interpolate, each of shape (5, original_len)
    max_len (int): The target length: all sequences will be interpolated to match this length
    -------------------------------------------------------------------
    returns (ndarray): The interpolated sequences, of shape (nb_sequences, 5, max_len)
    '''
interp_obs = np.zeros((len(sequences), 5, max_len))
for idx, s in enumerate(sequences):
original_len = s.shape[1]
f = interp1d(np.arange(original_len), s, 'quadratic', axis = 1)
interp_seq = f(np.linspace(0, original_len -1, num = max_len))
interp_obs[idx] = interp_seq
try:
return np.stack(interp_obs)
except ValueError:
return np.array([])
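# Shape contract (illustrative): a list of N sequences of shape (5, L_i), with
# possibly different lengths L_i (quadratic interpolation needs L_i >= 3),
# becomes a single (N, 5, max_len) array.
#
#   seqs = [np.random.rand(5, 43), np.random.rand(5, 171)]
#   interp_sequences(seqs, 120).shape  # -> (2, 5, 120)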
def data_preprocessing(df, balancing_dict = None, max_len = None, \
to_balance = True, to_undersample = False, seed = None):
    ''' Interpolate the Pulse sequences and rebalance the dataset
    df (pandas DataFrame): The data container
    balancing_dict (dict): The desired number of observations to extract for each group in order to obtain a balanced dataset. Only used if to_balance is True
    max_len (int): The target length: all sequences will be interpolated to match this length
    to_balance (Bool): Whether or not to balance the dataset
    to_undersample (Bool): Whether to undersample the data even when not balancing. Used only if to_balance == False
    seed (int): The seed to use to fix random results if wanted
    -----------------------------------------------------------------------------------------------------------------------------------
    returns (4 values): The curves X, the original sequence lengths, the unencoded labels y_list and the corresponding particle ids pid_list
    '''
    # The final length of the sequences fed to the Neural Network
    DEFAULT_MAX_LEN = 120 # The default length of the interpolated sequences
    DEFAULT_UDS_SIZE = 6000 # If undersampling, how many observations to keep in the end
    # Make the cluster names homogeneous and get the group of each particle
    df = homogeneous_cluster_names_swings(df) # To uncomment for SSLAMM
pid_cluster = df[['Particle ID', 'cluster']].drop_duplicates()
clus_value_count = pid_cluster.cluster.value_counts().to_dict()
if to_balance:
        # Drop the classes absent from this file and cap the quotas at the available counts
        balancing_dict = {k: min(balancing_dict[k], clus_value_count[k]) for k in clus_value_count.keys()}
# Undersampling to get a balanced dataset
rus = RandomUnderSampler(random_state = seed, sampling_strategy = balancing_dict)
        pid_resampled, y_resampled = rus.fit_resample(pid_cluster['Particle ID'].values.reshape(-1,1), pid_cluster['cluster'])
df_resampled = df.set_index('Particle ID').loc[pid_resampled.flatten()]
else: # Keep the same distribution as in the original dataset
if to_undersample: # Reduce the size of the dataset to speed things up
pids = deepcopy(pid_cluster['Particle ID'].tolist())
uds_size = np.min([DEFAULT_UDS_SIZE, len(pids)])
pid_resampled = np.random.choice(pids, replace = False, size = uds_size)
df_resampled = df.set_index('Particle ID').loc[pid_resampled.flatten()]
else:
df_resampled = df.set_index('Particle ID')
# Reformatting the values
obs_list = [] # The 5 curves
pid_list = [] # The Particle ids
    y_list = [] # The class (e.g. 0 = prochlorococcus, 1 = ...)
seq_len_list = [] # The original length of the sequence
for pid, obs in df_resampled.groupby('Particle ID'):
# Sanity check: only one group for each particle
try:
assert(len(set(obs['cluster'])) == 1)
        except AssertionError: # If a particle has been assigned to two groups, drop it
            print("Duplicate cluster", set(obs['cluster']), "for particle", pid)
continue
obs_list.append(obs.iloc[:,:5].values.T)
seq_len_list.append(len(obs))
pid_list.append(pid)
y_list.append(list(set(obs['cluster']))[0])
    if max_len is None:
max_len = DEFAULT_MAX_LEN
obs_list = interp_sequences(obs_list, max_len)
X = np.transpose(obs_list, (0, 2, 1))
return X, seq_len_list, y_list, pid_list
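# Expected input layout (inferred from the code above): one row per recorded
# time step, the five curve values in the first five columns, plus
# 'Particle ID' and 'cluster' columns. Usage sketch with hypothetical quotas:
#
#   bal = {'synechococcus': 100, 'picoeucaryote': 100}
#   X, seq_lens, y, pids = data_preprocessing(df, bal, max_len=120,
#                                             to_balance=True, seed=0)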
def homogeneous_cluster_names(array):
    ''' Homogenise the names of the groups coming from the different Pulse files
    array (list, numpy 1D array or dataframe): The container in which the names have to be changed
    -----------------------------------------------------------------------------------------------
    returns (array): The container with the names homogenised, in its original shape
    '''
    bubble_pat = r'[_A-Za-z0-9?\-()]*bubble[_A-Za-z()0-9?\-]*'
    crypto_pat = r'[_A-Za-z0-9?\-()]*[Cc]rypto[_A-Za-z()0-9?\-]*'
    pico_pat = r'[_A-Za-z0-9?\-()]*[pP]ico[_A-Za-z()0-9?\-]*'
    nano_pat = r'[_A-Za-z0-9?\-()]*[Nn]ano[_A-Za-z()0-9?\-]*'
    micro_pat = r'[_A-Za-z0-9?\-()]*[Mm]icrop[_A-Za-z()0-9?\-]*'
    prochlo_pat = r'[_A-Za-z0-9?\-()]*[Pp]rochlo[_A-Za-z()0-9?\-]*'
    synecho_pat = r'[_A-Za-z0-9?\-()]*[Ss]ynecho[_A-Za-z()0-9?\-]*'
    lowfluo_pat = r'[_A-Za-z0-9?\-()]*low[ _0-9]{0,1}fluo[_A-Za-z()0-9?\-]*'
    noiseinf_pat = r'[_A-Za-z0-9?\-()]*noise[ _0-9]{0,1}in[fg][_A-Za-z()0-9?\-]*'
    noisesup_pat = r'[_A-Za-z0-9?\-()]*noise[ _0-9]{0,1}sup[_A-Za-z()0-9?\-]*'
    nophyto_pat = r'[_A-Za-z0-9?\-()]*nophyto[_A-Za-z()0-9?\-]*'
    unassigned_pat = r'[_A-Za-z0-9?\-()]*[Uu]nassigned[ _A-Za-z()0-9?\-]*'
    unidentified_pat = '^unidentified_particles*'
    # TODO: add a regex for undetermined
    if isinstance(array, pd.DataFrame):
        array['cluster'] = array.cluster.str.replace('Cryptophyceae','cryptophyte', regex = False)
        array['cluster'] = array.cluster.str.replace('coccolithophorideae like','nanoeucaryote', regex = False)
        array['cluster'] = array.cluster.str.replace(r'[_A-Za-z0-9?\-()]+naneu[_A-Za-z()0-9?\-]+','nanoeucaryote', regex = True)
        array['cluster'] = array.cluster.str.replace('PPE 1','picoeucaryote', regex = False)
        array['cluster'] = array.cluster.str.replace('PPE 2','picoeucaryote', regex = False)
        array['cluster'] = array.cluster.str.replace('New Set 4','inf1microm_unidentified_particle', regex = False)
        array['cluster'] = array.cluster.str.replace('New Set 7','inf1microm_unidentified_particle', regex = False)
        array['cluster'] = array.cluster.str.replace('A_undetermined','noise', regex = False)
        array['cluster'] = array.cluster.str.replace('C_undetermined','noise', regex = False)
        array['cluster'] = array.cluster.str.replace('inf1um_unidentified_particle','inf1microm_unidentified_particle', regex = False)
        array['cluster'] = array.cluster.str.replace('sup1um_unidentified_particle','sup1microm_unidentified_particle', regex = False)
        array['cluster'] = array.cluster.str.replace(r'[_A-Za-z0-9?\-()]+syncho[_A-Za-z()0-9?\-]+','synechococcus', regex = True)
array['cluster'] = array['cluster'].str.replace(bubble_pat, 'airbubble', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(crypto_pat, 'cryptophyte', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(pico_pat, 'picoeucaryote', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(nano_pat, 'nanoeucaryote', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(micro_pat, 'microphytoplancton', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(prochlo_pat, 'prochlorococcus', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(synecho_pat, 'synechococcus', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(unidentified_pat, 'noise', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(noiseinf_pat,\
'inf1microm_unidentified_particle', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(noisesup_pat,\
'sup1microm_unidentified_particle', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(nophyto_pat,\
'sup1microm_unidentified_particle', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(lowfluo_pat,\
'sup1microm_unidentified_particle', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace(unassigned_pat,\
'noise', regex = True, case = False)
        array['cluster'] = array.cluster.str.replace('C_noise1','inf1microm_unidentified_particle', regex = False)
        array['cluster'] = array.cluster.str.replace('µ','micro', regex = False)
        array['cluster'] = array.cluster.str.replace('es$','e', regex = True) # Put the names in singular form
        array['cluster'] = array.cluster.str.replace(' ','', regex = False) # Remove spaces
array['cluster'] = array.cluster.str.lower()
else:
        array = [re.sub('Cryptophyceae','cryptophyte', string) for string in array]
        array = [re.sub('coccolithophorideae like','nanoeucaryote', string) for string in array]
        array = [re.sub(r'[_A-Za-z0-9?\-()]+naneu[_A-Za-z()0-9?\-]+','nanoeucaryote', string) for string in array]
        array = [re.sub('PPE 1','picoeucaryote', string) for string in array]
        array = [re.sub('PPE 2','picoeucaryote', string) for string in array]
        array = [re.sub('A_undetermined','noise', string) for string in array]
        array = [re.sub('C_undetermined','noise', string) for string in array]
        array = [re.sub('New Set 7','inf1microm_unidentified_particle', string) for string in array]
        array = [re.sub('New Set 4','inf1microm_unidentified_particle', string) for string in array]
        array = [re.sub(r'[_A-Za-z0-9?\-()]+syncho[_A-Za-z()0-9?\-]+','synechococcus', string) for string in array]
array = [re.sub(bubble_pat,'airbubble', string) for string in array]
array = [re.sub(crypto_pat,'cryptophyte', string) for string in array]
array = [re.sub(pico_pat,'picoeucaryote', string) for string in array]
array = [re.sub(nano_pat,'nanoeucaryote', string) for string in array]
array = [re.sub(micro_pat,'microphytoplancton', string) for string in array]
array = [re.sub(prochlo_pat,'prochlorococcus', string) for string in array]
array = [re.sub(synecho_pat,'synechococcus', string) for string in array]
array = [re.sub(unidentified_pat, 'noise', string) for string in array]
array = [re.sub(noiseinf_pat,'inf1microm_unidentified_particle', string) for string in array]
array = [re.sub(noisesup_pat,'sup1microm_unidentified_particle', string) for string in array]
array = [re.sub(nophyto_pat,'sup1microm_unidentified_particle', string) for string in array]
array = [re.sub(lowfluo_pat,'sup1microm_unidentified_particle', string) for string in array]
array = [re.sub(unassigned_pat, 'noise', string) for string in array]
array = [re.sub('C_noise1','inf1microm_unidentified_particle', string) for string in array]
array = [re.sub('µ','micro', string) for string in array]
array = [re.sub('es$','e', string) for string in array]
array = [re.sub(' ','', string) for string in array]
array = [string.lower() for string in array]
array = list(array)
return array
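# Example (illustrative):
#
#   homogeneous_cluster_names(['Cryptophyceae', 'Synechococcus', 'airbubbles'])
#   # -> ['cryptophyte', 'synechococcus', 'airbubble']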
def homogeneous_cluster_names_swings(array):
    ''' Homogenise the names of the groups coming from the different Pulse files of the Swings campaign
    array (list, numpy 1D array or dataframe): The container in which the names have to be changed
    -----------------------------------------------------------------------------------------------
    returns (array): The container with the names homogenised, in its original shape
    '''
    if isinstance(array, pd.DataFrame):
array['cluster'] = array['cluster'].str.replace('bruitdefond', 'Unassigned Particles')
array['cluster'] = array['cluster'].str.replace('[A-Za-z]+MICRO', 'MICRO', regex = True, case = False)
array['cluster'] = array['cluster'].str.replace('([A-Za-z]+)_[A-Za-z0-9]+', r'\1', regex = True, case = False)
else:
array = [re.sub('bruitdefond', 'Unassigned Particles', string) for string in array]
array = [re.sub('[A-Za-z]+MICRO', 'MICRO', string) for string in array]
array = [re.sub(r'([A-Za-z]+)_[A-Za-z0-9]+', r'\1', string) for string in array]
return array
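# Example (illustrative):
#
#   homogeneous_cluster_names_swings(['bruitdefond', 'HSNANO_123'])
#   # -> ['Unassigned Particles', 'HSNANO']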
def extract_features_from_nn(dataset, pre_model):
''' Extract and flatten the output of a Neural Network
dataset ((nb_obs,curve_length, nb_curves) array): The interpolated and scaled data
    pre_model (Keras model): The model without its classification head
---------------------------------------------------------------------
returns ((nb_obs,nb_features) array): The features extracted from the NN
'''
features = pre_model.predict(dataset, batch_size=32)
features_flatten = features.reshape((features.shape[0], -1))
return features_flatten
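# Usage sketch (illustrative; assumes a trained Keras model `model` whose
# penultimate layer carries the learned representation):
#
#   from keras.models import Model
#   pre_model = Model(inputs=model.input, outputs=model.layers[-2].output)
#   features = extract_features_from_nn(X_test, pre_model)  # (nb_obs, nb_features)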
def split_noise(df_, is_FlYellow = True):
    '''
    Split the noise between "small" and "big" noise particles using KMeans
    df_ (pd DataFrame): DataFrame containing Pulse data for all particles
    is_FlYellow (Bool): Whether the second fluorescence channel is 'Fl Yellow' (True) or 'FL Orange' (False)
    ---------------------------------------------------------------
    returns (pd DataFrame): The data with the noise split in two
    '''
    # Work on a copy to avoid modifying the original object
    df = deepcopy(df_)
    # Account for the name of the second fluorescence channel
    fl2 = 'Fl Yellow' if is_FlYellow else 'FL Orange'
noise_cells = df[df['cluster'] == 'Unassigned Particles']
noise_df = noise_cells.reset_index().groupby('Particle ID').agg(
{
'FWS': it.trapz,
'SWS': it.trapz,
fl2: it.trapz,
'FL Red': it.trapz,
'Curvature': it.trapz,
'cluster': lambda x: list(x)[0] # Fetch the name of the cluster for each particle
})
    # Select the integrated curves of the noise particles
    X = noise_df.iloc[:,:4]
    # Perform the KMeans on the (log) integrated FWS, SWS, FL2 and FLR
kmeans = KMeans(n_clusters=2, random_state=0, n_init = 100, algorithm= 'full').fit(np.log(X + 1E-16))
labels = kmeans.labels_
# Store the new labels in a new column
noise_df['cluster2'] = labels
    class_size = noise_df.groupby('cluster2')['FWS'].mean()
    noise_df['cluster2'] = noise_df['cluster2'].astype(str).replace(str(class_size.argmax()), 'sup1microm') # The class with the biggest mean FWS is sup1microm
    noise_df['cluster2'] = noise_df['cluster2'].astype(str).replace(str(class_size.argmin()), 'inf1microm') # The class with the smallest mean FWS is inf1microm
# Erase the old cluster labels
noise_cells = noise_cells.set_index('Particle ID').join(noise_df['cluster2'])
del(noise_cells['cluster'])
cols = list(noise_cells.columns)
cols[-1] = 'cluster'
noise_cells.columns = cols
# Replace the noise entries by the new noise entries
df = df.set_index('Particle ID')
non_noise_index = set(df.index) - set(noise_cells.index)
df = df.loc[non_noise_index]
df = pd.concat([df, noise_cells]).sort_index()
return df
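# Usage sketch (illustrative): relabel the 'Unassigned Particles' rows of a
# Pulse DataFrame into inf1microm / sup1microm noise classes.
#
#   df_split = split_noise(df, is_FlYellow=True)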
def centroid_sampling(X, y, sampling_strategy, columns, random_state = None):
    '''
    Per-class sampling strategy: the further a point lies from its class centroid,
    the more likely it is to be sampled
    X (pandas DataFrame): The integrated five curves issued by AFCM. Index: Filename and Particle ID
    y (pandas Series): The label of each particle issued by AFCM. Index: Filename and Particle ID
    sampling_strategy (dict): Keys: the cluster names, values: the number of observations to sample
    columns (list of str): The four integrated-curve columns used to compute the distances
    -----------------------------------------------------------------------------------------------
    returns (X, y): The undersampled X and y DataFrames
    '''
np.random.seed(random_state)
    # The cluster classes and their number
cluster_classes = sampling_strategy.keys()
nb_classes = len(cluster_classes)
# Storages
X_total_train = pd.DataFrame()
y_total_train = pd.DataFrame()
# Compute the centroids
centroids = X.join(y)[columns + ['cluster']].groupby('cluster').mean()
for i in range(nb_classes):
# Compute the distance to centroids for each group
cluster_label = centroids.index[i]
cluster_data = X[y == cluster_label]
obs = np.log(cluster_data[columns] + 1E-8).values
cc = np.log(centroids.iloc[i,:4].values.reshape(1,-1))
dm = cdist(obs, cc)
        # Compute the sampling probability (the outlier deletion below is disabled)
#q99 = np.quantile(dm, 0.99)
#dm = np.where(dm >= q99, 1E-2, dm)
dm = 1 / ((dm + 1E-8) ** 3)
p = dm / dm.sum()
# Sample the data
nb_obs = len(obs)
indices = range(nb_obs)
nb_to_sample = sampling_strategy[cluster_label]
sampled_indices = np.random.choice(a = indices, size = nb_to_sample, p = p.reshape(-1), replace = False)
sampled_data = cluster_data.iloc[sampled_indices]
        # Store the sampled data
        X_total_train = pd.concat([X_total_train, sampled_data])
        y_file = pd.DataFrame([cluster_label] * nb_to_sample, index = sampled_data.index, columns = ['cluster'])
        y_total_train = pd.concat([y_total_train, y_file])
return X_total_train, y_total_train
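# Usage sketch (illustrative; the column names below are hypothetical, any four
# integrated-curve columns of X work): draw 50 particles per class, favouring
# those far from their class centroid in log-space.
#
#   strategy = {'synechococcus': 50, 'picoeucaryote': 50}
#   cols = ['Total FWS', 'Total SWS', 'Total FLY', 'Total FLR']
#   X_s, y_s = centroid_sampling(X, y, strategy, cols, random_state=0)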
def quick_preprocessing(df, columns):
'''
Preprocess the curves for network predictions (short version of the data_preprocessing function).
df (pandas DataFrame): The total curves (as in the .parq files)
------------------------------------------------------------------
    returns (np.array, list): The formatted curves and the associated classes
    Note: some refactoring is needed
'''
max_len = 120
# Reformatting the values
obs_list = [] # The 5 curves
    y_list = [] # The class (e.g. 0 = prochlorococcus, 1 = ...)
for pid, obs in df.groupby('Particle ID'):
obs_list.append(obs[columns].values.T)
y_list.append(list(set(obs['cluster']))[0])
obs_list = interp_sequences(obs_list, max_len)
X = np.transpose(obs_list, (0, 2, 1))
return X, y_list
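# Minimal runnable smoke test for interp_sequences / quick_preprocessing on
# synthetic data (illustrative only: the curve column names are assumptions
# mirroring those used in split_noise).
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    curve_cols = ['FWS', 'SWS', 'Fl Yellow', 'FL Red', 'Curvature']
    frames = []
    for pid in range(4):
        n_points = int(rng.integers(10, 40)) # A different raw length per particle
        frame = pd.DataFrame(rng.random((n_points, 5)), columns = curve_cols)
        frame['Particle ID'] = pid
        frame['cluster'] = 'synechococcus' if pid % 2 == 0 else 'picoeucaryote'
        frames.append(frame)
    toy_df = pd.concat(frames, ignore_index = True)
    X_toy, y_toy = quick_preprocessing(toy_df, curve_cols)
    print(X_toy.shape) # (4, 120, 5): 4 particles, 120 interpolated steps, 5 curves
    print(y_toy) # ['synechococcus', 'picoeucaryote', 'synechococcus', 'picoeucaryote']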