Adds the scripts used for hyperparameter search
BjornBruns committed Aug 26, 2021
1 parent f41c5fd commit c7a9246
Showing 41 changed files with 6,559 additions and 1 deletion.
5 changes: 4 additions & 1 deletion README.md
@@ -29,7 +29,6 @@ Model validation:
Weights inspection:
- Deep learning EEG_Model inspection.ipynb (Inspection of the weights of a DL model, visualizations)


### Configuration file

The config_template.py file should be renamed to config.py. This file stores the paths to the file locations. The ROOT folder can simply be the root folder of this repository.
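As an illustration, a filled-in config.py could look like the following minimal sketch. ROOT and SECOND_ROOT are the names used in this repository; the paths are placeholders and any further variables in config_template.py may differ:

```python
# config.py -- illustrative sketch; see config_template.py for the actual variables.
import os

# ROOT can simply be the root folder of this repository.
ROOT = os.path.dirname(os.path.abspath(__file__))

# SECOND_ROOT pointed to an external hard disk in our setup (placeholder path).
SECOND_ROOT = "/media/external_hdd"
```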
@@ -48,6 +47,10 @@ We made use of a SECOND_ROOT, which was an external harddisk. On this harddisk t

The main folder of this repository also contains a few helper files, for example DataGenerators.
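For example, the training scripts in this commit instantiate such a generator as follows (taken from DL_final_BLSTM_regressor.py; judging by the argument name, gaussian_noise adds noise augmentation to the training batches):

```python
# Generator over the training subjects; each sample is 501 timepoints x 30 channels.
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)
```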

### Scripts

The scripts were used to train the models on an external cluster (SURFsara Lisa). They ran on the reduced DL data set and were used only for the hyperparameter search of the DL models; the final models were trained on the full DL data set.
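Each training script expects two positional arguments, the folder with the processed DL data and an output folder for the trained models, e.g. `python DL_final_BLSTM_regressor.py <data_dir> <output_dir>`; the accompanying `.sh` job files pass these arguments and are submitted to the SLURM scheduler with `sbatch`.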


## Data set

154 changes: 154 additions & 0 deletions scripts/DL_final_BLSTM_regressor.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

# ================ IMPORT LIBRARIES ================ #
import sys, os, fnmatch, time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

sys.path.insert(0, os.path.dirname(os.getcwd()))

from dataset_generator import DataGenerator

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

n_timesteps = 501
n_features = 30
n_outputs = 1

COUNT_MODEL = "FINAL" # This will be appended to the saved model's name. To make sure to not overwrite models, increase this.
MAX_QUEUE_SIZE = 5000
WORKERS = 6

input_shape = (n_timesteps, n_features)

# Input and output folders
PATH_DATA_PROCESSED_DL = sys.argv[1]
PATH_OUTPUT = sys.argv[2]

# ================ INITIAL LOGS ================ #

print("LOGGING: Imported all modules")

# ================ LOAD PREPROCESSED DATA ================ #

# Step 1: Get all the file names in the folder with the preprocessed DL data
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0] for file_name in fnmatch.filter(file_names, "*.zarr")]

# Step 3: Load all the metadata
frames = []

for idx, feature_file in enumerate(files):
    df_metadata = pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    frames.append(df_metadata)

df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group (in months) the subject is in
df_metadata['age_months'].fillna(df_metadata['age_group'], inplace=True)
df_metadata['age_days'].fillna(df_metadata['age_group']*30, inplace=True)
df_metadata['age_years'].fillna(df_metadata['age_group']/12, inplace=True)

# Step 5: List all the unique subject IDs
subject_ids = sorted(list(set(df_metadata["code"].tolist())))

# Step 6: Split the subjects into train (70%), val (15%) and test (15%) sets
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

# Step 7: Initialize DataGenerators
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)

val_generator = DataGenerator(list_IDs=IDs_val,
                              BASE_PATH=PATH_DATA_PROCESSED_DL,
                              metadata=df_metadata,
                              n_average=30,
                              batch_size=10,
                              iter_per_epoch=100,
                              n_timepoints=501,
                              n_channels=30,
                              shuffle=True)

print("LOGGING: Loaded all data and created generators")

# ================ BLSTM-LSTM model ================ #

try:
    def blstm_lstm_model():
        """ Returns the BLSTM-LSTM model from Kaushik et al. (2019). """

        # MARK: This model compresses too much in the last phase, check if possible to improve.

        model = keras.Sequential()

        # BLSTM layer
        model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=input_shape))
        model.add(Dropout(.2))
        model.add(BatchNormalization())

        # LSTM layer
        model.add(LSTM(128, return_sequences=True))
        model.add(BatchNormalization())

        # LSTM layer
        model.add(LSTM(64, return_sequences=False))
        model.add(BatchNormalization())

        # Fully connected layer
        model.add(Dense(32))

        model.add(Dense(n_outputs))

        return model

    model = blstm_lstm_model()

    optimizer = Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    output_filename = f'BLSTM_regressor_{COUNT_MODEL}'
    output_file = os.path.join(PATH_OUTPUT, output_filename)

    checkpointer = ModelCheckpoint(filepath=output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=250, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

    epochs = 1500

    print("LOGGING: Starting BLSTM-LSTM model training")
    # fit network
    history = model.fit(x=train_generator_noise,
                        validation_data=val_generator,
                        epochs=epochs,
                        verbose=2,
                        max_queue_size=MAX_QUEUE_SIZE,
                        workers=WORKERS,
                        callbacks=[checkpointer, earlystopper, reduce_lr])
    print("LOGGING: Finished BLSTM-LSTM model training")
except Exception as e:
    print("LOGGING: Failed BLSTM-LSTM model training:")
    print(e)
30 changes: 30 additions & 0 deletions scripts/DL_final_BLSTM_regressor.sh
@@ -0,0 +1,30 @@
#!/bin/bash
#Set job requirements
#SBATCH -p gpu_shared
#SBATCH --gpus=1
#SBATCH -t 30:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]

#Loading modules
module load 2020
module load Python/3.8.2-GCCcore-9.3.0
module load CUDA/11.0.2-GCC-9.3.0
module load cuDNN/8.0.3.33-gcccuda-2020a
module load TensorFlow/2.3.1-fosscuda-2020a-Python-3.8.2
pip install --user zarr
pip install --user pandas
pip install --user scikit-learn
pip install --user tensorflow
pip install --user tensorflow-addons

python $HOME/check-gpu.py
if [ $? -ne 0 ]; then
    exit 1
fi

#Copy input data to scratch
cp -r $HOME/data_processed_DL "$TMPDIR"

#Execute a Python program located in $HOME that takes an input directory and an output directory as arguments.
python $HOME/DL_final_BLSTM_regressor.py "$TMPDIR"/data_processed_DL $HOME/trained_models
154 changes: 154 additions & 0 deletions scripts/DL_final_CNN_regressor.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

# ================ IMPORT LIBRARIES ================ #
import sys, os, fnmatch, time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

sys.path.insert(0, os.path.dirname(os.getcwd()))

from dataset_generator import DataGenerator

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

n_timesteps = 501
n_features = 30
n_outputs = 1

COUNT_MODEL = "FINAL" # This will be appended to the saved model's name. To make sure to not overwrite models, increase this.
MAX_QUEUE_SIZE = 5000
WORKERS = 6

input_shape = (n_timesteps, n_features)

# Input and output folders
PATH_DATA_PROCESSED_DL = sys.argv[1]
PATH_OUTPUT = sys.argv[2]

# ================ INITIAL LOGS ================ #

print("LOGGING: Imported all modules")

# ================ LOAD PREPROCESSED DATA ================ #

# Step 1: Get all the file names in the folder with the preprocessed DL data
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0] for file_name in fnmatch.filter(file_names, "*.zarr")]

# Step 3: Load all the metadata
frames = []

for idx, feature_file in enumerate(files):
    df_metadata = pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    frames.append(df_metadata)

df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group (in months) the subject is in
df_metadata['age_months'].fillna(df_metadata['age_group'], inplace=True)
df_metadata['age_days'].fillna(df_metadata['age_group']*30, inplace=True)
df_metadata['age_years'].fillna(df_metadata['age_group']/12, inplace=True)

# Step 5: List all the unique subject IDs
subject_ids = sorted(list(set(df_metadata["code"].tolist())))

# Step 6: Split the subjects into train (70%), val (15%) and test (15%) sets
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

# Step 7: Initialize DataGenerators
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)

val_generator = DataGenerator(list_IDs=IDs_val,
                              BASE_PATH=PATH_DATA_PROCESSED_DL,
                              metadata=df_metadata,
                              n_average=30,
                              batch_size=10,
                              iter_per_epoch=100,
                              n_timepoints=501,
                              n_channels=30,
                              shuffle=True)

print("LOGGING: Loaded all data and created generators")

# ================ Convolutional neural network model ================ #

try:
    def cnn_model():
        """ Returns the CNN (FCN) model from Ismail Fawaz et al. (2019). """

        input_layer = keras.layers.Input(input_shape)

        conv1 = keras.layers.Conv1D(filters=128, kernel_size=8, padding='same')(input_layer)
        conv1 = keras.layers.BatchNormalization()(conv1)
        conv1 = keras.layers.Activation(activation='relu')(conv1)

        conv2 = keras.layers.Conv1D(filters=256, kernel_size=5, padding='same')(conv1)
        conv2 = keras.layers.BatchNormalization()(conv2)
        conv2 = keras.layers.Activation('relu')(conv2)

        conv3 = keras.layers.Conv1D(128, kernel_size=3, padding='same')(conv2)
        conv3 = keras.layers.BatchNormalization()(conv3)
        conv3 = keras.layers.Activation('relu')(conv3)

        gap_layer = keras.layers.GlobalAveragePooling1D()(conv3)

        output_layer = keras.layers.Dense(1)(gap_layer)

        model = keras.models.Model(inputs=input_layer, outputs=output_layer)

        return model

    model = cnn_model()

    optimizer = Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    output_filename = f'CNN_regressor_{COUNT_MODEL}'
    output_file = os.path.join(PATH_OUTPUT, output_filename)

    checkpointer = ModelCheckpoint(filepath=output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=250, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

    epochs = 2000

    print("LOGGING: Starting Convolutional neural network model training")

    # fit network
    history = model.fit(x=train_generator_noise,
                        validation_data=val_generator,
                        epochs=epochs,
                        verbose=2,
                        max_queue_size=MAX_QUEUE_SIZE,
                        workers=WORKERS,
                        callbacks=[checkpointer, earlystopper, reduce_lr])

    print("LOGGING: Finished Convolutional neural network model training")
except Exception as e:
    print("LOGGING: Failed Convolutional neural network model training:")
    print(e)
30 changes: 30 additions & 0 deletions scripts/DL_final_CNN_regressor.sh
@@ -0,0 +1,30 @@
#!/bin/bash
#Set job requirements
#SBATCH -p gpu_shared
#SBATCH --gpus=1
#SBATCH -t 30:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]

#Loading modules
module load 2020
module load Python/3.8.2-GCCcore-9.3.0
module load CUDA/11.0.2-GCC-9.3.0
module load cuDNN/8.0.3.33-gcccuda-2020a
module load TensorFlow/2.3.1-fosscuda-2020a-Python-3.8.2
pip install --user zarr
pip install --user pandas
pip install --user scikit-learn
pip install --user tensorflow
pip install --user tensorflow-addons

python $HOME/check-gpu.py
if [ $? -ne 0 ]; then
    exit 1
fi

#Copy input data to scratch
cp -r $HOME/data_processed_DL "$TMPDIR"

#Execute a Python program located in $HOME that takes an input directory and an output directory as arguments.
python $HOME/DL_final_CNN_regressor.py "$TMPDIR"/data_processed_DL $HOME/trained_models
