Adds the scripts used for hyperparameter search
BjornBruns committed Aug 26, 2021
1 parent f41c5fd commit c7a9246
Showing 41 changed files with 6,559 additions and 1 deletion.
5 changes: 4 additions & 1 deletion README.md
@@ -29,7 +29,6 @@ Model validation:
Weights inspection:
- Deep learning EEG_Model inspection.ipynb (Inspection of the weights of a DL model, visualizations)


### Configuration file

The config_template.py file should be renamed to config.py. This file stores the paths to the file locations. The ROOT folder can simply be the root folder of this repository.
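As an illustration, a filled-in config.py could look like the following minimal sketch. ROOT and SECOND_ROOT are the names used in this repository; the paths are placeholders and any further variables in config_template.py may differ:

```python
# config.py -- illustrative sketch; see config_template.py for the actual variables.
import os

# ROOT can simply be the root folder of this repository.
ROOT = os.path.dirname(os.path.abspath(__file__))

# SECOND_ROOT pointed to an external hard disk in our setup (placeholder path).
SECOND_ROOT = "/media/external_hdd"
```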
@@ -48,6 +47,10 @@ We made use of a SECOND_ROOT, which was an external harddisk. On this harddisk t

The main folder of this repository also contains a few helper files, for example DataGenerators.
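For example, the training scripts in this commit instantiate such a generator as follows (taken from DL_final_BLSTM_regressor.py; judging by the argument name, gaussian_noise adds noise augmentation to the training batches):

```python
# Generator over the training subjects; each sample is 501 timepoints x 30 channels.
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)
```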

### Scripts

The scripts were used to train the models on an external cluster (SURFsara Lisa). They ran on the reduced DL data set and were used only for the hyperparameter search of the DL models; the final models were trained on the full DL data set.
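Each training script expects two positional arguments, the folder with the processed DL data and an output folder for the trained models, e.g. `python DL_final_BLSTM_regressor.py <data_dir> <output_dir>`; the accompanying `.sh` job files pass these arguments and are submitted to the SLURM scheduler with `sbatch`.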


## Data set

154 changes: 154 additions & 0 deletions scripts/DL_final_BLSTM_regressor.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

# ================ IMPORT LIBRARIES ================ #
import sys, os, fnmatch, time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

sys.path.insert(0, os.path.dirname(os.getcwd()))

from dataset_generator import DataGenerator

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

n_timesteps = 501
n_features = 30
n_outputs = 1

COUNT_MODEL = "FINAL" # This will be appended to the saved model's name. To make sure to not overwrite models, increase this.
MAX_QUEUE_SIZE = 5000
WORKERS = 6

input_shape = (n_timesteps, n_features)

# Input and output folders
PATH_DATA_PROCESSED_DL = sys.argv[1]
PATH_OUTPUT = sys.argv[2]

# ================ INITIAL LOGS ================ #

print("LOGGING: Imported all modules")

# ================ LOAD PREPROCESSED DATA ================ #

# Step 1: Get all the file names in the folder with the preprocessed DL data
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0] for file_name in fnmatch.filter(file_names, "*.zarr")]

# Step 3: Load all the metadata
frames = []

for idx, feature_file in enumerate(files):
    df_metadata = pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    frames.append(df_metadata)

df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group (in months) the subject is in
df_metadata['age_months'].fillna(df_metadata['age_group'], inplace=True)
df_metadata['age_days'].fillna(df_metadata['age_group']*30, inplace=True)
df_metadata['age_years'].fillna(df_metadata['age_group']/12, inplace=True)

# Step 5: List all the unique subject IDs
subject_ids = sorted(list(set(df_metadata["code"].tolist())))

# Step 6: Split the subjects into train (70%), val (15%) and test (15%) sets
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

# Step 7: Initialize DataGenerators
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)

val_generator = DataGenerator(list_IDs=IDs_val,
                              BASE_PATH=PATH_DATA_PROCESSED_DL,
                              metadata=df_metadata,
                              n_average=30,
                              batch_size=10,
                              iter_per_epoch=100,
                              n_timepoints=501,
                              n_channels=30,
                              shuffle=True)

print("LOGGING: Loaded all data and created generators")

# ================ BLSTM-LSTM model ================ #

try:
    def blstm_lstm_model():
        """ Returns the BLSTM-LSTM model from Kaushik et al. (2019). """

        # MARK: This model compresses too much in the last phase, check if possible to improve.

        model = keras.Sequential()

        # BLSTM layer
        model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=input_shape))
        model.add(Dropout(.2))
        model.add(BatchNormalization())

        # LSTM layer
        model.add(LSTM(128, return_sequences=True))
        model.add(BatchNormalization())

        # LSTM layer
        model.add(LSTM(64, return_sequences=False))
        model.add(BatchNormalization())

        # Fully connected layer
        model.add(Dense(32))

        model.add(Dense(n_outputs))

        return model

    model = blstm_lstm_model()

    optimizer = Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    output_filename = f'BLSTM_regressor_{COUNT_MODEL}'
    output_file = os.path.join(PATH_OUTPUT, output_filename)

    checkpointer = ModelCheckpoint(filepath=output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=250, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

    epochs = 1500

    print("LOGGING: Starting BLSTM-LSTM model training")
    # fit network
    history = model.fit(x=train_generator_noise,
                        validation_data=val_generator,
                        epochs=epochs,
                        verbose=2,
                        max_queue_size=MAX_QUEUE_SIZE,
                        workers=WORKERS,
                        callbacks=[checkpointer, earlystopper, reduce_lr])
    print("LOGGING: Finished BLSTM-LSTM model training")
except Exception as e:
    print("LOGGING: Failed BLSTM-LSTM model training:")
    print(e)
30 changes: 30 additions & 0 deletions scripts/DL_final_BLSTM_regressor.sh
@@ -0,0 +1,30 @@
#!/bin/bash
#Set job requirements
#SBATCH -p gpu_shared
#SBATCH --gpus=1
#SBATCH -t 30:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]

#Loading modules
module load 2020
module load Python/3.8.2-GCCcore-9.3.0
module load CUDA/11.0.2-GCC-9.3.0
module load cuDNN/8.0.3.33-gcccuda-2020a
module load TensorFlow/2.3.1-fosscuda-2020a-Python-3.8.2
pip install --user zarr
pip install --user pandas
pip install --user scikit-learn
pip install --user tensorflow
pip install --user tensorflow-addons

python $HOME/check-gpu.py
if [ $? -ne 0 ]; then
    exit 1
fi

#Copy input data to scratch
cp -r $HOME/data_processed_DL "$TMPDIR"

#Execute a Python program located in $HOME that takes an input directory and an output directory as arguments.
python $HOME/DL_final_BLSTM_regressor.py "$TMPDIR"/data_processed_DL $HOME/trained_models
154 changes: 154 additions & 0 deletions scripts/DL_final_CNN_regressor.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

# ================ IMPORT LIBRARIES ================ #
import sys, os, fnmatch, time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

sys.path.insert(0, os.path.dirname(os.getcwd()))

from dataset_generator import DataGenerator

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

n_timesteps = 501
n_features = 30
n_outputs = 1

COUNT_MODEL = "FINAL" # This will be appended to the saved model's name. To make sure to not overwrite models, increase this.
MAX_QUEUE_SIZE = 5000
WORKERS = 6

input_shape = (n_timesteps, n_features)

# Input and output folders
PATH_DATA_PROCESSED_DL = sys.argv[1]
PATH_OUTPUT = sys.argv[2]

# ================ INITIAL LOGS ================ #

print("LOGGING: Imported all modules")

# ================ LOAD PREPROCESSED DATA ================ #

# Step 1: Get all the file names in the folder with the preprocessed DL data
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0] for file_name in fnmatch.filter(file_names, "*.zarr")]

# Step 3: Load all the metadata
frames = []

for idx, feature_file in enumerate(files):
    df_metadata = pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    frames.append(df_metadata)

df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group (in months) the subject is in
df_metadata['age_months'].fillna(df_metadata['age_group'], inplace=True)
df_metadata['age_days'].fillna(df_metadata['age_group']*30, inplace=True)
df_metadata['age_years'].fillna(df_metadata['age_group']/12, inplace=True)

# Step 5: List all the unique subject IDs
subject_ids = sorted(list(set(df_metadata["code"].tolist())))

# Step 6: Split the subjects into train (70%), val (15%) and test (15%) sets
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

# Step 7: Initialize DataGenerators
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)

val_generator = DataGenerator(list_IDs=IDs_val,
                              BASE_PATH=PATH_DATA_PROCESSED_DL,
                              metadata=df_metadata,
                              n_average=30,
                              batch_size=10,
                              iter_per_epoch=100,
                              n_timepoints=501,
                              n_channels=30,
                              shuffle=True)

print("LOGGING: Loaded all data and created generators")

# ================ Convolutional neural network model ================ #

try:
    def cnn_model():
        """ Returns the CNN (FCN) model from Ismail Fawaz et al. (2019). """

        input_layer = keras.layers.Input(input_shape)

        conv1 = keras.layers.Conv1D(filters=128, kernel_size=8, padding='same')(input_layer)
        conv1 = keras.layers.BatchNormalization()(conv1)
        conv1 = keras.layers.Activation(activation='relu')(conv1)

        conv2 = keras.layers.Conv1D(filters=256, kernel_size=5, padding='same')(conv1)
        conv2 = keras.layers.BatchNormalization()(conv2)
        conv2 = keras.layers.Activation('relu')(conv2)

        conv3 = keras.layers.Conv1D(128, kernel_size=3, padding='same')(conv2)
        conv3 = keras.layers.BatchNormalization()(conv3)
        conv3 = keras.layers.Activation('relu')(conv3)

        gap_layer = keras.layers.GlobalAveragePooling1D()(conv3)

        output_layer = keras.layers.Dense(1)(gap_layer)

        model = keras.models.Model(inputs=input_layer, outputs=output_layer)

        return model

    model = cnn_model()

    optimizer = Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    output_filename = f'CNN_regressor_{COUNT_MODEL}'
    output_file = os.path.join(PATH_OUTPUT, output_filename)

    checkpointer = ModelCheckpoint(filepath=output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=250, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

    epochs = 2000

    print("LOGGING: Starting Convolutional neural network model training")

    # fit network
    history = model.fit(x=train_generator_noise,
                        validation_data=val_generator,
                        epochs=epochs,
                        verbose=2,
                        max_queue_size=MAX_QUEUE_SIZE,
                        workers=WORKERS,
                        callbacks=[checkpointer, earlystopper, reduce_lr])

    print("LOGGING: Finished Convolutional neural network model training")
except Exception as e:
    print("LOGGING: Failed Convolutional neural network model training:")
    print(e)
30 changes: 30 additions & 0 deletions scripts/DL_final_CNN_regressor.sh
@@ -0,0 +1,30 @@
#!/bin/bash
#Set job requirements
#SBATCH -p gpu_shared
#SBATCH --gpus=1
#SBATCH -t 30:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH [email protected]

#Loading modules
module load 2020
module load Python/3.8.2-GCCcore-9.3.0
module load CUDA/11.0.2-GCC-9.3.0
module load cuDNN/8.0.3.33-gcccuda-2020a
module load TensorFlow/2.3.1-fosscuda-2020a-Python-3.8.2
pip install --user zarr
pip install --user pandas
pip install --user scikit-learn
pip install --user tensorflow
pip install --user tensorflow-addons

python $HOME/check-gpu.py
if [ $? -ne 0 ]; then
    exit 1
fi

#Copy input data to scratch
cp -r $HOME/data_processed_DL "$TMPDIR"

#Execute a Python program located in $HOME that takes an input directory and an output directory as arguments.
python $HOME/DL_final_CNN_regressor.py "$TMPDIR"/data_processed_DL $HOME/trained_models
