Commit c7a9246
Adds the scripts used for hyperparameter search
1 parent f41c5fd
Showing 41 changed files with 6,559 additions and 1 deletion.
DL_final_BLSTM_regressor.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

# ================ IMPORT LIBRARIES ================ #
import sys, os, fnmatch, time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

sys.path.insert(0, os.path.dirname(os.getcwd()))

from dataset_generator import DataGenerator

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

n_timesteps = 501
n_features = 30
n_outputs = 1

COUNT_MODEL = "FINAL"  # Appended to the saved model's name; change it to avoid overwriting earlier models.
MAX_QUEUE_SIZE = 5000
WORKERS = 6

input_shape = (n_timesteps, n_features)

# Input and output folders
PATH_DATA_PROCESSED_DL = sys.argv[1]
PATH_OUTPUT = sys.argv[2]

# ================ INITIAL LOGS ================ #

print("LOGGING: Imported all modules")

# ================ LOAD PREPROCESSED DATA ================ #

# Step 1: Get all the files in the input folder
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0] for file_name in fnmatch.filter(file_names, "*.zarr")]

# Step 3: Load all the metadata
frames = []

for idx, feature_file in enumerate(files):
    df_metadata = pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    frames.append(df_metadata)

df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group the subject is in
df_metadata['age_months'] = df_metadata['age_months'].fillna(df_metadata['age_group'])
df_metadata['age_days'] = df_metadata['age_days'].fillna(df_metadata['age_group'] * 30)
df_metadata['age_years'] = df_metadata['age_years'].fillna(df_metadata['age_group'] / 12)

# Step 5: List all the unique subject IDs
subject_ids = sorted(set(df_metadata["code"].tolist()))

# Step 6: Split the subjects (not the recordings) into train, val and test
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

# Step 7: Initialize DataGenerators
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)

val_generator = DataGenerator(list_IDs=IDs_val,
                              BASE_PATH=PATH_DATA_PROCESSED_DL,
                              metadata=df_metadata,
                              n_average=30,
                              batch_size=10,
                              iter_per_epoch=100,
                              n_timepoints=501,
                              n_channels=30,
                              shuffle=True)

print("LOGGING: Loaded all data and created generators")

# ================ BLSTM-LSTM model ================ #

try:
    def blstm_lstm_model():
        """Returns the BLSTM-LSTM model from Kaushik et al. (2019)."""

        # MARK: This model compresses too much in the last phase; check whether it can be improved.

        model = keras.Sequential()

        # BLSTM layer
        model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=input_shape))
        model.add(Dropout(0.2))
        model.add(BatchNormalization())

        # LSTM layer
        model.add(LSTM(128, return_sequences=True))
        model.add(BatchNormalization())

        # LSTM layer
        model.add(LSTM(64, return_sequences=False))
        model.add(BatchNormalization())

        # Fully connected layers
        model.add(Dense(32))
        model.add(Dense(n_outputs))

        return model

    model = blstm_lstm_model()

    optimizer = Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    output_filename = f'BLSTM_regressor_{COUNT_MODEL}'
    output_file = os.path.join(PATH_OUTPUT, output_filename)

    # Keep only the best model (lowest validation loss), stop after 250 stale epochs,
    # and halve the learning rate after 50 epochs without improvement.
    checkpointer = ModelCheckpoint(filepath=output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=250, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

    epochs = 1500

    print("LOGGING: Starting BLSTM-LSTM model training")
    # fit network
    history = model.fit(x=train_generator_noise,
                        validation_data=val_generator,
                        epochs=epochs,
                        verbose=2,
                        max_queue_size=MAX_QUEUE_SIZE,
                        workers=WORKERS,
                        callbacks=[checkpointer, earlystopper, reduce_lr])
    print("LOGGING: Finished BLSTM-LSTM model training")
except Exception as e:
    print("LOGGING: Failed BLSTM-LSTM model training:")
    print(e)
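Note: both training scripts import DataGenerator from a local dataset_generator module that is not part of this commit. For orientation, here is a minimal sketch of the interface the constructor calls above assume, written as a keras.utils.Sequence. The per-subject zarr layout, the epoch-averaging step, and the age_months regression target are assumptions for illustration, not the repository's actual implementation.

import os
import numpy as np
import zarr
from tensorflow import keras

class DataGenerator(keras.utils.Sequence):
    """Hypothetical sketch of the generator interface used by the scripts above."""

    def __init__(self, list_IDs, BASE_PATH, metadata, n_average=30,
                 batch_size=10, gaussian_noise=0.0, iter_per_epoch=30,
                 n_timepoints=501, n_channels=30, shuffle=True):
        self.list_IDs = list(list_IDs)
        self.BASE_PATH = BASE_PATH
        self.metadata = metadata
        self.n_average = n_average
        self.batch_size = batch_size
        self.gaussian_noise = gaussian_noise
        self.iter_per_epoch = iter_per_epoch
        self.n_timepoints = n_timepoints
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch.
        return self.iter_per_epoch

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.list_IDs)

    def __getitem__(self, index):
        X = np.empty((self.batch_size, self.n_timepoints, self.n_channels))
        y = np.empty((self.batch_size,))
        subjects = np.random.choice(self.list_IDs, self.batch_size)
        for i, subject in enumerate(subjects):
            # Assumed layout: one (n_epochs, n_timepoints, n_channels) zarr array per subject.
            store = zarr.open(os.path.join(self.BASE_PATH, f"processed_raw_{subject}.zarr"), mode="r")
            epochs = np.asarray(store)
            picks = np.random.choice(len(epochs), self.n_average)
            X[i] = epochs[picks].mean(axis=0)  # average n_average epochs into one input
            # Assumed regression target: age in months from the metadata table.
            y[i] = self.metadata.loc[self.metadata["code"] == subject, "age_months"].iloc[0]
        if self.gaussian_noise:
            X += np.random.normal(0.0, self.gaussian_noise, X.shape)
        return X, y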
@@ -0,0 +1,30 @@
#!/bin/bash
# Set job requirements
#SBATCH -p gpu_shared
#SBATCH --gpus=1
#SBATCH -t 30:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH --mail-user=[email protected]

# Load modules
module load 2020
module load Python/3.8.2-GCCcore-9.3.0
module load CUDA/11.0.2-GCC-9.3.0
module load cuDNN/8.0.3.33-gcccuda-2020a
module load TensorFlow/2.3.1-fosscuda-2020a-Python-3.8.2
pip install --user zarr
pip install --user pandas
pip install --user scikit-learn
pip install --user tensorflow
pip install --user tensorflow-addons

# Abort the job early if no GPU is available
python $HOME/check-gpu.py
if [ $? -ne 0 ]; then
    exit 1
fi

# Copy input data to scratch
cp -r $HOME/data_processed_DL "$TMPDIR"

# Execute the Python program in $HOME, passing the input and output directories as arguments
python $HOME/DL_final_BLSTM_regressor.py "$TMPDIR"/data_processed_DL $HOME/trained_models
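Note: check-gpu.py is not included in this commit; judging by the guard above, it exits non-zero when no GPU is usable. A minimal stand-in, assuming a TensorFlow 2.x environment, could be:

#!/usr/bin/env python
# Hypothetical stand-in for $HOME/check-gpu.py: exit 1 when TensorFlow sees no GPU.
import sys
import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
print(f"GPUs visible to TensorFlow: {gpus}")
sys.exit(0 if gpus else 1)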
DL_final_CNN_regressor.py
@@ -0,0 +1,154 @@
#!/usr/bin/env python

# ================ IMPORT LIBRARIES ================ #
import sys, os, fnmatch, time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

sys.path.insert(0, os.path.dirname(os.getcwd()))

from dataset_generator import DataGenerator

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

n_timesteps = 501
n_features = 30
n_outputs = 1

COUNT_MODEL = "FINAL"  # Appended to the saved model's name; change it to avoid overwriting earlier models.
MAX_QUEUE_SIZE = 5000
WORKERS = 6

input_shape = (n_timesteps, n_features)

# Input and output folders
PATH_DATA_PROCESSED_DL = sys.argv[1]
PATH_OUTPUT = sys.argv[2]

# ================ INITIAL LOGS ================ #

print("LOGGING: Imported all modules")

# ================ LOAD PREPROCESSED DATA ================ #

# Step 1: Get all the files in the input folder
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0] for file_name in fnmatch.filter(file_names, "*.zarr")]

# Step 3: Load all the metadata
frames = []

for idx, feature_file in enumerate(files):
    df_metadata = pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    frames.append(df_metadata)

df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group the subject is in
df_metadata['age_months'] = df_metadata['age_months'].fillna(df_metadata['age_group'])
df_metadata['age_days'] = df_metadata['age_days'].fillna(df_metadata['age_group'] * 30)
df_metadata['age_years'] = df_metadata['age_years'].fillna(df_metadata['age_group'] / 12)

# Step 5: List all the unique subject IDs
subject_ids = sorted(set(df_metadata["code"].tolist()))

# Step 6: Split the subjects (not the recordings) into train, val and test
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

# Step 7: Initialize DataGenerators
train_generator_noise = DataGenerator(list_IDs=IDs_train,
                                      BASE_PATH=PATH_DATA_PROCESSED_DL,
                                      metadata=df_metadata,
                                      n_average=30,
                                      batch_size=10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch=30,
                                      n_timepoints=501,
                                      n_channels=30,
                                      shuffle=True)

val_generator = DataGenerator(list_IDs=IDs_val,
                              BASE_PATH=PATH_DATA_PROCESSED_DL,
                              metadata=df_metadata,
                              n_average=30,
                              batch_size=10,
                              iter_per_epoch=100,
                              n_timepoints=501,
                              n_channels=30,
                              shuffle=True)

print("LOGGING: Loaded all data and created generators")

# ================ Convolutional neural network model ================ #

try:
    def cnn_model():
        """Returns the CNN (FCN) model from Ismail Fawaz et al. (2019)."""

        input_layer = keras.layers.Input(input_shape)

        conv1 = keras.layers.Conv1D(filters=128, kernel_size=8, padding='same')(input_layer)
        conv1 = keras.layers.BatchNormalization()(conv1)
        conv1 = keras.layers.Activation(activation='relu')(conv1)

        conv2 = keras.layers.Conv1D(filters=256, kernel_size=5, padding='same')(conv1)
        conv2 = keras.layers.BatchNormalization()(conv2)
        conv2 = keras.layers.Activation('relu')(conv2)

        conv3 = keras.layers.Conv1D(128, kernel_size=3, padding='same')(conv2)
        conv3 = keras.layers.BatchNormalization()(conv3)
        conv3 = keras.layers.Activation('relu')(conv3)

        # Global average pooling keeps the regression head small
        gap_layer = keras.layers.GlobalAveragePooling1D()(conv3)

        output_layer = keras.layers.Dense(1)(gap_layer)

        model = keras.models.Model(inputs=input_layer, outputs=output_layer)

        return model

    model = cnn_model()

    optimizer = Adam(learning_rate=0.01)

    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

    output_filename = f'CNN_regressor_{COUNT_MODEL}'
    output_file = os.path.join(PATH_OUTPUT, output_filename)

    # Keep only the best model (lowest validation loss), stop after 250 stale epochs,
    # and halve the learning rate after 50 epochs without improvement.
    checkpointer = ModelCheckpoint(filepath=output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=250, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

    epochs = 2000

    print("LOGGING: Starting Convolutional neural network model training")

    # fit network
    history = model.fit(x=train_generator_noise,
                        validation_data=val_generator,
                        epochs=epochs,
                        verbose=2,
                        max_queue_size=MAX_QUEUE_SIZE,
                        workers=WORKERS,
                        callbacks=[checkpointer, earlystopper, reduce_lr])

    print("LOGGING: Finished Convolutional neural network model training")
except Exception as e:
    print("LOGGING: Failed Convolutional neural network model training:")
    print(e)
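Note: both scripts create IDs_test but never use it, so the held-out subjects are presumably evaluated elsewhere. As an illustrative follow-up (not part of this commit), the best checkpoint could be scored on those subjects with a generator mirroring val_generator:

from tensorflow import keras

# Hypothetical evaluation of the best checkpoint on the held-out test subjects.
model = keras.models.load_model(output_file + ".hdf5")

test_generator = DataGenerator(list_IDs=IDs_test,
                               BASE_PATH=PATH_DATA_PROCESSED_DL,
                               metadata=df_metadata,
                               n_average=30,
                               batch_size=10,
                               iter_per_epoch=100,
                               n_timepoints=501,
                               n_channels=30,
                               shuffle=False)

# evaluate() returns the loss followed by the compiled metrics, in order.
loss, rmse, mae = model.evaluate(test_generator, verbose=1)
print(f"Test MSE: {loss:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")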
@@ -0,0 +1,30 @@
#!/bin/bash
# Set job requirements
#SBATCH -p gpu_shared
#SBATCH --gpus=1
#SBATCH -t 30:00:00
#SBATCH --mail-type=BEGIN,END
#SBATCH --mail-user=[email protected]

# Load modules
module load 2020
module load Python/3.8.2-GCCcore-9.3.0
module load CUDA/11.0.2-GCC-9.3.0
module load cuDNN/8.0.3.33-gcccuda-2020a
module load TensorFlow/2.3.1-fosscuda-2020a-Python-3.8.2
pip install --user zarr
pip install --user pandas
pip install --user scikit-learn
pip install --user tensorflow
pip install --user tensorflow-addons

# Abort the job early if no GPU is available
python $HOME/check-gpu.py
if [ $? -ne 0 ]; then
    exit 1
fi

# Copy input data to scratch
cp -r $HOME/data_processed_DL "$TMPDIR"

# Execute the Python program in $HOME, passing the input and output directories as arguments
python $HOME/DL_final_CNN_regressor.py "$TMPDIR"/data_processed_DL $HOME/trained_models