From 5f7fc3c815a66110dfedff3c5d3b005bb1a8848a Mon Sep 17 00:00:00 2001
From: "V.Malefioudakis"
Date: Tue, 20 Feb 2024 15:31:13 +0200
Subject: [PATCH 01/59] refactor: delete unused functions

---
 GANDLF/cli/main_run.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GANDLF/cli/main_run.py b/GANDLF/cli/main_run.py
index f9676f76b..dbef803fe 100644
--- a/GANDLF/cli/main_run.py
+++ b/GANDLF/cli/main_run.py
@@ -9,7 +9,6 @@
     populate_header_in_parameters,
     parseTrainingCSV,
     parseTestingCSV,
-    set_determinism,
 )

From d56dbefa04b2736d67c69abc8a313a96c9df3f90 Mon Sep 17 00:00:00 2001
From: "V.Malefioudakis"
Date: Tue, 20 Feb 2024 16:16:44 +0200
Subject: [PATCH 02/59] refactor: delete unused imports

Deleted the unused imports os and pickle, as reported by Codacy.
---
 GANDLF/cli/main_run.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GANDLF/cli/main_run.py b/GANDLF/cli/main_run.py
index dbef803fe..45e303254 100644
--- a/GANDLF/cli/main_run.py
+++ b/GANDLF/cli/main_run.py
@@ -1,4 +1,3 @@
-import os, pickle
 from typing import Optional
 from pathlib import Path

From d7d8e1df640e3e92b12f8ff0c849e17888d53b55 Mon Sep 17 00:00:00 2001
From: "V.Malefioudakis"
Date: Tue, 20 Feb 2024 16:27:47 +0200
Subject: [PATCH 03/59] refactor: delete unused functions

Deleted the unused "MSELoss" and "L1Loss" imports from "torch.nn", as
reported by Codacy.
---
 GANDLF/losses/regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py
index bd7911895..49cb3c32d 100644
--- a/GANDLF/losses/regression.py
+++ b/GANDLF/losses/regression.py
@@ -1,7 +1,7 @@
 from typing import Optional
 import torch
 import torch.nn.functional as F
-from torch.nn import MSELoss, CrossEntropyLoss, L1Loss
+from torch.nn import CrossEntropyLoss
 from GANDLF.utils import one_hot

From c9d1f476c65cd1c601fc9e086200cf4c954d533a Mon Sep 17 00:00:00 2001
From: "V.Malefioudakis"
Date: Tue, 20 Feb 2024 16:38:01 +0200
Subject: [PATCH 04/59] refactor: Delete unused variables

Commented out the unused parameters_temp variable, as reported by Codacy.
---
 testing/test_full.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/testing/test_full.py b/testing/test_full.py
index e6e003467..03a0f7107 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -475,7 +475,7 @@ def test_train_regression_brainage_rad_2d(device):
     parameters["model"]["architecture"] = "brain_age"
     parameters["model"]["onnx_export"] = False
     parameters["model"]["print_summary"] = False
-    parameters_temp = copy.deepcopy(parameters)
+    # parameters_temp = copy.deepcopy(parameters)
     parameters = populate_header_in_parameters(parameters, parameters["headers"])
     sanitize_outputDir()
     TrainingManager(
@@ -747,7 +747,7 @@ def test_train_inference_optimize_classification_rad_3d(device):
     parameters["model"]["architecture"] = all_models_regression[0]
     parameters["model"]["onnx_export"] = False
     parameters["model"]["print_summary"] = False
-    parameters_temp = copy.deepcopy(parameters)
+    # parameters_temp = copy.deepcopy(parameters)
     sanitize_outputDir()
     TrainingManager(
         dataframe=training_data,

From 9392917a4f4cd156e07edd7d526ad861672fe811 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 10:31:05 -0400
Subject: [PATCH 05/59] added check for stratified k-fold

---
 GANDLF/training_manager.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py
index 41c188d74..9dcdb0486 100644
--- a/GANDLF/training_manager.py
+++ b/GANDLF/training_manager.py
@@ -69,7 +69,11 @@ def TrainingManager(
         # put 2 just so that the first for-loop does not fail
         parameters["nested_training"]["testing"] = 2

-    # initialize the kfold structures
+    # check if stratified k-fold is requested
+    assert not parameters["nested_training"].get(
+        "stratified"
+    ), "Stratified k-fold can only be performed through offline data splitting - see gandlf_dataSplitter for more information."
+
     kf_testing = KFold(n_splits=parameters["nested_training"]["testing"])
     kf_validation = KFold(n_splits=parameters["nested_training"]["validation"])
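For illustration only (toy data, not part of the patch series): the stratified splitting that the following patches wire in uses sklearn's StratifiedKFold, which keeps the label ratio roughly constant in every fold, while the plain KFold used above does not.

    from sklearn.model_selection import KFold, StratifiedKFold
    import numpy as np

    X = np.arange(12).reshape(-1, 1)
    y = np.array([0] * 8 + [1] * 4)  # imbalanced toy labels

    for name, splitter in [
        ("KFold", KFold(n_splits=4)),
        ("StratifiedKFold", StratifiedKFold(n_splits=4)),
    ]:
        # fraction of positive labels in each test fold
        ratios = [y[test].mean() for _, test in splitter.split(X, y)]
        print(name, ratios)  # stratified folds all keep ~1/3 positives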
From b201b2ea9374bb58c87eac39f93033e62fb61ba3 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 10:31:32 -0400
Subject: [PATCH 06/59] initialize stratified split to false

---
 GANDLF/config_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/GANDLF/config_manager.py b/GANDLF/config_manager.py
index 80e147064..9532156de 100644
--- a/GANDLF/config_manager.py
+++ b/GANDLF/config_manager.py
@@ -634,6 +634,9 @@ def _parseConfig(
         "nested_training" in params
     ), "The parameter 'nested_training' needs to be defined"
     # initialize defaults for nested training
+    params["nested_training"]["stratified"] = params["nested_training"].get(
+        "stratified", False
+    )
     params["nested_training"]["testing"] = params["nested_training"].get("testing", -5)
     params["nested_training"]["validation"] = params["nested_training"].get(
         "validation", -5

From 17e7c8126eb6208cbaffefa0ef69788dcd0a5180 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 10:32:17 -0400
Subject: [PATCH 07/59] added check for another word

---
 GANDLF/config_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/GANDLF/config_manager.py b/GANDLF/config_manager.py
index 9532156de..81809829e 100644
--- a/GANDLF/config_manager.py
+++ b/GANDLF/config_manager.py
@@ -637,6 +637,9 @@ def _parseConfig(
     params["nested_training"]["stratified"] = params["nested_training"].get(
         "stratified", False
     )
+    params["nested_training"]["stratified"] = params["nested_training"].get(
+        "proportional", False
+    )
     params["nested_training"]["testing"] = params["nested_training"].get("testing", -5)
     params["nested_training"]["validation"] = params["nested_training"].get(
         "validation", -5

From ae2532915e0a03949801aa54a8c9d4a284a6416d Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 10:33:22 -0400
Subject: [PATCH 08/59] added notes in config

---
 samples/config_all_options.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samples/config_all_options.yaml b/samples/config_all_options.yaml
index 7eb85cbd4..ba21517b1 100644
--- a/samples/config_all_options.yaml
+++ b/samples/config_all_options.yaml
@@ -138,6 +138,7 @@ optimizer: adam
 # for train on a single fold, use '-' before the fold number to make the number of folds "negative" -- NOT recommended
 nested_training:
   {
+    stratified: False, # this will perform stratified k-fold cross-validation but only with offline data splitting, see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
     testing: 5, # this controls the number of testing data folds for final model evaluation; [NOT recommended] to disable this, use '1'
     validation: 5 # this controls the number of validation data folds to be used for model *selection* during training (not used for back-propagation)
   }
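For illustration only (not part of the patch series): the single-fold convention mentioned in the config comment above — a negative fold count means "split |n| ways, but train only the first fold" — can be sketched as the handling the training code applies:

    def interpret_folds(n: int):
        """A negative fold count means: use abs(n) folds, but train only the first."""
        return abs(n), n < 0

    print(interpret_folds(5))   # (5, False) -> train all 5 folds
    print(interpret_folds(-5))  # (5, True)  -> same 5-way split, stop after fold 0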
From c39714ef0a69056b8014cb44c66a0c1db82b4cad Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:35:15 -0400
Subject: [PATCH 09/59] added the logic for data splitting as a separate module

---
 GANDLF/utils/__init__.py      |   2 +
 GANDLF/utils/data_splitter.py | 228 ++++++++++++++++++++++++++++++++++
 2 files changed, 230 insertions(+)
 create mode 100644 GANDLF/utils/data_splitter.py

diff --git a/GANDLF/utils/__init__.py b/GANDLF/utils/__init__.py
index 311edeed8..66d830d3d 100644
--- a/GANDLF/utils/__init__.py
+++ b/GANDLF/utils/__init__.py
@@ -66,3 +66,5 @@
     save_model,
     optimize_and_save_model,
 )
+
+from .data_splitter import split_data

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
new file mode 100644
index 000000000..6d7555e6c
--- /dev/null
+++ b/GANDLF/utils/data_splitter.py
@@ -0,0 +1,228 @@
+from typing import List, Tuple
+import pandas as pd
+from sklearn.model_selection import KFold, StratifiedKFold
+
+from . import parseTrainingCSV, populate_header_in_parameters
+
+
+def split_data(
+    full_dataset: pd.DataFrame, parameters: dict
+) -> List[Tuple[Tuple[int, int], pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
+    """
+    Split the data into training, validation, and testing sets.
+
+    Args:
+        full_dataset (pd.DataFrame): The full dataset to split.
+        parameters (dict): The parameters to use for splitting the data, which should contain the "nested_training" key with relevant information.
+
+    Returns:
+        List[Tuple[Tuple[int, int], pd.DataFrame, pd.DataFrame, pd.DataFrame]]: A list of tuples, each containing a tuple of the testing & validation split indeces, and the training, validation, and testing sets.
+    """
+    assert (
+        "nested_training" in parameters
+    ), "`nested_training` key missing in parameters"
+    # populate the headers
+    _, parameters["headers"] = (
+        parseTrainingCSV(full_dataset) if "headers" not in parameters else (_, _)
+    )
+
+    parameters = (
+        populate_header_in_parameters(parameters, parameters["headers"])
+        if "problem_type" not in parameters
+        else parameters
+    )
+
+    return_data = []
+
+    # check for single fold training
+    singleFoldValidation = False
+    singleFoldTesting = False
+    # if the user wants a single fold training
+    testing_folds = parameters["nested_training"]["testing"]
+    if testing_folds < 0:
+        testing_folds = abs(testing_folds)
+        singleFoldTesting = True
+
+    # if the user wants a single fold training
+    validation_folds = parameters["nested_training"]["validation"]
+    if validation_folds < 0:
+        validation_folds = abs(validation_folds)
+        singleFoldValidation = True
+
+    # this is the condition where testing data is not to be kept
+    noTestingData = False
+    if testing_folds == 1:
+        noTestingData = True
+        singleFoldTesting = True
+        # put 2 just so that the first for-loop does not fail
+        testing_folds = 2
+        print(
+            "WARNING: Testing data is empty, which will result in scientifically incorrect results; use at your own risk."
+        )
+
+    # get unique subject IDs
+    subjectIDs_full = (
+        full_dataset[full_dataset.columns[parameters["headers"]["subjectIDHeader"]]]
+        .unique()
+        .tolist()
+    )
+
+    all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index)
+
+    assert (
+        all_subjects_are_unique and parameters["nested_training"]["stratified"]
+    ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
+
+    # get the targets for prediction for classification
+    target = False  # initialize this so that the downstream code does not fail - for KFold, this is shuffle
+    if parameters["problem_type"] == "classification":
+        target = full_dataset.loc[
+            :, full_dataset.columns[parameters["headers"]["predictionHeaders"]]
+        ]
+
+    folding_type = KFold
+    if parameters["nested_training"]["stratified"]:
+        folding_type = StratifiedKFold
+
+    kf_testing = folding_type(n_splits=testing_folds)
+    kf_validation = folding_type(n_splits=validation_folds)
+
+    # start StratifiedKFold splitting
+    currentTestingFold = 0
+    if parameters["nested_training"]["stratified"]:
+        for trainAndVal_index, testing_index in kf_testing.split(full_dataset, target):
+            # ensure the validation fold is initialized per-testing split
+            currentValidationFold = 0
+
+            trainingAndValidationData = pd.DataFrame()  # initialize the variable
+            testingData = pd.DataFrame()  # initialize the variable
+            # get the current training and testing data
+            if noTestingData:
+                # don't consider the split indeces for this case
+                trainingAndValidationData = full_dataset
+                testingData = None
+            else:
+                trainingAndValidationData = full_dataset.loc[trainAndVal_index, :]
+                testingData = full_dataset.loc[testing_index, :]
+
+            for train_index, val_index in kf_validation.split(
+                trainingAndValidationData, target
+            ):
+                # get the current training and validation data
+                trainingData = trainingAndValidationData.loc[train_index, :]
+                validationData = trainingAndValidationData.loc[val_index, :]
+                return_data.append(
+                    (
+                        (currentTestingFold, currentValidationFold),
+                        trainingData,
+                        validationData,
+                        testingData,
+                    )
+                )
+                if singleFoldValidation:
+                    break
+                currentValidationFold += 1  # increment the validation fold
+
+            if singleFoldTesting:
+                break
+            currentTestingFold += 1  # increment the testing fold
+
+        currentTestingFold = 0
+
+    # start the kFold train for testing
+    for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full, target):
+        # ensure the validation fold is initialized per-testing split
+        currentValidationFold = 0
+
+        trainingAndValidationData = pd.DataFrame()  # initialize the variable
+        testingData = pd.DataFrame()  # initialize the variable
+        # get the current training and testing data
+        if noTestingData:
+            # don't consider the split indeces for this case
+            trainingAndValidationData = full_dataset
+        else:
+            # loop over all trainAndVal_index and construct new dataframe
+            for subject_idx in trainAndVal_index:
+                trainingAndValidationData = trainingAndValidationData._append(
+                    full_dataset[
+                        full_dataset[
+                            full_dataset.columns[
+                                parameters["headers"]["subjectIDHeader"]
+                            ]
+                        ]
+                        == subjectIDs_full[subject_idx]
+                    ]
+                )
+
+            # loop over all testing_index and construct new dataframe
+            for subject_idx in testing_index:
+                testingData = testingData._append(
+                    full_dataset[
+                        full_dataset[
+                            full_dataset.columns[
+                                parameters["headers"]["subjectIDHeader"]
+                            ]
+                        ]
+                        == subjectIDs_full[subject_idx]
+                    ]
+                )
+
+            current_training_subject_indeces_full = (
+                trainingAndValidationData[
+                    trainingAndValidationData.columns[
+                        parameters["headers"]["subjectIDHeader"]
+                    ]
+                ]
+                .unique()
+                .tolist()
+            )
+
+        # start the kFold train for validation
+        for train_index, val_index in kf_validation.split(
+            current_training_subject_indeces_full
+        ):
+            trainingData = pd.DataFrame()  # initialize the variable
+            validationData = pd.DataFrame()  # initialize the variable
+
+            # loop over all train_index and construct new dataframe
+            for subject_idx in train_index:
+                trainingData = trainingData._append(
+                    full_dataset[
+                        full_dataset[
+                            full_dataset.columns[
+                                parameters["headers"]["subjectIDHeader"]
+                            ]
+                        ]
+                        == subjectIDs_full[subject_idx]
+                    ]
+                )
+
+            # loop over all val_index and construct new dataframe
+            for subject_idx in val_index:
+                validationData = validationData._append(
+                    full_dataset[
+                        full_dataset[
+                            full_dataset.columns[
+                                parameters["headers"]["subjectIDHeader"]
+                            ]
+                        ]
+                        == subjectIDs_full[subject_idx]
+                    ]
+                )
+
+            return_data.append(
+                (
+                    (currentTestingFold, currentValidationFold),
+                    trainingData,
+                    validationData,
+                    testingData,
+                )
+            )
+
+            if singleFoldValidation:
+                break
+            currentValidationFold += 1  # go to next fold
+
+        if singleFoldTesting:
+            break
+        currentTestingFold += 1  # go to next fold
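For illustration only (hypothetical CSV path and minimal parameters; the dataframe must follow GaNDLF's training-CSV layout): the intended usage of the new split_data helper, reflecting its interface once the missing return statement is added later in this series.

    import pandas as pd
    from GANDLF.utils import split_data

    full_df = pd.read_csv("train.csv")  # hypothetical GaNDLF-format CSV
    parameters = {"nested_training": {"testing": 5, "validation": 5, "stratified": False}}
    # each entry pairs the (testing, validation) fold indeces with the three dataframes
    for (test_fold, val_fold), train_df, val_df, test_df in split_data(full_df, parameters):
        print(test_fold, val_fold, len(train_df), len(val_df), len(test_df))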
From 1d9d9f862b61d34b4758777ab34f341fe891dd38 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:35:49 -0400
Subject: [PATCH 10/59] using the new module

---
 GANDLF/training_manager.py | 332 +++++++++----------------------------
 1 file changed, 74 insertions(+), 258 deletions(-)

diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py
index 9dcdb0486..3ef08de4e 100644
--- a/GANDLF/training_manager.py
+++ b/GANDLF/training_manager.py
@@ -1,10 +1,9 @@
 import pandas as pd
-import os, sys, pickle, subprocess, shutil
-from sklearn.model_selection import KFold
+import os, pickle, shutil
 from pathlib import Path

 from GANDLF.compute import training_loop
-from GANDLF.utils import get_dataframe
+from GANDLF.utils import get_dataframe, split_data


 def TrainingManager(
@@ -44,273 +43,90 @@ def TrainingManager(
     )
     parameters = pickle.load(open(currentModelConfigPickle, "rb"))

-    # check for single fold training
-    singleFoldValidation = False
-    singleFoldTesting = False
-    noTestingData = False
-    # if the user wants a single fold training
-    if parameters["nested_training"]["testing"] < 0:
-        parameters["nested_training"]["testing"] = abs(
-            parameters["nested_training"]["testing"]
-        )
-        singleFoldTesting = True
-
-    # if the user wants a single fold training
-    if parameters["nested_training"]["validation"] < 0:
-        parameters["nested_training"]["validation"] = abs(
-            parameters["nested_training"]["validation"]
-        )
-        singleFoldValidation = True
-
-    # this is the condition where testing data is not to be kept
-    if parameters["nested_training"]["testing"] == 1:
-        noTestingData = True
-        singleFoldTesting = True
-        # put 2 just so that the first for-loop does not fail
-        parameters["nested_training"]["testing"] = 2
-
-    # check if stratified k-fold is requested
-    assert not parameters["nested_training"].get(
-        "stratified"
-    ), "Stratified k-fold can only be performed through offline data splitting - see gandlf_dataSplitter for more information."
+    dataframe_split = split_data(dataframe, parameters)

-    kf_testing = KFold(n_splits=parameters["nested_training"]["testing"])
-    kf_validation = KFold(n_splits=parameters["nested_training"]["validation"])
+    last_indeces, _, _, _ = dataframe_split[-1]

-    currentTestingFold = 0
-
-    # split across subjects
-    subjectIDs_full = (
-        dataframe[dataframe.columns[parameters["headers"]["subjectIDHeader"]]]
-        .unique()
-        .tolist()
-    )
-
-    # get the indeces for kfold splitting
-    trainingData_full = dataframe
-
-    # start the kFold train for testing
-    for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full):
-        # ensure the validation fold is initialized per-testing split
-        currentValidationFold = 0
-
-        trainingAndValidationData = pd.DataFrame()  # initialize the variable
-        testingData = pd.DataFrame()  # initialize the variable
-        # get the current training and testing data
-        if noTestingData:
-            # don't consider the split indeces for this case
-            trainingAndValidationData = trainingData_full
-            testingData = None
-        else:
-            # loop over all trainAndVal_index and construct new dataframe
-            for subject_idx in trainAndVal_index:
-                trainingAndValidationData = trainingAndValidationData._append(
-                    trainingData_full[
-                        trainingData_full[
-                            trainingData_full.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
-
-            # loop over all testing_index and construct new dataframe
-            for subject_idx in testing_index:
-                testingData = testingData._append(
-                    trainingData_full[
-                        trainingData_full[
-                            trainingData_full.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
+    # check the last indeces to see if single fold training is requested
+    singleFoldTesting = True if last_indeces[0] == 0 else False

+    for (
+        testing_and_valid_indeces,
+        trainingData,
+        validationData,
+        testingData,
+    ) in dataframe_split:
         # the output of the current fold is only needed if multi-fold training is happening
-        if singleFoldTesting:
-            currentOutputFolder = outputDir
-        else:
-            currentOutputFolder = os.path.join(
-                outputDir, "testing_" + str(currentTestingFold)
+        currentTestingOutputFolder = outputDir
+        if not singleFoldTesting:
+            currentTestingOutputFolder = os.path.join(
+                outputDir, "testing_" + str(testing_and_valid_indeces[0])
             )
-            Path(currentOutputFolder).mkdir(parents=True, exist_ok=True)
+            Path(currentTestingOutputFolder).mkdir(parents=True, exist_ok=True)

-        # save the current training+validation and testing datasets
-        if noTestingData:
-            print(
-                "WARNING: Testing data is empty, which will result in scientifically incorrect results; use at your own risk."
-            )
-            current_training_subject_indeces_full = subjectIDs_full
-            currentTestingDataPickle = "None"
-        else:
-            currentTrainingAndValidationDataPickle = os.path.join(
-                currentOutputFolder, "data_trainAndVal.pkl"
-            )
-            currentTestingDataPickle = os.path.join(
-                currentOutputFolder, "data_testing.pkl"
+        currentValidationOutputFolder = os.path.join(
+            currentTestingOutputFolder, str(testing_and_valid_indeces[1])
+        )
+        Path(currentValidationOutputFolder).mkdir(parents=True, exist_ok=True)
+
+        # initialize the dataframes and save them to disk
+        data_dict = {
+            "training": trainingData,
+            "validation": validationData,
+            "testing": testingData,
+        }
+        data_dict_files = {}
+        for data_type, data in data_dict.items():
+            currentDataPickle = os.path.join(
+                currentValidationOutputFolder, "data_" + data_type + ".pkl"
             )
-
-            if (not os.path.exists(currentTestingDataPickle)) or reset or resume:
-                testingData.to_pickle(currentTestingDataPickle)
+            data_dict_files[data_type] = currentDataPickle
+            if (not os.path.exists(currentDataPickle)) or reset or resume:
+                data.to_pickle(currentDataPickle)
+                data.to_csv(currentDataPickle.replace(".pkl", ".csv"), index=False)
             else:
-                if os.path.exists(currentTestingDataPickle):
-                    print(
-                        "Using previously saved testing data",
-                        currentTestingDataPickle,
-                        flush=True,
-                    )
-                    testingData = pd.read_pickle(currentTestingDataPickle)
-
-            if (
-                (not os.path.exists(currentTrainingAndValidationDataPickle))
-                or reset
-                or resume
-            ):
-                trainingAndValidationData.to_pickle(
-                    currentTrainingAndValidationDataPickle
-                )
-            else:
-                if os.path.exists(currentTrainingAndValidationDataPickle):
-                    print(
-                        "Using previously saved training+validation data",
-                        currentTrainingAndValidationDataPickle,
-                        flush=True,
-                    )
-                    trainingAndValidationData = pd.read_pickle(
-                        currentTrainingAndValidationDataPickle
-                    )
-
-            current_training_subject_indeces_full = (
-                trainingAndValidationData[
-                    trainingAndValidationData.columns[
-                        parameters["headers"]["subjectIDHeader"]
-                    ]
-                ]
-                .unique()
-                .tolist()
+                # read the data from the pickle if present
+                data_dict[data_type] = get_dataframe(currentDataPickle)
+
+        # parallel_compute_command is an empty string, thus no parallel computing requested
+        if not parameters["parallel_compute_command"]:
+            training_loop(
+                training_data=data_dict["training"],
+                validation_data=data_dict["validation"],
+                output_dir=currentValidationOutputFolder,
+                device=device,
+                params=parameters,
+                testing_data=data_dict["testing"],
             )

-        # start the kFold train for validation
-        for train_index, val_index in kf_validation.split(
-            current_training_subject_indeces_full
-        ):
-            # the output of the current fold is only needed if multi-fold training is happening
-            if singleFoldValidation:
-                currentValOutputFolder = currentOutputFolder
-            else:
-                currentValOutputFolder = os.path.join(
-                    currentOutputFolder, str(currentValidationFold)
-                )
-                Path(currentValOutputFolder).mkdir(parents=True, exist_ok=True)
-
-            trainingData = pd.DataFrame()  # initialize the variable
-            validationData = pd.DataFrame()  # initialize the variable
-
-            # loop over all train_index and construct new dataframe
-            for subject_idx in train_index:
-                trainingData = trainingData._append(
-                    trainingData_full[
-                        trainingData_full[
-                            trainingData_full.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
-
-            # loop over all val_index and construct new dataframe
-            for subject_idx in val_index:
-                validationData = validationData._append(
-                    trainingData_full[
-                        trainingData_full[
-                            trainingData_full.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
-
-            # # write parameters to pickle - this should not change for the different folds, so keeping is independent
-            ## pickle/unpickle data
-            # pickle the data
-            currentTrainingDataPickle = os.path.join(
-                currentValOutputFolder, "data_training.pkl"
-            )
-            currentValidationDataPickle = os.path.join(
-                currentValOutputFolder, "data_validation.pkl"
+        else:
+            # call hpc command here
+            parallel_compute_command_actual = parameters[
+                "parallel_compute_command"
+            ].replace("${outputDir}", currentValidationOutputFolder)
+
+            assert (
+                "python" in parallel_compute_command_actual
+            ), "The 'parallel_compute_command_actual' needs to have the python from the virtual environment, which is usually '${GANDLF_dir}/venv/bin/python'"
+
+            command = (
+                parallel_compute_command_actual
+                + " -m GANDLF.training_loop -train_loader_pickle "
+                + data_dict_files["training"]
+                + " -val_loader_pickle "
+                + data_dict_files["validation"]
+                + " -parameter_pickle "
+                + currentModelConfigPickle
+                + " -device "
+                + str(device)
+                + " -outputDir "
+                + currentValidationOutputFolder
+                + " -testing_loader_pickle "
+                + data_dict_files["testing"]
             )
-            if (not os.path.exists(currentTrainingDataPickle)) or reset or resume:
-                trainingData.to_pickle(currentTrainingDataPickle)
-                trainingData.to_csv(
-                    currentTrainingDataPickle.replace(".pkl", ".csv"), index=False
-                )
-            else:
-                trainingData = get_dataframe(currentTrainingDataPickle)
-            if (not os.path.exists(currentValidationDataPickle)) or reset or resume:
-                validationData.to_pickle(currentValidationDataPickle)
-                validationData.to_csv(
-                    currentValidationDataPickle.replace(".pkl", ".csv"), index=False
-                )
-            else:
-                validationData = get_dataframe(currentValidationDataPickle)
-
-            # parallel_compute_command is an empty string, thus no parallel computing requested
-            if (not parameters["parallel_compute_command"]) or (singleFoldValidation):
-                training_loop(
-                    training_data=trainingData,
-                    validation_data=validationData,
-                    output_dir=currentValOutputFolder,
-                    device=device,
-                    params=parameters,
-                    testing_data=testingData,
-                )
-
-            else:
-                # call qsub here
-                parallel_compute_command_actual = parameters[
-                    "parallel_compute_command"
-                ].replace("${outputDir}", currentValOutputFolder)
-
-                if not ("python" in parallel_compute_command_actual):
-                    sys.exit(
-                        "The 'parallel_compute_command_actual' needs to have the python from the virtual environment, which is usually '${GANDLF_dir}/venv/bin/python'"
-                    )
-
-                command = (
-                    parallel_compute_command_actual
-                    + " -m GANDLF.training_loop -train_loader_pickle "
-                    + currentTrainingDataPickle
-                    + " -val_loader_pickle "
-                    + currentValidationDataPickle
-                    + " -parameter_pickle "
-                    + currentModelConfigPickle
-                    + " -device "
-                    + str(device)
-                    + " -outputDir "
-                    + currentValOutputFolder
-                    + " -testing_loader_pickle "
-                    + currentTestingDataPickle
-                )
-
-                print(
-                    "Submitting job for testing split "
-                    + str(currentTestingFold)
-                    + " and validation split "
-                    + str(currentValidationFold)
-                )
-                subprocess.Popen(command, shell=True).wait()
-
-            if singleFoldValidation:
-                break
-            currentValidationFold += 1  # go to next fold
-
-        if singleFoldTesting:
-            break
-        currentTestingFold += 1  # go to next fold
+            print("Running command: ", command, flush=True)
+            os.system(command, flush=True)


 def TrainingManager_split(
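For illustration only (hypothetical output directory and fold counts): with multi-fold training, the refactored TrainingManager writes each fold's artifacts under outputDir/testing_<t>/<v>/, one pickle and CSV per data split.

    import os

    output_dir = "model_output"  # hypothetical output directory
    for t in range(2):  # testing folds
        for v in range(2):  # validation folds
            fold_dir = os.path.join(output_dir, "testing_" + str(t), str(v))
            for data_type in ["training", "validation", "testing"]:
                print(os.path.join(fold_dir, "data_" + data_type + ".pkl"))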
From 339c20d04024d43f56012f4cd4b2a13fe98141c9 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:38:03 -0400
Subject: [PATCH 11/59] added test case for stratified

---
 testing/test_full.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/testing/test_full.py b/testing/test_full.py
index e6e003467..be9f6e9c6 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -837,6 +837,7 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device):
     parameters["patch_size"] = patch_size["3D"]
     parameters["model"]["dimension"] = 3
     parameters["model"]["final_layer"] = "logits"
+    parameters["nested_training"]["stratified"] = True

     # read and parse csv
     training_data, parameters["headers"] = parseTrainingCSV(

From 5671a30839bcac05773b17dd97091386fcd1a0ca Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:54:26 -0400
Subject: [PATCH 12/59] fixed indent

---
 GANDLF/utils/data_splitter.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 6d7555e6c..f37cce07f 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -167,15 +167,15 @@ def split_data(
                 ]
             )

-            current_training_subject_indeces_full = (
-                trainingAndValidationData[
-                    trainingAndValidationData.columns[
-                        parameters["headers"]["subjectIDHeader"]
-                    ]
-                ]
-                .unique()
-                .tolist()
-            )
+        current_training_subject_indeces_full = (
+            trainingAndValidationData[
+                trainingAndValidationData.columns[
+                    parameters["headers"]["subjectIDHeader"]
+                ]
+            ]
+            .unique()
+            .tolist()
+        )

From 8770b69a10627263b73a302503eeacd79cb6e9c5 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:56:00 -0400
Subject: [PATCH 13/59] apparently, `(_, _)` is not valid syntax

---
 GANDLF/utils/data_splitter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index f37cce07f..9a00f7856 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -23,7 +23,8 @@ def split_data(
     ), "`nested_training` key missing in parameters"
     # populate the headers
     _, parameters["headers"] = (
-        parseTrainingCSV(full_dataset) if "headers" not in parameters else (_, _)
+        parseTrainingCSV(full_dataset) if "headers" not in parameters else full_dataset,
+        parameters["headers"],
     )

From aafc5b7b11b65568ceead56f6d74f3b71bd26323 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Thu, 21 Mar 2024 15:57:19 -0400
Subject: [PATCH 14/59] fixed default

---
 GANDLF/config_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/config_manager.py b/GANDLF/config_manager.py
index 81809829e..d6b30d08c 100644
--- a/GANDLF/config_manager.py
+++ b/GANDLF/config_manager.py
@@ -638,7 +638,7 @@ def _parseConfig(
         "stratified", False
     )
     params["nested_training"]["stratified"] = params["nested_training"].get(
-        "proportional", False
+        "proportional", params["nested_training"]["stratified"]
    )
     params["nested_training"]["testing"] = params["nested_training"].get("testing", -5)
     params["nested_training"]["validation"] = params["nested_training"].get(
         "validation", -5

From a21e74a154998e64258e7091b1e0e0117066f812 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 09:26:04 -0400
Subject: [PATCH 15/59] updated check

---
 GANDLF/utils/data_splitter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 9a00f7856..6f6a127ec 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -71,7 +71,7 @@ def split_data(
     all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index)

     assert (
-        all_subjects_are_unique and parameters["nested_training"]["stratified"]
+        all_subjects_are_unique or not parameters["nested_training"]["stratified"]
     ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
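For illustration only (plain logic, no GaNDLF imports): the corrected guard reads as the implication "stratified splitting requires unique subject IDs"; the earlier `and` wrongly failed whenever stratification was simply disabled.

    for unique in (True, False):
        for stratified in (True, False):
            ok = unique or not stratified
            print(f"unique={unique}, stratified={stratified} -> assertion passes: {ok}")
    # Only unique=False with stratified=True fails, which is exactly the disallowed case;
    # `unique and stratified` would instead have failed for every non-stratified run.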
From 1e24831bcea3b2814056cbc471310cc1623876c6 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 09:47:25 -0400
Subject: [PATCH 16/59] syntax fix

---
 GANDLF/utils/data_splitter.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 6f6a127ec..31477c0d0 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -128,10 +128,8 @@ def split_data(
             break
         currentTestingFold += 1  # increment the testing fold

-        currentTestingFold = 0
-
     # start the kFold train for testing
-    for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full, target):
+    for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full):

From 137eda6d304b7bb04f2a26498d77955d3a689b40 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 10:01:03 -0400
Subject: [PATCH 17/59] forgot to return :facepalm:

---
 GANDLF/utils/data_splitter.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 31477c0d0..3ab04fdb8 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -225,3 +225,5 @@ def split_data(
         if singleFoldTesting:
             break
         currentTestingFold += 1  # go to next fold
+
+    return return_data

From f896a1a0cd6eea764d951144164c24cb27de14cd Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 10:25:12 -0400
Subject: [PATCH 18/59] ensure `testingData` gets returned as `None` when it is
 not defined

---
 GANDLF/utils/data_splitter.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 3ab04fdb8..3d239cebf 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -95,13 +95,15 @@ def split_data(
             # ensure the validation fold is initialized per-testing split
             currentValidationFold = 0

-            trainingAndValidationData = pd.DataFrame()  # initialize the variable
-            testingData = pd.DataFrame()  # initialize the variable
+            trainingAndValidationData, testingData = (
+                pd.DataFrame(),
+                pd.DataFrame(),
+            )  # initialize the variables
             # get the current training and testing data
             if noTestingData:
                 # don't consider the split indeces for this case
                 trainingAndValidationData = full_dataset
-                testingData = None
+                testingData = None  # this should be None to ensure downstream code does not fail
             else:
                 trainingAndValidationData = full_dataset.loc[trainAndVal_index, :]
                 testingData = full_dataset.loc[testing_index, :]
@@ -133,12 +135,15 @@ def split_data(
         # ensure the validation fold is initialized per-testing split
         currentValidationFold = 0

-        trainingAndValidationData = pd.DataFrame()  # initialize the variable
-        testingData = pd.DataFrame()  # initialize the variable
+        trainingAndValidationData, testingData = (
+            pd.DataFrame(),
+            pd.DataFrame(),
+        )  # initialize the variables
        # get the current training and testing data
         if noTestingData:
             # don't consider the split indeces for this case
             trainingAndValidationData = full_dataset
+            testingData = None  # this should be None to ensure downstream code does not fail
         else:
             # loop over all trainAndVal_index and construct new dataframe
             for subject_idx in trainAndVal_index:

From f597c59e977a997a1e5c7a70bdd94223f9842442 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 11:36:08 -0400
Subject: [PATCH 19/59] added check for `None`

---
 GANDLF/compute/training_loop.py |  2 +-
 GANDLF/training_manager.py      | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index d129757e3..2534b52ce 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -244,7 +244,7 @@ def training_loop(
     params["validation_data"] = validation_data
     params["testing_data"] = testing_data
     testingDataDefined = True
-    if params["testing_data"] is None:
+    if (params["testing_data"] is None) or (params["testing_data"] == ""):
         # testing_data = validation_data
         testingDataDefined = False

diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py
index 3ef08de4e..f582de432 100644
--- a/GANDLF/training_manager.py
+++ b/GANDLF/training_manager.py
@@ -77,16 +77,18 @@ def TrainingManager(
         }
         data_dict_files = {}
         for data_type, data in data_dict.items():
-            currentDataPickle = os.path.join(
-                currentValidationOutputFolder, "data_" + data_type + ".pkl"
-            )
-            data_dict_files[data_type] = currentDataPickle
-            if (not os.path.exists(currentDataPickle)) or reset or resume:
-                data.to_pickle(currentDataPickle)
-                data.to_csv(currentDataPickle.replace(".pkl", ".csv"), index=False)
-            else:
-                # read the data from the pickle if present
-                data_dict[data_type] = get_dataframe(currentDataPickle)
+            data_dict_files[data_type] = ""
+            if data is not None:
+                currentDataPickle = os.path.join(
+                    currentValidationOutputFolder, "data_" + data_type + ".pkl"
+                )
+                data_dict_files[data_type] = currentDataPickle
+                if (not os.path.exists(currentDataPickle)) or reset or resume:
+                    data.to_pickle(currentDataPickle)
+                    data.to_csv(currentDataPickle.replace(".pkl", ".csv"), index=False)
+                else:
+                    # read the data from the pickle if present
+                    data_dict[data_type] = get_dataframe(currentDataPickle)

From 6f5c6fad6bef7210aef687116c7fc052905d8dcd Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 11:54:45 -0400
Subject: [PATCH 20/59] ensuring that checks are confined to `None` and are not
 bleeding into an additional datatype (i.e., `str`)

---
 GANDLF/compute/training_loop.py | 6 +++---
 GANDLF/training_manager.py      | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py
index 2534b52ce..c3f356032 100644
--- a/GANDLF/compute/training_loop.py
+++ b/GANDLF/compute/training_loop.py
@@ -244,9 +244,9 @@ def training_loop(
     params["validation_data"] = validation_data
     params["testing_data"] = testing_data
     testingDataDefined = True
-    if (params["testing_data"] is None) or (params["testing_data"] == ""):
-        # testing_data = validation_data
-        testingDataDefined = False
+    if not isinstance(testing_data, pd.DataFrame):
+        if params["testing_data"] is None:
+            testingDataDefined = False

diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py
index f582de432..03c27085a 100644
--- a/GANDLF/training_manager.py
+++ b/GANDLF/training_manager.py
@@ -77,7 +77,7 @@ def TrainingManager(
         }
         data_dict_files = {}
         for data_type, data in data_dict.items():
-            data_dict_files[data_type] = ""
+            data_dict_files[data_type] = None
             if data is not None:
                 currentDataPickle = os.path.join(
                     currentValidationOutputFolder, "data_" + data_type + ".pkl"
From b93bd038eb2c2ac51856ea66b1f4df31dbb91b7e Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 14:02:35 -0400
Subject: [PATCH 21/59] added check for singleFoldValidation

---
 GANDLF/training_manager.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py
index 03c27085a..e605af6f6 100644
--- a/GANDLF/training_manager.py
+++ b/GANDLF/training_manager.py
@@ -49,6 +49,7 @@ def TrainingManager(

     # check the last indeces to see if single fold training is requested
     singleFoldTesting = True if last_indeces[0] == 0 else False
+    singleFoldValidation = True if last_indeces[1] == 0 else False

     for (
         testing_and_valid_indeces,
@@ -64,10 +65,12 @@ def TrainingManager(
             )
             Path(currentTestingOutputFolder).mkdir(parents=True, exist_ok=True)

-        currentValidationOutputFolder = os.path.join(
-            currentTestingOutputFolder, str(testing_and_valid_indeces[1])
-        )
-        Path(currentValidationOutputFolder).mkdir(parents=True, exist_ok=True)
+        currentValidationOutputFolder = currentTestingOutputFolder
+        if not singleFoldValidation:
+            currentValidationOutputFolder = os.path.join(
+                currentTestingOutputFolder, str(testing_and_valid_indeces[1])
+            )
+            Path(currentValidationOutputFolder).mkdir(parents=True, exist_ok=True)

From d15842cef5e71df45b478812928948e825cddcf0 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 15:13:37 -0400
Subject: [PATCH 22/59] added `else` after stratified

---
 GANDLF/utils/data_splitter.py | 162 +++++++++++++++++-----------------
 1 file changed, 81 insertions(+), 81 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 3d239cebf..079692f90 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -122,113 +122,113 @@ def split_data(
                         testingData,
                     )
                 )
+                currentValidationFold += 1  # increment the validation fold
                 if singleFoldValidation:
                     break
-                currentValidationFold += 1  # increment the validation fold

+            currentTestingFold += 1  # increment the testing fold
             if singleFoldTesting:
                 break
-            currentTestingFold += 1  # increment the testing fold

-    # start the kFold train for testing
-    for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full):
-        # ensure the validation fold is initialized per-testing split
-        currentValidationFold = 0
+    else:
+        # start the kFold train for testing
+        for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full):
+            # ensure the validation fold is initialized per-testing split
+            currentValidationFold = 0

-        trainingAndValidationData, testingData = (
-            pd.DataFrame(),
-            pd.DataFrame(),
-        )  # initialize the variables
-        # get the current training and testing data
-        if noTestingData:
-            # don't consider the split indeces for this case
-            trainingAndValidationData = full_dataset
-            testingData = None  # this should be None to ensure downstream code does not fail
-        else:
-            # loop over all trainAndVal_index and construct new dataframe
-            for subject_idx in trainAndVal_index:
-                trainingAndValidationData = trainingAndValidationData._append(
-                    full_dataset[
-                        full_dataset[
-                            full_dataset.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
-
-            # loop over all testing_index and construct new dataframe
-            for subject_idx in testing_index:
-                testingData = testingData._append(
-                    full_dataset[
-                        full_dataset[
-                            full_dataset.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
+            trainingAndValidationData, testingData = (
+                pd.DataFrame(),
+                pd.DataFrame(),
+            )  # initialize the variables
+            # get the current training and testing data
+            if noTestingData:
+                # don't consider the split indeces for this case
+                trainingAndValidationData = full_dataset
+                testingData = None  # this should be None to ensure downstream code does not fail
+            else:
+                # loop over all trainAndVal_index and construct new dataframe
+                for subject_idx in trainAndVal_index:
+                    trainingAndValidationData = trainingAndValidationData._append(
+                        full_dataset[
+                            full_dataset[
+                                full_dataset.columns[
+                                    parameters["headers"]["subjectIDHeader"]
+                                ]
+                            ]
+                            == subjectIDs_full[subject_idx]
+                        ]
+                    )
+
+                # loop over all testing_index and construct new dataframe
+                for subject_idx in testing_index:
+                    testingData = testingData._append(
+                        full_dataset[
+                            full_dataset[
+                                full_dataset.columns[
+                                    parameters["headers"]["subjectIDHeader"]
+                                ]
+                            ]
+                            == subjectIDs_full[subject_idx]
+                        ]
+                    )

-        current_training_subject_indeces_full = (
-            trainingAndValidationData[
-                trainingAndValidationData.columns[
-                    parameters["headers"]["subjectIDHeader"]
-                ]
-            ]
-            .unique()
-            .tolist()
-        )
+            current_training_subject_indeces_full = (
+                trainingAndValidationData[
+                    trainingAndValidationData.columns[
+                        parameters["headers"]["subjectIDHeader"]
+                    ]
+                ]
+                .unique()
+                .tolist()
+            )

-        # start the kFold train for validation
-        for train_index, val_index in kf_validation.split(
-            current_training_subject_indeces_full
-        ):
-            trainingData = pd.DataFrame()  # initialize the variable
-            validationData = pd.DataFrame()  # initialize the variable
-
-            # loop over all train_index and construct new dataframe
-            for subject_idx in train_index:
-                trainingData = trainingData._append(
-                    full_dataset[
-                        full_dataset[
-                            full_dataset.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
-
-            # loop over all val_index and construct new dataframe
-            for subject_idx in val_index:
-                validationData = validationData._append(
-                    full_dataset[
-                        full_dataset[
-                            full_dataset.columns[
-                                parameters["headers"]["subjectIDHeader"]
-                            ]
-                        ]
-                        == subjectIDs_full[subject_idx]
-                    ]
-                )
-
-            return_data.append(
-                (
-                    (currentTestingFold, currentValidationFold),
-                    trainingData,
-                    validationData,
-                    testingData,
-                )
-            )
-
-            if singleFoldValidation:
-                break
-            currentValidationFold += 1  # go to next fold
-
-        if singleFoldTesting:
-            break
-        currentTestingFold += 1  # go to next fold
+            # start the kFold train for validation
+            for train_index, val_index in kf_validation.split(
+                current_training_subject_indeces_full
+            ):
+                trainingData = pd.DataFrame()  # initialize the variable
+                validationData = pd.DataFrame()  # initialize the variable
+
+                # loop over all train_index and construct new dataframe
+                for subject_idx in train_index:
+                    trainingData = trainingData._append(
+                        full_dataset[
+                            full_dataset[
+                                full_dataset.columns[
+                                    parameters["headers"]["subjectIDHeader"]
+                                ]
+                            ]
+                            == subjectIDs_full[subject_idx]
+                        ]
+                    )
+
+                # loop over all val_index and construct new dataframe
+                for subject_idx in val_index:
+                    validationData = validationData._append(
+                        full_dataset[
+                            full_dataset[
+                                full_dataset.columns[
+                                    parameters["headers"]["subjectIDHeader"]
+                                ]
+                            ]
+                            == subjectIDs_full[subject_idx]
+                        ]
+                    )
+
+                return_data.append(
+                    (
+                        (currentTestingFold, currentValidationFold),
+                        trainingData,
+                        validationData,
+                        testingData,
+                    )
+                )
+
+                currentValidationFold += 1  # go to next fold
+                if singleFoldValidation:
+                    break
+
+            currentTestingFold += 1  # go to next fold
+            if singleFoldTesting:
+                break

     return return_data

From 51ac783b007750e019296b459bbde7992f40303f Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 15:13:50 -0400
Subject: [PATCH 23/59] updated test for stratified splitting check

---
 testing/test_full.py | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/testing/test_full.py b/testing/test_full.py
index be9f6e9c6..58032fc23 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -837,6 +837,12 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device):
     parameters["patch_size"] = patch_size["3D"]
     parameters["model"]["dimension"] = 3
     parameters["model"]["final_layer"] = "logits"
+    # loop through selected models and train for single epoch
+    model = all_models_regression[0]
+    parameters["model"]["architecture"] = model
+    parameters["model"]["onnx_export"] = False
+    parameters["model"]["print_summary"] = False
+    ## add stratified splitting
     parameters["nested_training"]["stratified"] = True

     # read and parse csv
@@ -845,20 +851,30 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device):
     )
     parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"])
     parameters = populate_header_in_parameters(parameters, parameters["headers"])
-    # loop through selected models and train for single epoch
-    model = all_models_regression[0]
-    parameters["model"]["architecture"] = model
-    parameters["model"]["onnx_export"] = False
-    parameters["model"]["print_summary"] = False
-    sanitize_outputDir()
-    TrainingManager(
-        dataframe=training_data,
-        outputDir=outputDir,
-        parameters=parameters,
-        device=device,
-        resume=False,
-        reset=True,
-    )
+    # duplicate the data to test stratified sampling
+    training_data_duplicate = training_data._append(training_data)
+    for _ in range(1):
+        training_data_duplicate = training_data_duplicate._append(
+            training_data_duplicate
+        )
+    training_data_duplicate.reset_index(drop=True, inplace=True)
+    # ensure subjects are not duplicated
+    training_data_duplicate["SubjectID"] = training_data_duplicate.index
+
+    # ensure every part of the code is tested
+    for folds in [-5, 2]:
+        ## add stratified folding information
+        parameters["nested_training"]["testing"] = folds
+        parameters["nested_training"]["validation"] = folds
+        sanitize_outputDir()
+        TrainingManager(
+            dataframe=training_data_duplicate,
+            outputDir=outputDir,
+            parameters=parameters,
+            device=device,
+            resume=False,
+            reset=True,
+        )
     ## this is to test if inference can run without having ground truth column
     training_data.drop("ValueToPredict", axis=1, inplace=True)
     training_data.drop("Label", axis=1, inplace=True)

From 4132fe8fded404ea9c3dd71a21bef06e70488395 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Fri, 22 Mar 2024 16:16:36 -0400
Subject: [PATCH 24/59] re-index dataset after testing split, and use new
 targets

---
 GANDLF/utils/data_splitter.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 079692f90..2c7c4c0a4 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -75,11 +75,12 @@ def split_data(
     # get the targets for prediction for classification
-    target = False  # initialize this so that the downstream code does not fail - for KFold, this is shuffle
+    target_testing = False  # initialize this so that the downstream code does not fail - for KFold, this is shuffle
     if parameters["problem_type"] == "classification":
-        target = full_dataset.loc[
+        target_testing = full_dataset.loc[
             :, full_dataset.columns[parameters["headers"]["predictionHeaders"]]
         ]
+    target_validation = target_testing

     folding_type = KFold
     if parameters["nested_training"]["stratified"]:
@@ -91,7 +92,9 @@ def split_data(
     # start StratifiedKFold splitting
     currentTestingFold = 0
     if parameters["nested_training"]["stratified"]:
-        for trainAndVal_index, testing_index in kf_testing.split(full_dataset, target):
+        for trainAndVal_index, testing_index in kf_testing.split(
+            full_dataset, target_testing
+        ):
             # ensure the validation fold is initialized per-testing split
             currentValidationFold = 0

@@ -103,13 +106,20 @@ def split_data(
             if noTestingData:
                 # don't consider the split indeces for this case
                 trainingAndValidationData = full_dataset
-                testingData = None  # this should be None to ensure downstream code does not fail
+                testingData = (
+                    None  # this should be None to ensure downstream code does not fail
+                )
             else:
                 trainingAndValidationData = full_dataset.loc[trainAndVal_index, :]
+                trainingAndValidationData.reset_index(drop=True, inplace=True)
                 testingData = full_dataset.loc[testing_index, :]
+            # update the targets after the split
+            target_validation = trainingAndValidationData.loc[
+                :, full_dataset.columns[parameters["headers"]["predictionHeaders"]]
+            ]

             for train_index, val_index in kf_validation.split(
-                trainingAndValidationData, target
+                trainingAndValidationData, target_validation
             ):
@@ -143,7 +153,9 @@ def split_data(
             if noTestingData:
                 # don't consider the split indeces for this case
                 trainingAndValidationData = full_dataset
-                testingData = None  # this should be None to ensure downstream code does not fail
+                testingData = (
+                    None  # this should be None to ensure downstream code does not fail
+                )
             else:
                 # loop over all trainAndVal_index and construct new dataframe
                 for subject_idx in trainAndVal_index:
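For illustration only (toy dataframe): why the re-index after the testing split matters — sklearn's split() yields positional indices, while DataFrame.loc selects by label, so a .loc subset keeps its old labels until reset_index is called.

    import pandas as pd

    df = pd.DataFrame({"x": range(6)})
    subset = df.loc[[1, 3, 5], :]           # labels 1, 3, 5 survive the first split
    # subset.loc[[0, 1], :]                 # would raise KeyError: 0 is not a label here
    subset = subset.reset_index(drop=True)  # labels become 0..2, matching positions
    print(subset.loc[[0, 1], :])            # now selects the intended rows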
test_train_inference_classification_with_logits_single_fold_rad_3d(device): training_data_duplicate["SubjectID"] = training_data_duplicate.index # ensure every part of the code is tested - for folds in [-5, 2]: + for folds in [2, -5]: ## add stratified folding information parameters["nested_training"]["testing"] = folds parameters["nested_training"]["validation"] = folds @@ -892,8 +892,6 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): testingDir + "/config_classification.yaml", version_check_flag=False ) training_data, parameters["headers"] = parseTrainingCSV(temp_infer_csv) - parameters["output_dir"] = outputDir # this is in inference mode - parameters["output_dir"] = outputDir # this is in inference mode parameters["modality"] = "rad" parameters["patch_size"] = patch_size["3D"] parameters["model"]["dimension"] = 3 From fd2c63d89c227c35ef9d44d6f98c48e895e62b5f Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Fri, 22 Mar 2024 19:44:21 -0400 Subject: [PATCH 26/59] using `.get()` to make things better --- GANDLF/utils/data_splitter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py index 2c7c4c0a4..4ae2c9d41 100644 --- a/GANDLF/utils/data_splitter.py +++ b/GANDLF/utils/data_splitter.py @@ -70,8 +70,8 @@ def split_data( all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index) - assert ( - all_subjects_are_unique or not parameters["nested_training"]["stratified"] + assert all_subjects_are_unique or not parameters["nested_training"].get( + "stratified" ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset." # get the targets for prediction for classification @@ -83,7 +83,7 @@ def split_data( target_validation = target_testing folding_type = KFold - if parameters["nested_training"]["stratified"]: + if parameters["nested_training"].get("stratified"): folding_type = StratifiedKFold kf_testing = folding_type(n_splits=testing_folds) @@ -91,7 +91,7 @@ def split_data( # start StratifiedKFold splitting currentTestingFold = 0 - if parameters["nested_training"]["stratified"]: + if parameters["nested_training"].get("stratified"): for trainAndVal_index, testing_index in kf_testing.split( full_dataset, target_testing ): From 96efef5f54300f0e4ddd0e8e63afbd4242c9cc0c Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Fri, 22 Mar 2024 22:16:59 -0400 Subject: [PATCH 27/59] added case to test to improve test coverage --- testing/test_full.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testing/test_full.py b/testing/test_full.py index b2f18adb2..be895ac50 100644 --- a/testing/test_full.py +++ b/testing/test_full.py @@ -869,10 +869,10 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): training_data_duplicate["SubjectID"] = training_data_duplicate.index # ensure every part of the code is tested - for folds in [2, -5]: + for folds in [2, 1, -5]: ## add stratified folding information parameters["nested_training"]["testing"] = folds - parameters["nested_training"]["validation"] = folds + parameters["nested_training"]["validation"] = folds if folds != 1 else -5 sanitize_outputDir() TrainingManager( dataframe=training_data_duplicate, From d4b324eacd010743517cdc8a9b28d29a2240d48b Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:03:15 -0400 
Subject: [PATCH 28/59] updated comments --- GANDLF/utils/data_splitter.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py index 4ae2c9d41..d314f687a 100644 --- a/GANDLF/utils/data_splitter.py +++ b/GANDLF/utils/data_splitter.py @@ -106,9 +106,8 @@ def split_data( if noTestingData: # don't consider the split indeces for this case trainingAndValidationData = full_dataset - testingData = ( - None # this should be None to ensure downstream code does not fail - ) + # this should be None to ensure downstream code does not fail + testingData = None else: trainingAndValidationData = full_dataset.loc[trainAndVal_index, :] trainingAndValidationData.reset_index(drop=True, inplace=True) @@ -153,9 +152,8 @@ def split_data( if noTestingData: # don't consider the split indeces for this case trainingAndValidationData = full_dataset - testingData = ( - None # this should be None to ensure downstream code does not fail - ) + # this should be None to ensure downstream code does not fail + testingData = None else: # loop over all trainAndVal_index and construct new dataframe for subject_idx in trainAndVal_index: From c0405ec03677ce8657abd5a86aa137b62ddd062a Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:04:11 -0400 Subject: [PATCH 29/59] added new file with code --- GANDLF/cli/data_split_saver.py | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 GANDLF/cli/data_split_saver.py diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py new file mode 100644 index 000000000..36f19adf5 --- /dev/null +++ b/GANDLF/cli/data_split_saver.py @@ -0,0 +1,50 @@ +from typing import Union +import os + +import pandas as pd +from GANDLF.utils import get_dataframe, split_data + + +def split_data_and_save_csvs( + input_data: Union[pd.Dataframe, str], output_dir: str, parameters: dict +) -> None: + """ + Split the data into training, validation, and testing sets and save them as csvs in the output directory + + Args: + input_data (Union[pd.Dataframe, str]): The input data to be split and saved. + output_dir (str): The output directory to save the split data. + parameters (dict): The parameters dictionary. 
+ """ + + full_data = get_dataframe(input_data) + + dataframe_split = split_data(full_data, parameters) + + for ( + testing_and_valid_indeces, + trainingData, + validationData, + testingData, + ) in dataframe_split: + + # training and validation dataframes use the same index, since they are based on the validation split + training_data_path = os.path.join( + output_dir, f"training_{testing_and_valid_indeces[1]}.csv" + ) + validation_data_path = os.path.join( + output_dir, f"validation_{testing_and_valid_indeces[1]}.csv" + ) + # testing dataframes use the first index + testing_data_path = os.path.join( + output_dir, f"testing_{testing_and_valid_indeces[0]}.csv" + ) + + for data, path in zip( + [trainingData, validationData, testingData], + [training_data_path, validation_data_path, testing_data_path], + ): + # check if the data is not None and the path does not exist + if not os.path.exists(path): + if data is not None: + data.to_csv(path, index=False) From ca7a96f750884951ae2ebe9181c0c9f2a15f3629 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:04:30 -0400 Subject: [PATCH 30/59] added to init --- GANDLF/cli/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/GANDLF/cli/__init__.py b/GANDLF/cli/__init__.py index 7021caa4b..cc1eda44b 100644 --- a/GANDLF/cli/__init__.py +++ b/GANDLF/cli/__init__.py @@ -6,6 +6,7 @@ from .recover_config import recover_config from .post_training_model_optimization import post_training_model_optimization from .generate_metrics import generate_metrics_dict +from .data_split_saver import split_data_and_save_csvs from datetime import date From 99ba0af8bb805010c34ee6c09eec2669f3c4e8df Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:09:20 -0400 Subject: [PATCH 31/59] added top-level script --- gandlf_splitCSV | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 gandlf_splitCSV diff --git a/gandlf_splitCSV b/gandlf_splitCSV new file mode 100644 index 000000000..842576eaf --- /dev/null +++ b/gandlf_splitCSV @@ -0,0 +1,61 @@ +#!usr/bin/env python +# -*- coding: utf-8 -*- + +import os, argparse, sys, yaml +from GANDLF.cli import copyrightMessage, split_data_and_save_csvs + + +def main(): + parser = argparse.ArgumentParser( + prog="GANDLF_SplitCSV", + formatter_class=argparse.RawTextHelpFormatter, + description="Split the data into training, validation, and testing sets and save them as csvs in the output directory.\n\n" + + copyrightMessage, + ) + parser.add_argument( + "-i", + "--inputCSV", + metavar="", + type=str, + help="Input CSV file which contains the data to be split.", + ) + parser.add_argument( + "-c", + "--config", + metavar="", + default="", + type=str, + help="The GaNDLF config (in YAML) with the `nested_training` key specified", + ) + parser.add_argument( + "-o", + "--outputDir", + metavar="", + type=str, + help="Output directory to save the split data.", + ) + + args = parser.parse_args() + + # check for required parameters - this is needed here to keep the cli clean + for param_none_check in [args.inputCSV, args.outputDir, args.config]: + if param_none_check is None: + sys.exit("ERROR: Missing required parameter:", param_none_check) + + inputCSV = os.path.normpath(args.inputCSV) + outputDir = os.path.normpath(args.outputDir) + # initialize default + config = {"nested_training": {"testing": 5, "validation": 5}} + if os.path.isfile(args.config): + config = 
yaml.safe_load(open(args.config, "r")) + + print("Config used for split:", config) + + split_data_and_save_csvs(inputCSV, outputDir, config) + + print("Finished successfully.") + + +# main function +if __name__ == "__main__": + main() From 7e47ffa173b8733994fcfed8b28f1f15575c6ba6 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:16:08 -0400 Subject: [PATCH 32/59] updated requirements --- gandlf_splitCSV | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gandlf_splitCSV b/gandlf_splitCSV index 842576eaf..e969a8f63 100644 --- a/gandlf_splitCSV +++ b/gandlf_splitCSV @@ -17,21 +17,23 @@ def main(): "--inputCSV", metavar="", type=str, + required=True, help="Input CSV file which contains the data to be split.", ) parser.add_argument( "-c", "--config", metavar="", - default="", + default=None, type=str, - help="The GaNDLF config (in YAML) with the `nested_training` key specified", + help="The GaNDLF config (in YAML) with the `nested_training` key specified. Defaults to 5 folds for testing and validation.", ) parser.add_argument( "-o", "--outputDir", metavar="", type=str, + required=True, help="Output directory to save the split data.", ) @@ -48,7 +50,7 @@ def main(): config = {"nested_training": {"testing": 5, "validation": 5}} if os.path.isfile(args.config): config = yaml.safe_load(open(args.config, "r")) - + print("Config used for split:", config) split_data_and_save_csvs(inputCSV, outputDir, config) From 90b797e4a51c7c7e14e5cfa057c55a395d517dfe Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:26:16 -0400 Subject: [PATCH 33/59] added test for data split --- testing/test_full.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/testing/test_full.py b/testing/test_full.py index be895ac50..f7dc2686f 100644 --- a/testing/test_full.py +++ b/testing/test_full.py @@ -3147,6 +3147,41 @@ def test_generic_deploy_metrics_docker(): print("passed") + +def test_generic_data_split(): + print("51: Starting test for splitting and saving CSVs") + # read and initialize parameters for specific data dimension + parameters = ConfigManager( + testingDir + "/config_classification.yaml", version_check_flag=False + ) + parameters["nested_training"] = { + "testing": 5, + "validation": 5, + "stratified": True, + } + # read and parse csv + training_data, parameters["headers"] = parseTrainingCSV( + inputDir + "/train_3d_rad_classification.csv" + ) + parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"]) + parameters = populate_header_in_parameters(parameters, parameters["headers"]) + # duplicate the data to test stratified sampling + training_data_duplicate = training_data._append(training_data) + for _ in range(1): + training_data_duplicate = training_data_duplicate._append( + training_data_duplicate + ) + training_data_duplicate.reset_index(drop=True, inplace=True) + # ensure subjects are not duplicated + training_data_duplicate["SubjectID"] = training_data_duplicate.index + + sanitize_outputDir() + + split_data_and_save_csvs(training_data_duplicate, outputDir, parameters) + + files_in_outputDir = os.listdir(outputDir) + assert len(files_in_outputDir) == 15, "CSVs were not split correctly" + sanitize_outputDir() print("passed") From 5a9639bbf7aa1bbad5b7a2b2d4ae93084f214fa1 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Sun, 24 Mar 2024 15:37:42 -0400 Subject: [PATCH 
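
The `== 15` assertion in patch 33 follows from the naming scheme in `split_data_and_save_csvs`: testing CSVs are keyed by the testing-fold index (5 files), training/validation CSVs are keyed by the validation-fold index (5 + 5 files), and the `os.path.exists` guard keeps later testing folds from rewriting the same training/validation names. The arithmetic, spelled out (illustrative only):

```python
# reproduce the file-name arithmetic behind "len(files_in_outputDir) == 15"
testing_folds, validation_folds = 5, 5

expected = {f"testing_{i}.csv" for i in range(testing_folds)}
expected |= {f"training_{j}.csv" for j in range(validation_folds)}
expected |= {f"validation_{j}.csv" for j in range(validation_folds)}

print(len(expected))  # 15 == 5 testing + 5 training + 5 validation
```
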
From 5a9639bbf7aa1bbad5b7a2b2d4ae93084f214fa1 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Sun, 24 Mar 2024 15:37:42 -0400
Subject: [PATCH 34/59] updated api

---
 gandlf_splitCSV | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gandlf_splitCSV b/gandlf_splitCSV
index e969a8f63..339fec2ac 100644
--- a/gandlf_splitCSV
+++ b/gandlf_splitCSV
@@ -16,6 +16,7 @@ def main():
         "-i",
         "--inputCSV",
         metavar="",
+        default=None,
         type=str,
         required=True,
         help="Input CSV file which contains the data to be split.",
@@ -25,13 +26,15 @@ def main():
         "--config",
         metavar="",
         default=None,
+        required=True,
         type=str,
-        help="The GaNDLF config (in YAML) with the `nested_training` key specified. Defaults to 5 folds for testing and validation.",
+        help="The GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed.",
     )
     parser.add_argument(
         "-o",
         "--outputDir",
         metavar="",
+        default=None,
         type=str,
         required=True,
         help="Output directory to save the split data.",

From 7cb885f7fd3f2f5c61d097683adb54601350e715 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Sun, 24 Mar 2024 15:37:59 -0400
Subject: [PATCH 35/59] typo fix

---
 GANDLF/cli/data_split_saver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py
index 36f19adf5..c024928ea 100644
--- a/GANDLF/cli/data_split_saver.py
+++ b/GANDLF/cli/data_split_saver.py
@@ -6,7 +6,7 @@
 
 
 def split_data_and_save_csvs(
-    input_data: Union[pd.Dataframe, str], output_dir: str, parameters: dict
+    input_data: Union[pd.DataFrame, str], output_dir: str, parameters: dict
 ) -> None:
     """
     Split the data into training, validation, and testing sets and save them as csvs in the output directory

From 1737ef0fa11b32d179d38668610a2e39c074d44c Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Sun, 24 Mar 2024 15:41:48 -0400
Subject: [PATCH 36/59] ensure that stratified can only run for classification problems

---
 GANDLF/utils/data_splitter.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index d314f687a..d69216fd5 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -74,6 +74,12 @@ def split_data(
         "stratified"
     ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
 
+    assert (parameters["problem_type"] == "classification") and parameters[
+        "nested_training"
+    ].get(
+        "stratified"
+    ), "Stratified splitting is only possible for classification problems."
+

From 229a942bf61fb1b4dacbd79fbf6412daf2ce2b03 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Sun, 24 Mar 2024 15:42:10 -0400
Subject: [PATCH 37/59] added check

---
 GANDLF/utils/data_splitter.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 4ae2c9d41..d69216fd5 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -74,6 +74,12 @@ def split_data(
         "stratified"
     ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
 
+    assert (parameters["problem_type"] == "classification") and parameters[
+        "nested_training"
+    ].get(
+        "stratified"
+    ), "Stratified splitting is only possible for classification problems."
+
     # get the targets for prediction for classification
@@ -106,9 +112,8 @@ def split_data(
             if noTestingData:
                 # don't consider the split indeces for this case
                 trainingAndValidationData = full_dataset
-                testingData = (
-                    None  # this should be None to ensure downstream code does not fail
-                )
+                # this should be None to ensure downstream code does not fail
+                testingData = None
             else:
                 trainingAndValidationData = full_dataset.loc[trainAndVal_index, :]
                 trainingAndValidationData.reset_index(drop=True, inplace=True)
@@ -153,9 +158,8 @@ def split_data(
             if noTestingData:
                 # don't consider the split indeces for this case
                 trainingAndValidationData = full_dataset
-                testingData = (
-                    None  # this should be None to ensure downstream code does not fail
-                )
+                # this should be None to ensure downstream code does not fail
+                testingData = None
             else:
                 # loop over all trainAndVal_index and construct new dataframe
                 for subject_idx in trainAndVal_index:
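
The assert introduced in patches 36/37 conjoins the two conditions, so it also fires when stratification is simply not requested, which would break every non-stratified run; patch 38 below reworks it into the implication that was intended. The pattern in isolation (variable values are illustrative):

```python
# "stratified implies classification", written as an assert
problem_type = "segmentation"
stratified_requested = False  # stratification was never asked for

# conjunction (the patch 36/37 form): would fail here, although nothing is wrong
# assert problem_type == "classification" and stratified_requested

# implication (not A or B, the patch 38 form): passes unless stratification
# is requested for a non-classification problem
assert (
    problem_type == "classification"
) or not stratified_requested, (
    "Stratified splitting is only possible for classification problems."
)
```
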
From 91d80b916e54ea38c83a21334f0095e904963da8 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Sun, 24 Mar 2024 21:52:16 -0400
Subject: [PATCH 38/59] updated logic for check

---
 GANDLF/utils/data_splitter.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index d69216fd5..6c5c3a81e 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -33,6 +33,8 @@ def split_data(
         else parameters
     )
 
+    stratified_splitting = parameters["nested_training"].get("stratified")
+
     return_data = []
 
     # check for single fold training
@@ -70,15 +72,16 @@ def split_data(
 
     all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index)
 
-    assert all_subjects_are_unique or not parameters["nested_training"].get(
-        "stratified"
+    assert (
+        all_subjects_are_unique or not stratified_splitting
     ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
 
-    assert (parameters["problem_type"] == "classification") and parameters[
-        "nested_training"
-    ].get(
-        "stratified"
-    ), "Stratified splitting is only possible for classification problems."
+    # assert for Stratified splitting is only possible for classification problems
+    assert (
+        parameters["problem_type"] == "classification"
+    ) or not stratified_splitting, (
+        "Stratified splitting is only possible for classification problems."
+    )
 
     # get the targets for prediction for classification
     target_testing = False  # initialize this so that the downstream code does not fail - for KFold, this is shuffle
     if parameters["problem_type"] == "classification":
@@ -89,7 +92,7 @@ def split_data(
         target_validation = target_testing
 
     folding_type = KFold
-    if parameters["nested_training"].get("stratified"):
+    if stratified_splitting:
         folding_type = StratifiedKFold
 
     kf_testing = folding_type(n_splits=testing_folds)
@@ -97,7 +100,7 @@ def split_data(
 
     # start StratifiedKFold splitting
     currentTestingFold = 0
-    if parameters["nested_training"].get("stratified"):
+    if stratified_splitting:
         for trainAndVal_index, testing_index in kf_testing.split(
             full_dataset, target_testing
         ):
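
The `folding_type` switch that patch 38 cleans up works because `KFold` and `StratifiedKFold` share the same `split()` interface; `KFold.split` simply ignores the label argument, while `StratifiedKFold.split` uses it to keep the label distribution similar across folds. A self-contained toy example with scikit-learn (the data is made up):

```python
from sklearn.model_selection import KFold, StratifiedKFold

X = [[i] for i in range(10)]
y = [0] * 5 + [1] * 5  # toy class labels
stratified_splitting = True

folding_type = StratifiedKFold if stratified_splitting else KFold
kf = folding_type(n_splits=5)

# the call shape is identical either way; only StratifiedKFold consults y
for train_idx, test_idx in kf.split(X, y):
    print(sorted(test_idx))  # each test fold gets one sample per class here
```
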
From fbdf8f392b5d630dcb214b159e88d85ab0f3b5e9 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 08:07:25 -0400
Subject: [PATCH 39/59] updated checks for stratified split

---
 GANDLF/utils/data_splitter.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py
index 6c5c3a81e..6976d7629 100644
--- a/GANDLF/utils/data_splitter.py
+++ b/GANDLF/utils/data_splitter.py
@@ -72,16 +72,16 @@ def split_data(
 
     all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index)
 
-    assert (
-        all_subjects_are_unique or not stratified_splitting
-    ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
-
-    # assert for Stratified splitting is only possible for classification problems
-    assert (
-        parameters["problem_type"] == "classification"
-    ) or not stratified_splitting, (
-        "Stratified splitting is only possible for classification problems."
-    )
+    # checks for stratified splitting
+    if stratified_splitting:
+        # it can only be done for classification problems
+        assert (
+            parameters["problem_type"] == "classification"
+        ), "Stratified splitting is only possible for classification problems."
+        # it can only be done when all subjects are unique
+        assert (
+            all_subjects_are_unique
+        ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset."
+
     # get the targets for prediction for classification
     target_testing = False  # initialize this so that the downstream code does not fail - for KFold, this is shuffle

From 4b77920b2dfb77486cffed6378c510864ecff80b Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 08:42:08 -0400
Subject: [PATCH 40/59] this should not be there

---
 testing/test_full.py | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

diff --git a/testing/test_full.py b/testing/test_full.py
index f7dc2686f..471dd87d0 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -3146,42 +3146,3 @@ def test_generic_deploy_metrics_docker():
     sanitize_outputDir()
 
     print("passed")
-
-
-def test_generic_data_split():
-    print("51: Starting test for splitting and saving CSVs")
-    # read and initialize parameters for specific data dimension
-    parameters = ConfigManager(
-        testingDir + "/config_classification.yaml", version_check_flag=False
-    )
-    parameters["nested_training"] = {
-        "testing": 5,
-        "validation": 5,
-        "stratified": True,
-    }
-    # read and parse csv
-    training_data, parameters["headers"] = parseTrainingCSV(
-        inputDir + "/train_3d_rad_classification.csv"
-    )
-    parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"])
-    parameters = populate_header_in_parameters(parameters, parameters["headers"])
-    # duplicate the data to test stratified sampling
-    training_data_duplicate = training_data._append(training_data)
-    for _ in range(1):
-        training_data_duplicate = training_data_duplicate._append(
-            training_data_duplicate
-        )
-    training_data_duplicate.reset_index(drop=True, inplace=True)
-    # ensure subjects are not duplicated
-    training_data_duplicate["SubjectID"] = training_data_duplicate.index
-
-    sanitize_outputDir()
-
-    split_data_and_save_csvs(training_data_duplicate, outputDir, parameters)
-
-    files_in_outputDir = os.listdir(outputDir)
-    assert len(files_in_outputDir) == 15, "CSVs were not split correctly"
-
-    sanitize_outputDir()
-
-    print("passed")

From 30227b6484afafa78b3fa7438a4d006d15d8f71c Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 08:44:55 -0400
Subject: [PATCH 41/59] fix import

---
 testing/test_full.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/testing/test_full.py b/testing/test_full.py
index f7dc2686f..93f3127da 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -32,6 +32,7 @@
     recover_config,
     post_training_model_optimization,
    generate_metrics_dict,
+    split_data_and_save_csvs,
 )
 from GANDLF.schedulers import global_schedulers_dict
 from GANDLF.optimizers import global_optimizer_dict

From 855edf4240de990f1075529e523cc5ee828d8d1c Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:31:08 -0400
Subject: [PATCH 42/59] Delete GANDLF/cli/data_split_saver.py

---
 GANDLF/cli/data_split_saver.py | 50 ----------------------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 GANDLF/cli/data_split_saver.py

diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py
deleted file mode 100644
index c024928ea..000000000
--- a/GANDLF/cli/data_split_saver.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from typing import Union
-import os
-
-import pandas as pd
-from GANDLF.utils import get_dataframe, split_data
-
-
-def split_data_and_save_csvs(
-    input_data: Union[pd.DataFrame, str], output_dir: str, parameters: dict
-) -> None:
-    """
-    Split the data into training, validation, and testing sets and save them as csvs in the output directory
-
-    Args:
-        input_data (Union[pd.Dataframe, str]): The input data to be split and saved.
-        output_dir (str): The output directory to save the split data.
-        parameters (dict): The parameters dictionary.
-    """
-
-    full_data = get_dataframe(input_data)
-
-    dataframe_split = split_data(full_data, parameters)
-
-    for (
-        testing_and_valid_indeces,
-        trainingData,
-        validationData,
-        testingData,
-    ) in dataframe_split:
-
-        # training and validation dataframes use the same index, since they are based on the validation split
-        training_data_path = os.path.join(
-            output_dir, f"training_{testing_and_valid_indeces[1]}.csv"
-        )
-        validation_data_path = os.path.join(
-            output_dir, f"validation_{testing_and_valid_indeces[1]}.csv"
-        )
-        # testing dataframes use the first index
-        testing_data_path = os.path.join(
-            output_dir, f"testing_{testing_and_valid_indeces[0]}.csv"
-        )
-
-        for data, path in zip(
-            [trainingData, validationData, testingData],
-            [training_data_path, validation_data_path, testing_data_path],
-        ):
-            # check if the data is not None and the path does not exist
-            if not os.path.exists(path):
-                if data is not None:
-                    data.to_csv(path, index=False)

From 7ccfa6fcee9a0ee24f67a241806d973c3853981c Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:31:33 -0400
Subject: [PATCH 43/59] Delete gandlf_splitCSV

---
 gandlf_splitCSV | 66 -------------------------------------------------
 1 file changed, 66 deletions(-)
 delete mode 100644 gandlf_splitCSV

diff --git a/gandlf_splitCSV b/gandlf_splitCSV
deleted file mode 100644
index 339fec2ac..000000000
--- a/gandlf_splitCSV
+++ /dev/null
@@ -1,66 +0,0 @@
-#!usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import os, argparse, sys, yaml
-from GANDLF.cli import copyrightMessage, split_data_and_save_csvs
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        prog="GANDLF_SplitCSV",
-        formatter_class=argparse.RawTextHelpFormatter,
-        description="Split the data into training, validation, and testing sets and save them as csvs in the output directory.\n\n"
-        + copyrightMessage,
-    )
-    parser.add_argument(
-        "-i",
-        "--inputCSV",
-        metavar="",
-        default=None,
-        type=str,
-        required=True,
-        help="Input CSV file which contains the data to be split.",
-    )
-    parser.add_argument(
-        "-c",
-        "--config",
-        metavar="",
-        default=None,
-        required=True,
-        type=str,
-        help="The GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed.",
-    )
-    parser.add_argument(
-        "-o",
-        "--outputDir",
-        metavar="",
-        default=None,
-        type=str,
-        required=True,
-        help="Output directory to save the split data.",
-    )
-
-    args = parser.parse_args()
-
-    # check for required parameters - this is needed here to keep the cli clean
-    for param_none_check in [args.inputCSV, args.outputDir, args.config]:
-        if param_none_check is None:
-            sys.exit("ERROR: Missing required parameter:", param_none_check)
-
-    inputCSV = os.path.normpath(args.inputCSV)
-    outputDir = os.path.normpath(args.outputDir)
-    # initialize default
-    config = {"nested_training": {"testing": 5, "validation": 5}}
-    if os.path.isfile(args.config):
-        config = yaml.safe_load(open(args.config, "r"))
-
-    print("Config used for split:", config)
-
-    split_data_and_save_csvs(inputCSV, outputDir, config)
-
-    print("Finished successfully.")
-
-
-# main function
-if __name__ == "__main__":
-    main()

From 43b764fdeea32a104e7c9968a597fffd38f0781c Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:38:39 -0400
Subject: [PATCH 44/59] Update __init__.py

---
 GANDLF/cli/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GANDLF/cli/__init__.py b/GANDLF/cli/__init__.py
index cc1eda44b..7021caa4b 100644
--- a/GANDLF/cli/__init__.py
+++ b/GANDLF/cli/__init__.py
@@ -6,7 +6,6 @@
 from .recover_config import recover_config
 from .post_training_model_optimization import post_training_model_optimization
 from .generate_metrics import generate_metrics_dict
-from .data_split_saver import split_data_and_save_csvs
 
 from datetime import date
 

From 7dcb6e04907e33b49755bc49e4f390a6d543781b Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:45:27 -0400
Subject: [PATCH 45/59] Create data_split_saver.py

---
 GANDLF/cli/data_split_saver.py | 50 ++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 GANDLF/cli/data_split_saver.py

diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py
new file mode 100644
index 000000000..c024928ea
--- /dev/null
+++ b/GANDLF/cli/data_split_saver.py
@@ -0,0 +1,50 @@
+from typing import Union
+import os
+
+import pandas as pd
+from GANDLF.utils import get_dataframe, split_data
+
+
+def split_data_and_save_csvs(
+    input_data: Union[pd.DataFrame, str], output_dir: str, parameters: dict
+) -> None:
+    """
+    Split the data into training, validation, and testing sets and save them as csvs in the output directory
+
+    Args:
+        input_data (Union[pd.Dataframe, str]): The input data to be split and saved.
+        output_dir (str): The output directory to save the split data.
+        parameters (dict): The parameters dictionary.
+    """
+
+    full_data = get_dataframe(input_data)
+
+    dataframe_split = split_data(full_data, parameters)
+
+    for (
+        testing_and_valid_indeces,
+        trainingData,
+        validationData,
+        testingData,
+    ) in dataframe_split:
+
+        # training and validation dataframes use the same index, since they are based on the validation split
+        training_data_path = os.path.join(
+            output_dir, f"training_{testing_and_valid_indeces[1]}.csv"
+        )
+        validation_data_path = os.path.join(
+            output_dir, f"validation_{testing_and_valid_indeces[1]}.csv"
+        )
+        # testing dataframes use the first index
+        testing_data_path = os.path.join(
+            output_dir, f"testing_{testing_and_valid_indeces[0]}.csv"
+        )
+
+        for data, path in zip(
+            [trainingData, validationData, testingData],
+            [training_data_path, validation_data_path, testing_data_path],
+        ):
+            # check if the data is not None and the path does not exist
+            if not os.path.exists(path):
+                if data is not None:
+                    data.to_csv(path, index=False)

From 627a68d3333d67880b498385831c3c4caa037a7d Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:45:52 -0400
Subject: [PATCH 46/59] Update __init__.py

---
 GANDLF/cli/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GANDLF/cli/__init__.py b/GANDLF/cli/__init__.py
index 7021caa4b..cc1eda44b 100644
--- a/GANDLF/cli/__init__.py
+++ b/GANDLF/cli/__init__.py
@@ -6,6 +6,7 @@
 from .recover_config import recover_config
 from .post_training_model_optimization import post_training_model_optimization
 from .generate_metrics import generate_metrics_dict
+from .data_split_saver import split_data_and_save_csvs
 
 from datetime import date
 

From 30159b74503712d11eca201937ecac666fce9997 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:46:49 -0400
Subject: [PATCH 47/59] Create gandlf_splitCSV

---
 gandlf_splitCSV | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 gandlf_splitCSV

diff --git a/gandlf_splitCSV b/gandlf_splitCSV
new file mode 100644
index 000000000..339fec2ac
--- /dev/null
+++ b/gandlf_splitCSV
@@ -0,0 +1,66 @@
+#!usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os, argparse, sys, yaml
+from GANDLF.cli import copyrightMessage, split_data_and_save_csvs
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="GANDLF_SplitCSV",
+        formatter_class=argparse.RawTextHelpFormatter,
+        description="Split the data into training, validation, and testing sets and save them as csvs in the output directory.\n\n"
+        + copyrightMessage,
+    )
+    parser.add_argument(
+        "-i",
+        "--inputCSV",
+        metavar="",
+        default=None,
+        type=str,
+        required=True,
+        help="Input CSV file which contains the data to be split.",
+    )
+    parser.add_argument(
+        "-c",
+        "--config",
+        metavar="",
+        default=None,
+        required=True,
+        type=str,
+        help="The GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed.",
+    )
+    parser.add_argument(
+        "-o",
+        "--outputDir",
+        metavar="",
+        default=None,
+        type=str,
+        required=True,
+        help="Output directory to save the split data.",
+    )
+
+    args = parser.parse_args()
+
+    # check for required parameters - this is needed here to keep the cli clean
+    for param_none_check in [args.inputCSV, args.outputDir, args.config]:
+        if param_none_check is None:
+            sys.exit("ERROR: Missing required parameter:", param_none_check)
+
+    inputCSV = os.path.normpath(args.inputCSV)
+    outputDir = os.path.normpath(args.outputDir)
+    # initialize default
+    config = {"nested_training": {"testing": 5, "validation": 5}}
+    if os.path.isfile(args.config):
+        config = yaml.safe_load(open(args.config, "r"))
+
+    print("Config used for split:", config)
+
+    split_data_and_save_csvs(inputCSV, outputDir, config)
+
+    print("Finished successfully.")
+
+
+# main function
+if __name__ == "__main__":
+    main()

From 17862ec2c059d0bcd605356912a951b08959d5d5 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:47:28 -0400
Subject: [PATCH 48/59] Update usage.md

---
 docs/usage.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/docs/usage.md b/docs/usage.md
index 874dd007e..2dcb79a58 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -106,6 +106,8 @@ SubjectID,Channel_0,Channel_1,...,Channel_X,Label
 N,/full/path/N/0.nii.gz,/full/path/N/1.nii.gz,...,/full/path/N/X.nii.gz,/full/path/N/segmentation.nii.gz
 ```
 
+**Notes:**
+
 - `Channel` can be substituted with `Modality` or `Image`
 - `Label` can be substituted with `Mask` or `Segmentation`and is used to specify the annotation file for segmentation models
 - For classification/regression, add a column called `ValueToPredict`. Currently, we are supporting only a single value prediction per model.
@@ -162,6 +164,19 @@ The following command shows how the script works:
 - `SubjectID` or `PatientName` is used to ensure that the randomized split is done per-subject rather than per-image.
 - For data arrangement different to what is described above, a customized script will need to be written to generate the CSV, or you can enter the data manually into the CSV.
 
+### Using the `gandlf_splitCSV` application
+
+To split the data CSV into training, validation, and testing CSVs, the `gandlf_splitCSV` script can be used. The following command shows how the script works:
+
+```bash
+# continue from previous shell
+(venv_gandlf) $> python gandlf_splitCSV \
+  # -h, --help         Show help message and exit
+  -i ./experiment_0/train_data.csv \ # output CSV from the `gandlf_constructCSV` script
+  -c $gandlf_config \ # the GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed
+  -o $output_dir # the output directory to save the split data
+```
+
 
 ## Customize the Training

From e3f9a45f7df8e74d0d7c149f69bf31f39a766d60 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Mon, 25 Mar 2024 14:23:35 -0400
Subject: [PATCH 49/59] Update test_full.py

---
 testing/test_full.py | 39 +++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/testing/test_full.py b/testing/test_full.py
index 88f00ab2f..93f3127da 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -3147,3 +3147,42 @@ def test_generic_deploy_metrics_docker():
     sanitize_outputDir()
 
     print("passed")
+
+
+def test_generic_data_split():
+    print("51: Starting test for splitting and saving CSVs")
+    # read and initialize parameters for specific data dimension
+    parameters = ConfigManager(
+        testingDir + "/config_classification.yaml", version_check_flag=False
+    )
+    parameters["nested_training"] = {
+        "testing": 5,
+        "validation": 5,
+        "stratified": True,
+    }
+    # read and parse csv
+    training_data, parameters["headers"] = parseTrainingCSV(
+        inputDir + "/train_3d_rad_classification.csv"
+    )
+    parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"])
+    parameters = populate_header_in_parameters(parameters, parameters["headers"])
+    # duplicate the data to test stratified sampling
+    training_data_duplicate = training_data._append(training_data)
+    for _ in range(1):
+        training_data_duplicate = training_data_duplicate._append(
+            training_data_duplicate
+        )
+    training_data_duplicate.reset_index(drop=True, inplace=True)
+    # ensure subjects are not duplicated
+    training_data_duplicate["SubjectID"] = training_data_duplicate.index
+
+    sanitize_outputDir()
+
+    split_data_and_save_csvs(training_data_duplicate, outputDir, parameters)
+
+    files_in_outputDir = os.listdir(outputDir)
+    assert len(files_in_outputDir) == 15, "CSVs were not split correctly"
+
+    sanitize_outputDir()
+
+    print("passed")

From 0be31c24876f0486d1c64e57d93d76d997e08eff Mon Sep 17 00:00:00 2001
From: Viacheslav Kukushkin
Date: Tue, 26 Mar 2024 17:17:06 +0300
Subject: [PATCH 50/59] fixed version dicom-anonymizer==1.0.12

---
 GANDLF/anonymize/__init__.py | 2 +-
 setup.py                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/GANDLF/anonymize/__init__.py b/GANDLF/anonymize/__init__.py
index e0c65ab90..a7c27539e 100644
--- a/GANDLF/anonymize/__init__.py
+++ b/GANDLF/anonymize/__init__.py
@@ -43,7 +43,7 @@ def run_anonymizer(
             input_path,
             output_path,
             anonymization_actions={},
-            deletePrivateTags=parameters["delete_private_tags"],
+            delete_private_tags=parameters["delete_private_tags"],
         )
     elif parameters["modality"] in ["histo", "path"]:
         # anonymize_slide(
diff --git a/setup.py b/setup.py
index b582515c1..5e7d74c4b 100644
--- a/setup.py
+++ b/setup.py
@@ -108,7 +108,7 @@ def run(self):
         "segmentation-models-pytorch==0.3.3",
         "ACSConv==0.1.1",
         "docker",
-        "dicom-anonymizer",
+        "dicom-anonymizer==1.0.12",
         "twine",
         "zarr",
         "keyring",
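
Patch 50 pairs an API fix with a version pin: the call-site keyword changes from camelCase `deletePrivateTags` to snake_case `delete_private_tags`, which suggests the upstream `dicom-anonymizer` signature changed between releases, so the pin to 1.0.12 keeps the two in lockstep. A quick runtime check of the pinned version, standard library only (Python 3.8+):

```python
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("dicom-anonymizer")
    assert installed == "1.0.12", f"unexpected dicom-anonymizer version: {installed}"
except PackageNotFoundError:
    print("dicom-anonymizer is not installed")
```
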
From 228279f5c099dede19ef2e5ae2cb3b9c9160ae4d Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Tue, 26 Mar 2024 11:41:45 -0400
Subject: [PATCH 51/59] black .

---
 testing/test_full.py | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/testing/test_full.py b/testing/test_full.py
index 6e743ad67..c428508ae 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -109,7 +109,7 @@
 """
 
 
-def prerequisites_hook_download_data():
+def test_generic_download_data():
     print("00: Downloading the sample data")
     urlToDownload = "https://drive.google.com/uc?id=1c4Yrv-jnK6Tk7Ne1HmMTChv-4nYk43NT"
@@ -133,7 +133,7 @@ def prerequisites_hook_download_data():
     print("passed")
 
 
-def prerequisites_constructTrainingCSV():
+def test_generic_constructTrainingCSV():
     print("01: Constructing training CSVs")
     # delete previous csv files
     files = os.listdir(inputDir)
@@ -211,13 +211,6 @@ def test_generic_constructTrainingCSV():
             i += 1
 
 
-def test_prepare_data_for_ci():
-    # is used to run pytest session (i.e. to prepare environment, download data etc)
-    # without any real test execution
-    # to see what happens, refer to `conftest.py:pytest_sessionstart`
-    pass
-
-
 # # these are helper functions to be used in other tests
 def sanitize_outputDir():
     print("02_1: Sanitizing outputDir")
@@ -891,6 +884,7 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device):
         testingDir + "/config_classification.yaml", version_check_flag=False
     )
     training_data, parameters["headers"] = parseTrainingCSV(temp_infer_csv)
+    parameters["output_dir"] = outputDir  # this is in inference mode
     parameters["modality"] = "rad"
     parameters["patch_size"] = patch_size["3D"]
     parameters["model"]["dimension"] = 3
@@ -3121,11 +3115,7 @@ def test_generic_data_split():
     parameters = ConfigManager(
         testingDir + "/config_classification.yaml", version_check_flag=False
     )
-    parameters["nested_training"] = {
-        "testing": 5,
-        "validation": 5,
-        "stratified": True,
-    }
+    parameters["nested_training"] = {"testing": 5, "validation": 5, "stratified": True}
     # read and parse csv
     training_data, parameters["headers"] = parseTrainingCSV(
         inputDir + "/train_3d_rad_classification.csv"

From ecbaf9ecfb8e0572362f2a2a33e4267c0a7ca2fb Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Tue, 26 Mar 2024 11:49:43 -0400
Subject: [PATCH 52/59] Update test_full.py

---
 testing/test_full.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/testing/test_full.py b/testing/test_full.py
index c428508ae..090a5315f 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -109,7 +109,7 @@
 """
 
 
-def test_generic_download_data():
+def prerequisites_hook_download_data():
     print("00: Downloading the sample data")
     urlToDownload = "https://drive.google.com/uc?id=1c4Yrv-jnK6Tk7Ne1HmMTChv-4nYk43NT"
@@ -133,7 +133,7 @@ def prerequisites_hook_download_data():
     print("passed")
 
 
-def test_generic_constructTrainingCSV():
+def prerequisites_constructTrainingCSV():
     print("01: Constructing training CSVs")
     # delete previous csv files
     files = os.listdir(inputDir)
@@ -211,6 +211,13 @@ def prerequisites_constructTrainingCSV():
             i += 1
 
 
+def test_prepare_data_for_ci():
+    # is used to run pytest session (i.e. to prepare environment, download data etc)
+    # without any real test execution
+    # to see what happens, refer to `conftest.py:pytest_sessionstart`
+    pass
+
+
 # # these are helper functions to be used in other tests
 def sanitize_outputDir():
     print("02_1: Sanitizing outputDir")
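
The back-and-forth in patches 51/52 is about pytest discovery: functions named `test_*` are collected, `prerequisites_*` are not, and the empty `test_prepare_data_for_ci` exists purely so that a session has at least one collected test while the session-start hook runs the real preparation. A sketch of what the referenced `conftest.py:pytest_sessionstart` could look like; `pytest_sessionstart` is standard pytest API, but the body here is an assumption, not the repository's actual hook:

```python
# conftest.py (sketch only; the real GaNDLF conftest may differ)
def pytest_sessionstart(session):
    """Standard pytest hook: runs once, before any test executes."""
    # assumption: the prerequisite helpers are importable from the test module
    from testing.test_full import (
        prerequisites_constructTrainingCSV,
        prerequisites_hook_download_data,
    )

    prerequisites_hook_download_data()  # fetch sample data once per session
    prerequisites_constructTrainingCSV()  # build the CSVs the tests rely on
```
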
From dc4c59152be5638b08f79bbb70575964588ee5d3 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Tue, 26 Mar 2024 13:51:07 -0400
Subject: [PATCH 53/59] Update GANDLF/cli/data_split_saver.py

Co-authored-by: Viacheslav Kukushkin
---
 GANDLF/cli/data_split_saver.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py
index c024928ea..4ac9e7684 100644
--- a/GANDLF/cli/data_split_saver.py
+++ b/GANDLF/cli/data_split_saver.py
@@ -27,7 +27,6 @@ def split_data_and_save_csvs(
         validationData,
         testingData,
     ) in dataframe_split:
-
         # training and validation dataframes use the same index, since they are based on the validation split
         training_data_path = os.path.join(
             output_dir, f"training_{testing_and_valid_indeces[1]}.csv"

From 6b69e89143b54eeea7bae5aaa3b6cf6d1b1d8c14 Mon Sep 17 00:00:00 2001
From: BenMalef
Date: Wed, 27 Mar 2024 12:12:02 +0200
Subject: [PATCH 54/59] fix: fixing the linting issue

---
 GANDLF/losses/regression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py
index 49cb3c32d..6d74a33a2 100644
--- a/GANDLF/losses/regression.py
+++ b/GANDLF/losses/regression.py
@@ -1,7 +1,7 @@
 from typing import Optional
 import torch
 import torch.nn.functional as F
-from torch.nn import CrossEntropyLoss 
+from torch.nn import CrossEntropyLoss
 from GANDLF.utils import one_hot

From df9469ecdc5e0fe6a93b59f96e0a1fdee3b8902d Mon Sep 17 00:00:00 2001
From: BenMalef
Date: Wed, 27 Mar 2024 12:18:38 +0200
Subject: [PATCH 55/59] fix: fixing test_full format

---
 testing/test_full.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/testing/test_full.py b/testing/test_full.py
index 8c1a838af..8417a0352 100644
--- a/testing/test_full.py
+++ b/testing/test_full.py
@@ -481,7 +481,7 @@ def test_train_regression_brainage_rad_2d(device):
     parameters["model"]["architecture"] = "brain_age"
     parameters["model"]["onnx_export"] = False
     parameters["model"]["print_summary"] = False
-    # parameters_temp = copy.deepcopy(parameters) 
+    # parameters_temp = copy.deepcopy(parameters)
     parameters = populate_header_in_parameters(parameters, parameters["headers"])
     sanitize_outputDir()
     TrainingManager(
@@ -753,7 +753,7 @@ def test_train_inference_optimize_classification_rad_3d(device):
     parameters["model"]["architecture"] = all_models_regression[0]
     parameters["model"]["onnx_export"] = False
     parameters["model"]["print_summary"] = False
-    #parameters_temp = copy.deepcopy(parameters)
+    # parameters_temp = copy.deepcopy(parameters)
     sanitize_outputDir()
     TrainingManager(
         dataframe=training_data,

From bc40f14a28cfe39c9460e3c5436877f55cbd7735 Mon Sep 17 00:00:00 2001
From: sarthakpati
Date: Wed, 27 Mar 2024 09:27:39 -0400
Subject: [PATCH 56/59] version updated before tagging

---
 GANDLF/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/version.py b/GANDLF/version.py
index b0fb39160..d49f8dc96 100644
--- a/GANDLF/version.py
+++ b/GANDLF/version.py
@@ -2,4 +2,4 @@
 # -*- coding: UTF-8 -*-
 
 # check GaNDLF wiki for versioning and release guidelines: https://github.com/mlcommons/GaNDLF/wiki
-__version__ = "0.0.19-dev"
+__version__ = "0.0.19"

From baad9567a33e4ee6e6d9c519b0f6fc6f0a4ef006 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Wed, 27 Mar 2024 10:38:09 -0400
Subject: [PATCH 57/59] removed accidental double print

---
 GANDLF/utils/generic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GANDLF/utils/generic.py b/GANDLF/utils/generic.py
index a7a3a0f37..d665b1556 100644
--- a/GANDLF/utils/generic.py
+++ b/GANDLF/utils/generic.py
@@ -1,5 +1,6 @@
 import os, datetime, subprocess, sys
 from copy import deepcopy
+from pprint import pprint
 import random
 import numpy as np
 import torch
@@ -253,7 +254,6 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict:
         Returns:
             dict: The output metrics dictionary.
         """
-        print(input_metrics_dict)
         output_metrics_dict = deepcopy(input_metrics_dict)
         for metric in input_metrics_dict.keys():
             if isinstance(input_metrics_dict[metric], list):
@@ -265,7 +265,7 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict:
                     .split(",")
                 )
 
-        print(output_metrics_dict)
+        pprint(output_metrics_dict)
         return output_metrics_dict
 
     output_metrics_dict = deepcopy(cohort_level_metrics)

From 657f03eec13b9e63590490db2bb7db3a91e2140f Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Wed, 27 Mar 2024 10:44:21 -0400
Subject: [PATCH 58/59] no need for print at all

---
 GANDLF/utils/generic.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/GANDLF/utils/generic.py b/GANDLF/utils/generic.py
index d665b1556..8604c1cd0 100644
--- a/GANDLF/utils/generic.py
+++ b/GANDLF/utils/generic.py
@@ -1,6 +1,5 @@
 import os, datetime, subprocess, sys
 from copy import deepcopy
-from pprint import pprint
 import random
 import numpy as np
 import torch
@@ -265,7 +264,6 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict:
                     .split(",")
                 )
 
-        pprint(output_metrics_dict)
         return output_metrics_dict
 
     output_metrics_dict = deepcopy(cohort_level_metrics)

From 3c61892f9d5ffc9d17720ccfd413c1b23cfd50f0 Mon Sep 17 00:00:00 2001
From: sarthakpati
Date: Wed, 27 Mar 2024 21:52:00 -0400
Subject: [PATCH 59/59] updated version for development

---
 GANDLF/version.py                                           | 2 +-
 mlcube/model_mlcube/workspace/config.yml                    | 4 ++--
 samples/config_all_options.yaml                             | 4 ++--
 samples/config_classification.yaml                          | 4 ++--
 samples/config_getting_started_classification_histo2d.yaml | 2 +-
 samples/config_getting_started_classification_rad3d.yaml   | 2 +-
 samples/config_getting_started_regression_histo2d.yaml     | 2 +-
 samples/config_getting_started_regression_rad3d.yaml       | 2 +-
 samples/config_getting_started_segmentation_histo2d.yaml   | 2 +-
 samples/config_getting_started_segmentation_rad3d.yaml     | 4 ++--
 samples/config_regression.yaml                              | 4 ++--
 samples/config_segmentation_brats.yaml                      | 4 ++--
 samples/config_segmentation_histology.yaml                  | 4 ++--
 testing/config_classification.yaml                          | 2 +-
 testing/config_regression.yaml                              | 2 +-
 testing/config_segmentation.yaml                            | 2 +-
 tutorials/classification_medmnist_notebook/config.yaml      | 2 +-
 17 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/GANDLF/version.py b/GANDLF/version.py
index d49f8dc96..d06baf5e4 100644
--- a/GANDLF/version.py
+++ b/GANDLF/version.py
@@ -2,4 +2,4 @@
 # -*- coding: UTF-8 -*-
 
 # check GaNDLF wiki for versioning and release guidelines: https://github.com/mlcommons/GaNDLF/wiki
-__version__ = "0.0.19"
+__version__ = "0.0.20-dev"
diff --git a/mlcube/model_mlcube/workspace/config.yml b/mlcube/model_mlcube/workspace/config.yml
index 312f06d93..9b5138fed 100644
--- a/mlcube/model_mlcube/workspace/config.yml
+++ b/mlcube/model_mlcube/workspace/config.yml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 verbose: True
 # Choose the model parameters here
diff --git a/samples/config_all_options.yaml b/samples/config_all_options.yaml
index ba21517b1..3c117aa68 100644
--- a/samples/config_all_options.yaml
+++ b/samples/config_all_options.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 ## Choose the model parameters here
 model:
diff --git a/samples/config_classification.yaml b/samples/config_classification.yaml
index ec8578e82..9795ffca8 100644
--- a/samples/config_classification.yaml
+++ b/samples/config_classification.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 # Choose the model parameters here
 model:
diff --git a/samples/config_getting_started_classification_histo2d.yaml b/samples/config_getting_started_classification_histo2d.yaml
index b170d2ef2..e9b4e6208 100644
--- a/samples/config_getting_started_classification_histo2d.yaml
+++ b/samples/config_getting_started_classification_histo2d.yaml
@@ -94,6 +94,6 @@ scheduler:
 track_memory_usage: false
 verbose: false
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: true
diff --git a/samples/config_getting_started_classification_rad3d.yaml b/samples/config_getting_started_classification_rad3d.yaml
index e0dad1afc..3d5466212 100644
--- a/samples/config_getting_started_classification_rad3d.yaml
+++ b/samples/config_getting_started_classification_rad3d.yaml
@@ -99,6 +99,6 @@ scheduler:
 track_memory_usage: false
 verbose: false
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: true
diff --git a/samples/config_getting_started_regression_histo2d.yaml b/samples/config_getting_started_regression_histo2d.yaml
index 1e7621fbe..9118263ed 100644
--- a/samples/config_getting_started_regression_histo2d.yaml
+++ b/samples/config_getting_started_regression_histo2d.yaml
@@ -59,6 +59,6 @@ scheduler:
 track_memory_usage: false
 verbose: false
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: true
diff --git a/samples/config_getting_started_regression_rad3d.yaml b/samples/config_getting_started_regression_rad3d.yaml
index e5f3f03ac..4a98b1a4f 100644
--- a/samples/config_getting_started_regression_rad3d.yaml
+++ b/samples/config_getting_started_regression_rad3d.yaml
@@ -62,6 +62,6 @@ scheduler:
 track_memory_usage: false
 verbose: false
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: false
diff --git a/samples/config_getting_started_segmentation_histo2d.yaml b/samples/config_getting_started_segmentation_histo2d.yaml
index 93cd74531..97deb0e34 100644
--- a/samples/config_getting_started_segmentation_histo2d.yaml
+++ b/samples/config_getting_started_segmentation_histo2d.yaml
@@ -66,6 +66,6 @@ scheduler:
 track_memory_usage: false
 verbose: true
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: true
diff --git a/samples/config_getting_started_segmentation_rad3d.yaml b/samples/config_getting_started_segmentation_rad3d.yaml
index 986f97fd5..c05256426 100644
--- a/samples/config_getting_started_segmentation_rad3d.yaml
+++ b/samples/config_getting_started_segmentation_rad3d.yaml
@@ -89,6 +89,6 @@ scheduler:
 track_memory_usage: false
 verbose: true
 version:
-  maximum: 0.0.19
-  minimum: 0.0.19
+  maximum: 0.0.20
+  minimum: 0.0.20
 weighted_loss: true
diff --git a/samples/config_regression.yaml b/samples/config_regression.yaml
index af0df0d4f..ce7b2c806 100644
--- a/samples/config_regression.yaml
+++ b/samples/config_regression.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 # Choose the model parameters here
 model:
diff --git a/samples/config_segmentation_brats.yaml b/samples/config_segmentation_brats.yaml
index 44a2aa9fd..e90d5a92c 100644
--- a/samples/config_segmentation_brats.yaml
+++ b/samples/config_segmentation_brats.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 # Choose the model parameters here
 model:
diff --git a/samples/config_segmentation_histology.yaml b/samples/config_segmentation_histology.yaml
index bee1daf4c..6551b50c9 100644
--- a/samples/config_segmentation_histology.yaml
+++ b/samples/config_segmentation_histology.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 # Choose the model parameters here
 model:
diff --git a/testing/config_classification.yaml b/testing/config_classification.yaml
index 6e3f6e517..0482a7371 100644
--- a/testing/config_classification.yaml
+++ b/testing/config_classification.yaml
@@ -55,7 +55,7 @@
 save_output: false
 scaling_factor: 1
 scheduler: triangle
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: True
diff --git a/testing/config_regression.yaml b/testing/config_regression.yaml
index 91ee2a015..106caa969 100644
--- a/testing/config_regression.yaml
+++ b/testing/config_regression.yaml
@@ -38,7 +38,7 @@
 save_output: false
 scaling_factor: 1
 scheduler: triangle
 version:
-  maximum: 0.0.19
+  maximum: 0.0.20
   minimum: 0.0.14
 weighted_loss: false
diff --git a/testing/config_segmentation.yaml b/testing/config_segmentation.yaml
index 2bf83eab9..3006e1eb2 100644
--- a/testing/config_segmentation.yaml
+++ b/testing/config_segmentation.yaml
@@ -3,7 +3,7 @@
 version:
   {
     minimum: 0.0.14,
-    maximum: 0.0.19
+    maximum: 0.0.20
   }
 model:
   {
diff --git a/tutorials/classification_medmnist_notebook/config.yaml b/tutorials/classification_medmnist_notebook/config.yaml
index 66ce42637..309860336 100644
--- a/tutorials/classification_medmnist_notebook/config.yaml
+++ b/tutorials/classification_medmnist_notebook/config.yaml
@@ -2,7 +2,7 @@
 version:
   {
     minimum: 0.0.14,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 # Choose the model parameters here
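
The `minimum`/`maximum` pair that patch 59 bumps in every config is the window the installed GaNDLF version must fall into when a config is loaded; a dev version sorts below the plain release, so `0.0.20-dev` still satisfies a `maximum: 0.0.20` ceiling. A hedged sketch of that comparison using the `packaging` library (GaNDLF's own check may be implemented differently):

```python
from packaging.version import Version

__version__ = "0.0.20-dev"  # normalizes to 0.0.20.dev0, which sorts before 0.0.20
window = {"minimum": "0.0.14", "maximum": "0.0.20"}

current = Version(__version__)
assert Version(window["minimum"]) <= current, "GaNDLF too old for this config"
assert current <= Version(window["maximum"]), "GaNDLF too new for this config"
print("version check passed")
```
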