diff --git a/GANDLF/anonymize/__init__.py b/GANDLF/anonymize/__init__.py index e0c65ab90..a7c27539e 100644 --- a/GANDLF/anonymize/__init__.py +++ b/GANDLF/anonymize/__init__.py @@ -43,7 +43,7 @@ def run_anonymizer( input_path, output_path, anonymization_actions={}, - deletePrivateTags=parameters["delete_private_tags"], + delete_private_tags=parameters["delete_private_tags"], ) elif parameters["modality"] in ["histo", "path"]: # anonymize_slide( diff --git a/GANDLF/cli/__init__.py b/GANDLF/cli/__init__.py index 7021caa4b..cc1eda44b 100644 --- a/GANDLF/cli/__init__.py +++ b/GANDLF/cli/__init__.py @@ -6,6 +6,7 @@ from .recover_config import recover_config from .post_training_model_optimization import post_training_model_optimization from .generate_metrics import generate_metrics_dict +from .data_split_saver import split_data_and_save_csvs from datetime import date diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py new file mode 100644 index 000000000..4ac9e7684 --- /dev/null +++ b/GANDLF/cli/data_split_saver.py @@ -0,0 +1,49 @@ +from typing import Union +import os + +import pandas as pd +from GANDLF.utils import get_dataframe, split_data + + +def split_data_and_save_csvs( + input_data: Union[pd.DataFrame, str], output_dir: str, parameters: dict +) -> None: + """ + Split the data into training, validation, and testing sets and save them as csvs in the output directory + + Args: + input_data (Union[pd.Dataframe, str]): The input data to be split and saved. + output_dir (str): The output directory to save the split data. + parameters (dict): The parameters dictionary. + """ + + full_data = get_dataframe(input_data) + + dataframe_split = split_data(full_data, parameters) + + for ( + testing_and_valid_indeces, + trainingData, + validationData, + testingData, + ) in dataframe_split: + # training and validation dataframes use the same index, since they are based on the validation split + training_data_path = os.path.join( + output_dir, f"training_{testing_and_valid_indeces[1]}.csv" + ) + validation_data_path = os.path.join( + output_dir, f"validation_{testing_and_valid_indeces[1]}.csv" + ) + # testing dataframes use the first index + testing_data_path = os.path.join( + output_dir, f"testing_{testing_and_valid_indeces[0]}.csv" + ) + + for data, path in zip( + [trainingData, validationData, testingData], + [training_data_path, validation_data_path, testing_data_path], + ): + # check if the data is not None and the path does not exist + if not os.path.exists(path): + if data is not None: + data.to_csv(path, index=False) diff --git a/GANDLF/cli/main_run.py b/GANDLF/cli/main_run.py index f9676f76b..45e303254 100644 --- a/GANDLF/cli/main_run.py +++ b/GANDLF/cli/main_run.py @@ -1,4 +1,3 @@ -import os, pickle from typing import Optional from pathlib import Path @@ -9,7 +8,6 @@ populate_header_in_parameters, parseTrainingCSV, parseTestingCSV, - set_determinism, ) diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py index 0a88fa647..61e0e6b0f 100644 --- a/GANDLF/compute/training_loop.py +++ b/GANDLF/compute/training_loop.py @@ -237,9 +237,9 @@ def training_loop( params["validation_data"] = validation_data params["testing_data"] = testing_data testingDataDefined = True - if params["testing_data"] is None: - # testing_data = validation_data - testingDataDefined = False + if not isinstance(testing_data, pd.DataFrame): + if params["testing_data"] is None: + testingDataDefined = False # Setup a few variables for tracking best_loss = 1e7 diff --git 
a/GANDLF/config_manager.py b/GANDLF/config_manager.py index 4db8ae1c8..49fda1b58 100644 --- a/GANDLF/config_manager.py +++ b/GANDLF/config_manager.py @@ -630,6 +630,12 @@ def _parseConfig( "nested_training" in params ), "The parameter 'nested_training' needs to be defined" # initialize defaults for nested training + params["nested_training"]["stratified"] = params["nested_training"].get( + "stratified", False + ) + params["nested_training"]["stratified"] = params["nested_training"].get( + "proportional", params["nested_training"]["stratified"] + ) params["nested_training"]["testing"] = params["nested_training"].get("testing", -5) params["nested_training"]["validation"] = params["nested_training"].get( "validation", -5 diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py index bd7911895..6d74a33a2 100644 --- a/GANDLF/losses/regression.py +++ b/GANDLF/losses/regression.py @@ -1,7 +1,7 @@ from typing import Optional import torch import torch.nn.functional as F -from torch.nn import MSELoss, CrossEntropyLoss, L1Loss +from torch.nn import CrossEntropyLoss from GANDLF.utils import one_hot diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py index 41c188d74..e605af6f6 100644 --- a/GANDLF/training_manager.py +++ b/GANDLF/training_manager.py @@ -1,10 +1,9 @@ import pandas as pd -import os, sys, pickle, subprocess, shutil -from sklearn.model_selection import KFold +import os, pickle, shutil from pathlib import Path from GANDLF.compute import training_loop -from GANDLF.utils import get_dataframe +from GANDLF.utils import get_dataframe, split_data def TrainingManager( @@ -44,269 +43,95 @@ def TrainingManager( ) parameters = pickle.load(open(currentModelConfigPickle, "rb")) - # check for single fold training - singleFoldValidation = False - singleFoldTesting = False - noTestingData = False - # if the user wants a single fold training - if parameters["nested_training"]["testing"] < 0: - parameters["nested_training"]["testing"] = abs( - parameters["nested_training"]["testing"] - ) - singleFoldTesting = True + dataframe_split = split_data(dataframe, parameters) - # if the user wants a single fold training - if parameters["nested_training"]["validation"] < 0: - parameters["nested_training"]["validation"] = abs( - parameters["nested_training"]["validation"] - ) - singleFoldValidation = True + last_indeces, _, _, _ = dataframe_split[-1] - # this is the condition where testing data is not to be kept - if parameters["nested_training"]["testing"] == 1: - noTestingData = True - singleFoldTesting = True - # put 2 just so that the first for-loop does not fail - parameters["nested_training"]["testing"] = 2 - - # initialize the kfold structures - kf_testing = KFold(n_splits=parameters["nested_training"]["testing"]) - kf_validation = KFold(n_splits=parameters["nested_training"]["validation"]) - - currentTestingFold = 0 - - # split across subjects - subjectIDs_full = ( - dataframe[dataframe.columns[parameters["headers"]["subjectIDHeader"]]] - .unique() - .tolist() - ) - - # get the indeces for kfold splitting - trainingData_full = dataframe - - # start the kFold train for testing - for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full): - # ensure the validation fold is initialized per-testing split - currentValidationFold = 0 - - trainingAndValidationData = pd.DataFrame() # initialize the variable - testingData = pd.DataFrame() # initialize the variable - # get the current training and testing data - if noTestingData: - # don't consider the split indeces for this 
case - trainingAndValidationData = trainingData_full - testingData = None - else: - # loop over all trainAndVal_index and construct new dataframe - for subject_idx in trainAndVal_index: - trainingAndValidationData = trainingAndValidationData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) - - # loop over all testing_index and construct new dataframe - for subject_idx in testing_index: - testingData = testingData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) + # check the last indeces to see if single fold training is requested + singleFoldTesting = True if last_indeces[0] == 0 else False + singleFoldValidation = True if last_indeces[1] == 0 else False + for ( + testing_and_valid_indeces, + trainingData, + validationData, + testingData, + ) in dataframe_split: # the output of the current fold is only needed if multi-fold training is happening - if singleFoldTesting: - currentOutputFolder = outputDir - else: - currentOutputFolder = os.path.join( - outputDir, "testing_" + str(currentTestingFold) + currentTestingOutputFolder = outputDir + if not singleFoldTesting: + currentTestingOutputFolder = os.path.join( + outputDir, "testing_" + str(testing_and_valid_indeces[0]) ) - Path(currentOutputFolder).mkdir(parents=True, exist_ok=True) + Path(currentTestingOutputFolder).mkdir(parents=True, exist_ok=True) - # save the current training+validation and testing datasets - if noTestingData: - print( - "WARNING: Testing data is empty, which will result in scientifically incorrect results; use at your own risk." - ) - current_training_subject_indeces_full = subjectIDs_full - currentTestingDataPickle = "None" - else: - currentTrainingAndValidationDataPickle = os.path.join( - currentOutputFolder, "data_trainAndVal.pkl" + currentValidationOutputFolder = currentTestingOutputFolder + if not singleFoldValidation: + currentValidationOutputFolder = os.path.join( + currentTestingOutputFolder, str(testing_and_valid_indeces[1]) ) - currentTestingDataPickle = os.path.join( - currentOutputFolder, "data_testing.pkl" - ) - - if (not os.path.exists(currentTestingDataPickle)) or reset or resume: - testingData.to_pickle(currentTestingDataPickle) - else: - if os.path.exists(currentTestingDataPickle): - print( - "Using previously saved testing data", - currentTestingDataPickle, - flush=True, - ) - testingData = pd.read_pickle(currentTestingDataPickle) - - if ( - (not os.path.exists(currentTrainingAndValidationDataPickle)) - or reset - or resume - ): - trainingAndValidationData.to_pickle( - currentTrainingAndValidationDataPickle + Path(currentValidationOutputFolder).mkdir(parents=True, exist_ok=True) + + # initialize the dataframes and save them to disk + data_dict = { + "training": trainingData, + "validation": validationData, + "testing": testingData, + } + data_dict_files = {} + for data_type, data in data_dict.items(): + data_dict_files[data_type] = None + if data is not None: + currentDataPickle = os.path.join( + currentValidationOutputFolder, "data_" + data_type + ".pkl" ) - else: - if os.path.exists(currentTrainingAndValidationDataPickle): - print( - "Using previously saved training+validation data", - currentTrainingAndValidationDataPickle, - flush=True, - ) - trainingAndValidationData = pd.read_pickle( - currentTrainingAndValidationDataPickle - ) - - 
current_training_subject_indeces_full = ( - trainingAndValidationData[ - trainingAndValidationData.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - .unique() - .tolist() + data_dict_files[data_type] = currentDataPickle + if (not os.path.exists(currentDataPickle)) or reset or resume: + data.to_pickle(currentDataPickle) + data.to_csv(currentDataPickle.replace(".pkl", ".csv"), index=False) + else: + # read the data from the pickle if present + data_dict[data_type] = get_dataframe(currentDataPickle) + + # parallel_compute_command is an empty string, thus no parallel computing requested + if not parameters["parallel_compute_command"]: + training_loop( + training_data=data_dict["training"], + validation_data=data_dict["validation"], + output_dir=currentValidationOutputFolder, + device=device, + params=parameters, + testing_data=data_dict["testing"], ) - # start the kFold train for validation - for train_index, val_index in kf_validation.split( - current_training_subject_indeces_full - ): - # the output of the current fold is only needed if multi-fold training is happening - if singleFoldValidation: - currentValOutputFolder = currentOutputFolder - else: - currentValOutputFolder = os.path.join( - currentOutputFolder, str(currentValidationFold) - ) - Path(currentValOutputFolder).mkdir(parents=True, exist_ok=True) - - trainingData = pd.DataFrame() # initialize the variable - validationData = pd.DataFrame() # initialize the variable - - # loop over all train_index and construct new dataframe - for subject_idx in train_index: - trainingData = trainingData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) - - # loop over all val_index and construct new dataframe - for subject_idx in val_index: - validationData = validationData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) - - # # write parameters to pickle - this should not change for the different folds, so keeping is independent - ## pickle/unpickle data - # pickle the data - currentTrainingDataPickle = os.path.join( - currentValOutputFolder, "data_training.pkl" - ) - currentValidationDataPickle = os.path.join( - currentValOutputFolder, "data_validation.pkl" + else: + # call hpc command here + parallel_compute_command_actual = parameters[ + "parallel_compute_command" + ].replace("${outputDir}", currentValidationOutputFolder) + + assert ( + "python" in parallel_compute_command_actual + ), "The 'parallel_compute_command_actual' needs to have the python from the virtual environment, which is usually '${GANDLF_dir}/venv/bin/python'" + + command = ( + parallel_compute_command_actual + + " -m GANDLF.training_loop -train_loader_pickle " + + data_dict_files["training"] + + " -val_loader_pickle " + + data_dict_files["validation"] + + " -parameter_pickle " + + currentModelConfigPickle + + " -device " + + str(device) + + " -outputDir " + + currentValidationOutputFolder + + " -testing_loader_pickle " + + data_dict_files["testing"] ) - if (not os.path.exists(currentTrainingDataPickle)) or reset or resume: - trainingData.to_pickle(currentTrainingDataPickle) - trainingData.to_csv( - currentTrainingDataPickle.replace(".pkl", ".csv"), index=False - ) - else: - trainingData = get_dataframe(currentTrainingDataPickle) - if (not os.path.exists(currentValidationDataPickle)) or reset or resume: - 
validationData.to_pickle(currentValidationDataPickle) - validationData.to_csv( - currentValidationDataPickle.replace(".pkl", ".csv"), index=False - ) - else: - validationData = get_dataframe(currentValidationDataPickle) - - # parallel_compute_command is an empty string, thus no parallel computing requested - if (not parameters["parallel_compute_command"]) or (singleFoldValidation): - training_loop( - training_data=trainingData, - validation_data=validationData, - output_dir=currentValOutputFolder, - device=device, - params=parameters, - testing_data=testingData, - ) - - else: - # call qsub here - parallel_compute_command_actual = parameters[ - "parallel_compute_command" - ].replace("${outputDir}", currentValOutputFolder) - - if not ("python" in parallel_compute_command_actual): - sys.exit( - "The 'parallel_compute_command_actual' needs to have the python from the virtual environment, which is usually '${GANDLF_dir}/venv/bin/python'" - ) - - command = ( - parallel_compute_command_actual - + " -m GANDLF.training_loop -train_loader_pickle " - + currentTrainingDataPickle - + " -val_loader_pickle " - + currentValidationDataPickle - + " -parameter_pickle " - + currentModelConfigPickle - + " -device " - + str(device) - + " -outputDir " - + currentValOutputFolder - + " -testing_loader_pickle " - + currentTestingDataPickle - ) - - print( - "Submitting job for testing split " - + str(currentTestingFold) - + " and validation split " - + str(currentValidationFold) - ) - subprocess.Popen(command, shell=True).wait() - - if singleFoldValidation: - break - currentValidationFold += 1 # go to next fold - if singleFoldTesting: - break - currentTestingFold += 1 # go to next fold + print("Running command: ", command, flush=True) + os.system(command, flush=True) def TrainingManager_split( diff --git a/GANDLF/utils/__init__.py b/GANDLF/utils/__init__.py index 311edeed8..66d830d3d 100644 --- a/GANDLF/utils/__init__.py +++ b/GANDLF/utils/__init__.py @@ -66,3 +66,5 @@ save_model, optimize_and_save_model, ) + +from .data_splitter import split_data diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py new file mode 100644 index 000000000..6976d7629 --- /dev/null +++ b/GANDLF/utils/data_splitter.py @@ -0,0 +1,253 @@ +from typing import List, Tuple +import pandas as pd +from sklearn.model_selection import KFold, StratifiedKFold + +from . import parseTrainingCSV, populate_header_in_parameters + + +def split_data( + full_dataset: pd.DataFrame, parameters: dict +) -> List[Tuple[Tuple[int, int], pd.DataFrame, pd.DataFrame, pd.DataFrame]]: + """ + Split the data into training, validation, and testing sets. + + Args: + full_dataset (pd.DataFrame): The full dataset to split. + parameters (dict): The parameters to use for splitting the data, which should contain the "nested_training" key with relevant information. + + Returns: + List[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]: A list of tuples, each containing the a tuple of the testing & validation split indeces, and training, validation, and testing sets. 
+ """ + assert ( + "nested_training" in parameters + ), "`nested_training` key missing in parameters" + # populate the headers + _, parameters["headers"] = ( + parseTrainingCSV(full_dataset) if "headers" not in parameters else full_dataset, + parameters["headers"], + ) + + parameters = ( + populate_header_in_parameters(parameters, parameters["headers"]) + if "problem_type" not in parameters + else parameters + ) + + stratified_splitting = parameters["nested_training"].get("stratified") + + return_data = [] + + # check for single fold training + singleFoldValidation = False + singleFoldTesting = False + # if the user wants a single fold training + testing_folds = parameters["nested_training"]["testing"] + if testing_folds < 0: + testing_folds = abs(testing_folds) + singleFoldTesting = True + + # if the user wants a single fold training + validation_folds = parameters["nested_training"]["validation"] + if validation_folds < 0: + validation_folds = abs(validation_folds) + singleFoldValidation = True + + # this is the condition where testing data is not to be kept + noTestingData = False + if testing_folds == 1: + noTestingData = True + singleFoldTesting = True + # put 2 just so that the first for-loop does not fail + testing_folds = 2 + print( + "WARNING: Testing data is empty, which will result in scientifically incorrect results; use at your own risk." + ) + + # get unique subject IDs + subjectIDs_full = ( + full_dataset[full_dataset.columns[parameters["headers"]["subjectIDHeader"]]] + .unique() + .tolist() + ) + + all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index) + + # checks for stratified splitting + if stratified_splitting: + # it can only be done for classification problems + assert ( + parameters["problem_type"] == "classification" + ), "Stratified splitting is only possible for classification problems." + # it can only be done when all subjects are unique + assert ( + all_subjects_are_unique + ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset." 
+ + # get the targets for prediction for classification + target_testing = False # initialize this so that the downstream code does not fail - for KFold, this is shuffle + if parameters["problem_type"] == "classification": + target_testing = full_dataset.loc[ + :, full_dataset.columns[parameters["headers"]["predictionHeaders"]] + ] + target_validation = target_testing + + folding_type = KFold + if stratified_splitting: + folding_type = StratifiedKFold + + kf_testing = folding_type(n_splits=testing_folds) + kf_validation = folding_type(n_splits=validation_folds) + + # start StratifiedKFold splitting + currentTestingFold = 0 + if stratified_splitting: + for trainAndVal_index, testing_index in kf_testing.split( + full_dataset, target_testing + ): + # ensure the validation fold is initialized per-testing split + currentValidationFold = 0 + + trainingAndValidationData, testingData = ( + pd.DataFrame(), + pd.DataFrame(), + ) # initialize the variables + # get the current training and testing data + if noTestingData: + # don't consider the split indeces for this case + trainingAndValidationData = full_dataset + # this should be None to ensure downstream code does not fail + testingData = None + else: + trainingAndValidationData = full_dataset.loc[trainAndVal_index, :] + trainingAndValidationData.reset_index(drop=True, inplace=True) + testingData = full_dataset.loc[testing_index, :] + # update the targets after the split + target_validation = trainingAndValidationData.loc[ + :, full_dataset.columns[parameters["headers"]["predictionHeaders"]] + ] + + for train_index, val_index in kf_validation.split( + trainingAndValidationData, target_validation + ): + # get the current training and validation data + trainingData = trainingAndValidationData.loc[train_index, :] + validationData = trainingAndValidationData.loc[val_index, :] + return_data.append( + ( + (currentTestingFold, currentValidationFold), + trainingData, + validationData, + testingData, + ) + ) + currentValidationFold += 1 # increment the validation fold + if singleFoldValidation: + break + + currentTestingFold += 1 # increment the testing fold + if singleFoldTesting: + break + else: + # start the kFold train for testing + for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full): + # ensure the validation fold is initialized per-testing split + currentValidationFold = 0 + + trainingAndValidationData, testingData = ( + pd.DataFrame(), + pd.DataFrame(), + ) # initialize the variables + # get the current training and testing data + if noTestingData: + # don't consider the split indeces for this case + trainingAndValidationData = full_dataset + # this should be None to ensure downstream code does not fail + testingData = None + else: + # loop over all trainAndVal_index and construct new dataframe + for subject_idx in trainAndVal_index: + trainingAndValidationData = trainingAndValidationData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + # loop over all testing_index and construct new dataframe + for subject_idx in testing_index: + testingData = testingData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + current_training_subject_indeces_full = ( + trainingAndValidationData[ + trainingAndValidationData.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + .unique() + .tolist() + ) + + # start the 
kFold train for validation + for train_index, val_index in kf_validation.split( + current_training_subject_indeces_full + ): + trainingData = pd.DataFrame() # initialize the variable + validationData = pd.DataFrame() # initialize the variable + + # loop over all train_index and construct new dataframe + for subject_idx in train_index: + trainingData = trainingData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + # loop over all val_index and construct new dataframe + for subject_idx in val_index: + validationData = validationData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + return_data.append( + ( + (currentTestingFold, currentValidationFold), + trainingData, + validationData, + testingData, + ) + ) + + currentValidationFold += 1 # go to next fold + if singleFoldValidation: + break + + currentTestingFold += 1 # go to next fold + if singleFoldTesting: + break + + return return_data diff --git a/GANDLF/utils/generic.py b/GANDLF/utils/generic.py index a7a3a0f37..8604c1cd0 100644 --- a/GANDLF/utils/generic.py +++ b/GANDLF/utils/generic.py @@ -253,7 +253,6 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict Returns: dict: The output metrics dictionary. """ - print(input_metrics_dict) output_metrics_dict = deepcopy(input_metrics_dict) for metric in input_metrics_dict.keys(): if isinstance(input_metrics_dict[metric], list): @@ -265,7 +264,6 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict .split(",") ) - print(output_metrics_dict) return output_metrics_dict output_metrics_dict = deepcopy(cohort_level_metrics) diff --git a/GANDLF/version.py b/GANDLF/version.py index b0fb39160..d06baf5e4 100644 --- a/GANDLF/version.py +++ b/GANDLF/version.py @@ -2,4 +2,4 @@ # -*- coding: UTF-8 -*- # check GaNDLF wiki for versioning and release guidelines: https://github.com/mlcommons/GaNDLF/wiki -__version__ = "0.0.19-dev" +__version__ = "0.0.20-dev" diff --git a/docs/usage.md b/docs/usage.md index 874dd007e..2dcb79a58 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -106,6 +106,8 @@ SubjectID,Channel_0,Channel_1,...,Channel_X,Label N,/full/path/N/0.nii.gz,/full/path/N/1.nii.gz,...,/full/path/N/X.nii.gz,/full/path/N/segmentation.nii.gz ``` +**Notes:** + - `Channel` can be substituted with `Modality` or `Image` - `Label` can be substituted with `Mask` or `Segmentation`and is used to specify the annotation file for segmentation models - For classification/regression, add a column called `ValueToPredict`. Currently, we are supporting only a single value prediction per model. @@ -162,6 +164,19 @@ The following command shows how the script works: - `SubjectID` or `PatientName` is used to ensure that the randomized split is done per-subject rather than per-image. - For data arrangement different to what is described above, a customized script will need to be written to generate the CSV, or you can enter the data manually into the CSV. +### Using the `gandlf_splitCSV` application + +To split the data CSV into training, validation, and testing CSVs, the `gandlf_splitCSV` script can be used. 
The following command shows how the script works:
+
+```bash
+# continue from previous shell
+(venv_gandlf) $> python gandlf_splitCSV \
+  # -h, --help         Show help message and exit
+  -i ./experiment_0/train_data.csv \ # output CSV from the `gandlf_constructCSV` script
+  -c $gandlf_config \ # the GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed
+  -o $output_dir # the output directory to save the split data
+```
+
 
 ## Customize the Training
diff --git a/gandlf_splitCSV b/gandlf_splitCSV
new file mode 100644
index 000000000..339fec2ac
--- /dev/null
+++ b/gandlf_splitCSV
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os, argparse, sys, yaml
+from GANDLF.cli import copyrightMessage, split_data_and_save_csvs
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="GANDLF_SplitCSV",
+        formatter_class=argparse.RawTextHelpFormatter,
+        description="Split the data into training, validation, and testing sets and save them as csvs in the output directory.\n\n"
+        + copyrightMessage,
+    )
+    parser.add_argument(
+        "-i",
+        "--inputCSV",
+        metavar="",
+        default=None,
+        type=str,
+        required=True,
+        help="Input CSV file which contains the data to be split.",
+    )
+    parser.add_argument(
+        "-c",
+        "--config",
+        metavar="",
+        default=None,
+        required=True,
+        type=str,
+        help="The GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed.",
+    )
+    parser.add_argument(
+        "-o",
+        "--outputDir",
+        metavar="",
+        default=None,
+        type=str,
+        required=True,
+        help="Output directory to save the split data.",
+    )
+
+    args = parser.parse_args()
+
+    # check for required parameters - this is needed here to keep the cli clean
+    for param_none_check in [args.inputCSV, args.outputDir, args.config]:
+        if param_none_check is None:
+            sys.exit("ERROR: Missing required parameter: " + str(param_none_check))
+
+    inputCSV = os.path.normpath(args.inputCSV)
+    outputDir = os.path.normpath(args.outputDir)
+    # initialize default
+    config = {"nested_training": {"testing": 5, "validation": 5}}
+    if os.path.isfile(args.config):
+        config = yaml.safe_load(open(args.config, "r"))
+
+    print("Config used for split:", config)
+
+    split_data_and_save_csvs(inputCSV, outputDir, config)
+
+    print("Finished successfully.")
+
+
+# main function
+if __name__ == "__main__":
+    main()
diff --git a/mlcube/model_mlcube/workspace/config.yml b/mlcube/model_mlcube/workspace/config.yml
index 312f06d93..9b5138fed 100644
--- a/mlcube/model_mlcube/workspace/config.yml
+++ b/mlcube/model_mlcube/workspace/config.yml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 verbose: True
 # Choose the model parameters here
diff --git a/samples/config_all_options.yaml b/samples/config_all_options.yaml
index 7eb85cbd4..3c117aa68 100644
--- a/samples/config_all_options.yaml
+++ b/samples/config_all_options.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 ## Choose the model parameters here
 model:
@@ -138,6 +138,7 @@ optimizer: adam
 # for train on a single fold, use '-' before the fold number to make the number of folds
"negative" -- NOT recommended nested_training: { + stratified: False, # this will perform stratified k-fold cross-validation but only with offline data splitting, see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html testing: 5, # this controls the number of testing data folds for final model evaluation; [NOT recommended] to disable this, use '1' validation: 5 # this controls the number of validation data folds to be used for model *selection* during training (not used for back-propagation) } diff --git a/samples/config_classification.yaml b/samples/config_classification.yaml index ec8578e82..9795ffca8 100644 --- a/samples/config_classification.yaml +++ b/samples/config_classification.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_getting_started_classification_histo2d.yaml b/samples/config_getting_started_classification_histo2d.yaml index b170d2ef2..e9b4e6208 100644 --- a/samples/config_getting_started_classification_histo2d.yaml +++ b/samples/config_getting_started_classification_histo2d.yaml @@ -94,6 +94,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_classification_rad3d.yaml b/samples/config_getting_started_classification_rad3d.yaml index e0dad1afc..3d5466212 100644 --- a/samples/config_getting_started_classification_rad3d.yaml +++ b/samples/config_getting_started_classification_rad3d.yaml @@ -99,6 +99,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_histo2d.yaml b/samples/config_getting_started_regression_histo2d.yaml index 1e7621fbe..9118263ed 100644 --- a/samples/config_getting_started_regression_histo2d.yaml +++ b/samples/config_getting_started_regression_histo2d.yaml @@ -59,6 +59,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_rad3d.yaml b/samples/config_getting_started_regression_rad3d.yaml index e5f3f03ac..4a98b1a4f 100644 --- a/samples/config_getting_started_regression_rad3d.yaml +++ b/samples/config_getting_started_regression_rad3d.yaml @@ -62,6 +62,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: false diff --git a/samples/config_getting_started_segmentation_histo2d.yaml b/samples/config_getting_started_segmentation_histo2d.yaml index 93cd74531..97deb0e34 100644 --- a/samples/config_getting_started_segmentation_histo2d.yaml +++ b/samples/config_getting_started_segmentation_histo2d.yaml @@ -66,6 +66,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_segmentation_rad3d.yaml b/samples/config_getting_started_segmentation_rad3d.yaml index 986f97fd5..c05256426 100644 --- a/samples/config_getting_started_segmentation_rad3d.yaml +++ b/samples/config_getting_started_segmentation_rad3d.yaml @@ -89,6 
+89,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.0.19 - minimum: 0.0.19 + maximum: 0.0.20 + minimum: 0.0.20 weighted_loss: true diff --git a/samples/config_regression.yaml b/samples/config_regression.yaml index af0df0d4f..ce7b2c806 100644 --- a/samples/config_regression.yaml +++ b/samples/config_regression.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_brats.yaml b/samples/config_segmentation_brats.yaml index 44a2aa9fd..e90d5a92c 100644 --- a/samples/config_segmentation_brats.yaml +++ b/samples/config_segmentation_brats.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_histology.yaml b/samples/config_segmentation_histology.yaml index bee1daf4c..6551b50c9 100644 --- a/samples/config_segmentation_histology.yaml +++ b/samples/config_segmentation_histology.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/setup.py b/setup.py index b582515c1..5e7d74c4b 100644 --- a/setup.py +++ b/setup.py @@ -108,7 +108,7 @@ def run(self): "segmentation-models-pytorch==0.3.3", "ACSConv==0.1.1", "docker", - "dicom-anonymizer", + "dicom-anonymizer==1.0.12", "twine", "zarr", "keyring", diff --git a/testing/config_classification.yaml b/testing/config_classification.yaml index 6e3f6e517..0482a7371 100644 --- a/testing/config_classification.yaml +++ b/testing/config_classification.yaml @@ -55,7 +55,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: True diff --git a/testing/config_regression.yaml b/testing/config_regression.yaml index 91ee2a015..106caa969 100644 --- a/testing/config_regression.yaml +++ b/testing/config_regression.yaml @@ -38,7 +38,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: false diff --git a/testing/config_segmentation.yaml b/testing/config_segmentation.yaml index 2bf83eab9..3006e1eb2 100644 --- a/testing/config_segmentation.yaml +++ b/testing/config_segmentation.yaml @@ -3,7 +3,7 @@ version: { minimum: 0.0.14, - maximum: 0.0.19 + maximum: 0.0.20 } model: { diff --git a/testing/test_full.py b/testing/test_full.py index 2c8725c91..8417a0352 100644 --- a/testing/test_full.py +++ b/testing/test_full.py @@ -32,6 +32,7 @@ recover_config, post_training_model_optimization, generate_metrics_dict, + split_data_and_save_csvs, ) from GANDLF.schedulers import global_schedulers_dict from GANDLF.optimizers import global_optimizer_dict @@ -480,7 +481,7 @@ def test_train_regression_brainage_rad_2d(device): parameters["model"]["architecture"] = "brain_age" 
parameters["model"]["onnx_export"] = False parameters["model"]["print_summary"] = False - parameters_temp = copy.deepcopy(parameters) + # parameters_temp = copy.deepcopy(parameters) parameters = populate_header_in_parameters(parameters, parameters["headers"]) sanitize_outputDir() TrainingManager( @@ -752,7 +753,7 @@ def test_train_inference_optimize_classification_rad_3d(device): parameters["model"]["architecture"] = all_models_regression[0] parameters["model"]["onnx_export"] = False parameters["model"]["print_summary"] = False - parameters_temp = copy.deepcopy(parameters) + # parameters_temp = copy.deepcopy(parameters) sanitize_outputDir() TrainingManager( dataframe=training_data, @@ -842,6 +843,13 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): parameters["patch_size"] = patch_size["3D"] parameters["model"]["dimension"] = 3 parameters["model"]["final_layer"] = "logits" + # loop through selected models and train for single epoch + model = all_models_regression[0] + parameters["model"]["architecture"] = model + parameters["model"]["onnx_export"] = False + parameters["model"]["print_summary"] = False + ## add stratified splitting + parameters["nested_training"]["stratified"] = True # read and parse csv training_data, parameters["headers"] = parseTrainingCSV( @@ -849,20 +857,30 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): ) parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"]) parameters = populate_header_in_parameters(parameters, parameters["headers"]) - # loop through selected models and train for single epoch - model = all_models_regression[0] - parameters["model"]["architecture"] = model - parameters["model"]["onnx_export"] = False - parameters["model"]["print_summary"] = False - sanitize_outputDir() - TrainingManager( - dataframe=training_data, - outputDir=outputDir, - parameters=parameters, - device=device, - resume=False, - reset=True, - ) + # duplicate the data to test stratified sampling + training_data_duplicate = training_data._append(training_data) + for _ in range(1): + training_data_duplicate = training_data_duplicate._append( + training_data_duplicate + ) + training_data_duplicate.reset_index(drop=True, inplace=True) + # ensure subjects are not duplicated + training_data_duplicate["SubjectID"] = training_data_duplicate.index + + # ensure every part of the code is tested + for folds in [2, 1, -5]: + ## add stratified folding information + parameters["nested_training"]["testing"] = folds + parameters["nested_training"]["validation"] = folds if folds != 1 else -5 + sanitize_outputDir() + TrainingManager( + dataframe=training_data_duplicate, + outputDir=outputDir, + parameters=parameters, + device=device, + resume=False, + reset=True, + ) ## this is to test if inference can run without having ground truth column training_data.drop("ValueToPredict", axis=1, inplace=True) training_data.drop("Label", axis=1, inplace=True) @@ -874,7 +892,6 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): ) training_data, parameters["headers"] = parseTrainingCSV(temp_infer_csv) parameters["output_dir"] = outputDir # this is in inference mode - parameters["output_dir"] = outputDir # this is in inference mode parameters["modality"] = "rad" parameters["patch_size"] = patch_size["3D"] parameters["model"]["dimension"] = 3 @@ -3098,6 +3115,37 @@ def test_generic_deploy_metrics_docker(): print("passed") + +def test_generic_data_split(): + print("51: Starting test for 
splitting and saving CSVs") + # read and initialize parameters for specific data dimension + parameters = ConfigManager( + testingDir + "/config_classification.yaml", version_check_flag=False + ) + parameters["nested_training"] = {"testing": 5, "validation": 5, "stratified": True} + # read and parse csv + training_data, parameters["headers"] = parseTrainingCSV( + inputDir + "/train_3d_rad_classification.csv" + ) + parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"]) + parameters = populate_header_in_parameters(parameters, parameters["headers"]) + # duplicate the data to test stratified sampling + training_data_duplicate = training_data._append(training_data) + for _ in range(1): + training_data_duplicate = training_data_duplicate._append( + training_data_duplicate + ) + training_data_duplicate.reset_index(drop=True, inplace=True) + # ensure subjects are not duplicated + training_data_duplicate["SubjectID"] = training_data_duplicate.index + + sanitize_outputDir() + + split_data_and_save_csvs(training_data_duplicate, outputDir, parameters) + + files_in_outputDir = os.listdir(outputDir) + assert len(files_in_outputDir) == 15, "CSVs were not split correctly" + sanitize_outputDir() print("passed") diff --git a/tutorials/classification_medmnist_notebook/config.yaml b/tutorials/classification_medmnist_notebook/config.yaml index 66ce42637..309860336 100644 --- a/tutorials/classification_medmnist_notebook/config.yaml +++ b/tutorials/classification_medmnist_notebook/config.yaml @@ -2,7 +2,7 @@ version: { minimum: 0.0.14, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here
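
Usage sketch (not part of the patch): the diff introduces the `split_data` utility and the `split_data_and_save_csvs` helper, but the only end-to-end example lives in the test suite. The sketch below shows how the new API is expected to be driven, mirroring `test_generic_data_split` in `testing/test_full.py`; the config, CSV, and output paths are hypothetical placeholders, and the output directory is assumed to already exist. Setting `nested_training.stratified: True` additionally requires a classification problem and unique subject IDs, per the asserts in `GANDLF/utils/data_splitter.py`.

```python
# Hypothetical paths; the call sequence mirrors test_generic_data_split in testing/test_full.py.
from GANDLF.config_manager import ConfigManager
from GANDLF.utils import parseTrainingCSV, populate_header_in_parameters, split_data
from GANDLF.cli import split_data_and_save_csvs

config_file = "./experiment_0/config_classification.yaml"  # full GaNDLF config
data_csv = "./experiment_0/train_data.csv"  # GaNDLF-format data CSV
output_dir = "./experiment_0/splits"  # assumed to already exist

parameters = ConfigManager(config_file, version_check_flag=False)
# 5 testing / 5 validation folds; a negative value requests a single fold,
# and testing: 1 disables the testing split (testing dataframes become None)
parameters["nested_training"] = {"testing": 5, "validation": 5, "stratified": False}

# populate headers and problem type the same way the training path does
full_data, parameters["headers"] = parseTrainingCSV(data_csv)
parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"])
parameters = populate_header_in_parameters(parameters, parameters["headers"])

# in-memory folds: each element is
# ((testing_fold, validation_fold), training_df, validation_df, testing_df)
for (test_fold, val_fold), train_df, val_df, test_df in split_data(full_data, parameters):
    n_test = 0 if test_df is None else len(test_df)
    print(f"fold ({test_fold}, {val_fold}): {len(train_df)} train, {len(val_df)} val, {n_test} test rows")

# or persist the folds as training_*/validation_*/testing_*.csv files
split_data_and_save_csvs(full_data, output_dir, parameters)
```

The `gandlf_splitCSV` entry point added in this diff is a thin CLI wrapper around the same `split_data_and_save_csvs` call, with the config read from YAML instead of being built in memory.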