diff --git a/GANDLF/anonymize/__init__.py b/GANDLF/anonymize/__init__.py index e0c65ab90..a7c27539e 100644 --- a/GANDLF/anonymize/__init__.py +++ b/GANDLF/anonymize/__init__.py @@ -43,7 +43,7 @@ def run_anonymizer( input_path, output_path, anonymization_actions={}, - deletePrivateTags=parameters["delete_private_tags"], + delete_private_tags=parameters["delete_private_tags"], ) elif parameters["modality"] in ["histo", "path"]: # anonymize_slide( diff --git a/GANDLF/cli/__init__.py b/GANDLF/cli/__init__.py index 7021caa4b..cc1eda44b 100644 --- a/GANDLF/cli/__init__.py +++ b/GANDLF/cli/__init__.py @@ -6,6 +6,7 @@ from .recover_config import recover_config from .post_training_model_optimization import post_training_model_optimization from .generate_metrics import generate_metrics_dict +from .data_split_saver import split_data_and_save_csvs from datetime import date diff --git a/GANDLF/cli/data_split_saver.py b/GANDLF/cli/data_split_saver.py new file mode 100644 index 000000000..4ac9e7684 --- /dev/null +++ b/GANDLF/cli/data_split_saver.py @@ -0,0 +1,49 @@ +from typing import Union +import os + +import pandas as pd +from GANDLF.utils import get_dataframe, split_data + + +def split_data_and_save_csvs( + input_data: Union[pd.DataFrame, str], output_dir: str, parameters: dict +) -> None: + """ + Split the data into training, validation, and testing sets and save them as csvs in the output directory + + Args: + input_data (Union[pd.Dataframe, str]): The input data to be split and saved. + output_dir (str): The output directory to save the split data. + parameters (dict): The parameters dictionary. + """ + + full_data = get_dataframe(input_data) + + dataframe_split = split_data(full_data, parameters) + + for ( + testing_and_valid_indeces, + trainingData, + validationData, + testingData, + ) in dataframe_split: + # training and validation dataframes use the same index, since they are based on the validation split + training_data_path = os.path.join( + output_dir, f"training_{testing_and_valid_indeces[1]}.csv" + ) + validation_data_path = os.path.join( + output_dir, f"validation_{testing_and_valid_indeces[1]}.csv" + ) + # testing dataframes use the first index + testing_data_path = os.path.join( + output_dir, f"testing_{testing_and_valid_indeces[0]}.csv" + ) + + for data, path in zip( + [trainingData, validationData, testingData], + [training_data_path, validation_data_path, testing_data_path], + ): + # check if the data is not None and the path does not exist + if not os.path.exists(path): + if data is not None: + data.to_csv(path, index=False) diff --git a/GANDLF/cli/main_run.py b/GANDLF/cli/main_run.py index f9676f76b..45e303254 100644 --- a/GANDLF/cli/main_run.py +++ b/GANDLF/cli/main_run.py @@ -1,4 +1,3 @@ -import os, pickle from typing import Optional from pathlib import Path @@ -9,7 +8,6 @@ populate_header_in_parameters, parseTrainingCSV, parseTestingCSV, - set_determinism, ) diff --git a/GANDLF/compute/training_loop.py b/GANDLF/compute/training_loop.py index 0a88fa647..61e0e6b0f 100644 --- a/GANDLF/compute/training_loop.py +++ b/GANDLF/compute/training_loop.py @@ -237,9 +237,9 @@ def training_loop( params["validation_data"] = validation_data params["testing_data"] = testing_data testingDataDefined = True - if params["testing_data"] is None: - # testing_data = validation_data - testingDataDefined = False + if not isinstance(testing_data, pd.DataFrame): + if params["testing_data"] is None: + testingDataDefined = False # Setup a few variables for tracking best_loss = 1e7 diff --git 
a/GANDLF/config_manager.py b/GANDLF/config_manager.py index 4db8ae1c8..49fda1b58 100644 --- a/GANDLF/config_manager.py +++ b/GANDLF/config_manager.py @@ -630,6 +630,12 @@ def _parseConfig( "nested_training" in params ), "The parameter 'nested_training' needs to be defined" # initialize defaults for nested training + params["nested_training"]["stratified"] = params["nested_training"].get( + "stratified", False + ) + params["nested_training"]["stratified"] = params["nested_training"].get( + "proportional", params["nested_training"]["stratified"] + ) params["nested_training"]["testing"] = params["nested_training"].get("testing", -5) params["nested_training"]["validation"] = params["nested_training"].get( "validation", -5 diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py index bd7911895..6d74a33a2 100644 --- a/GANDLF/losses/regression.py +++ b/GANDLF/losses/regression.py @@ -1,7 +1,7 @@ from typing import Optional import torch import torch.nn.functional as F -from torch.nn import MSELoss, CrossEntropyLoss, L1Loss +from torch.nn import CrossEntropyLoss from GANDLF.utils import one_hot diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py index 41c188d74..e605af6f6 100644 --- a/GANDLF/training_manager.py +++ b/GANDLF/training_manager.py @@ -1,10 +1,9 @@ import pandas as pd -import os, sys, pickle, subprocess, shutil -from sklearn.model_selection import KFold +import os, pickle, shutil from pathlib import Path from GANDLF.compute import training_loop -from GANDLF.utils import get_dataframe +from GANDLF.utils import get_dataframe, split_data def TrainingManager( @@ -44,269 +43,95 @@ def TrainingManager( ) parameters = pickle.load(open(currentModelConfigPickle, "rb")) - # check for single fold training - singleFoldValidation = False - singleFoldTesting = False - noTestingData = False - # if the user wants a single fold training - if parameters["nested_training"]["testing"] < 0: - parameters["nested_training"]["testing"] = abs( - parameters["nested_training"]["testing"] - ) - singleFoldTesting = True + dataframe_split = split_data(dataframe, parameters) - # if the user wants a single fold training - if parameters["nested_training"]["validation"] < 0: - parameters["nested_training"]["validation"] = abs( - parameters["nested_training"]["validation"] - ) - singleFoldValidation = True + last_indeces, _, _, _ = dataframe_split[-1] - # this is the condition where testing data is not to be kept - if parameters["nested_training"]["testing"] == 1: - noTestingData = True - singleFoldTesting = True - # put 2 just so that the first for-loop does not fail - parameters["nested_training"]["testing"] = 2 - - # initialize the kfold structures - kf_testing = KFold(n_splits=parameters["nested_training"]["testing"]) - kf_validation = KFold(n_splits=parameters["nested_training"]["validation"]) - - currentTestingFold = 0 - - # split across subjects - subjectIDs_full = ( - dataframe[dataframe.columns[parameters["headers"]["subjectIDHeader"]]] - .unique() - .tolist() - ) - - # get the indeces for kfold splitting - trainingData_full = dataframe - - # start the kFold train for testing - for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full): - # ensure the validation fold is initialized per-testing split - currentValidationFold = 0 - - trainingAndValidationData = pd.DataFrame() # initialize the variable - testingData = pd.DataFrame() # initialize the variable - # get the current training and testing data - if noTestingData: - # don't consider the split indeces for this 
case - trainingAndValidationData = trainingData_full - testingData = None - else: - # loop over all trainAndVal_index and construct new dataframe - for subject_idx in trainAndVal_index: - trainingAndValidationData = trainingAndValidationData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) - - # loop over all testing_index and construct new dataframe - for subject_idx in testing_index: - testingData = testingData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) + # check the last indeces to see if single fold training is requested + singleFoldTesting = True if last_indeces[0] == 0 else False + singleFoldValidation = True if last_indeces[1] == 0 else False + for ( + testing_and_valid_indeces, + trainingData, + validationData, + testingData, + ) in dataframe_split: # the output of the current fold is only needed if multi-fold training is happening - if singleFoldTesting: - currentOutputFolder = outputDir - else: - currentOutputFolder = os.path.join( - outputDir, "testing_" + str(currentTestingFold) + currentTestingOutputFolder = outputDir + if not singleFoldTesting: + currentTestingOutputFolder = os.path.join( + outputDir, "testing_" + str(testing_and_valid_indeces[0]) ) - Path(currentOutputFolder).mkdir(parents=True, exist_ok=True) + Path(currentTestingOutputFolder).mkdir(parents=True, exist_ok=True) - # save the current training+validation and testing datasets - if noTestingData: - print( - "WARNING: Testing data is empty, which will result in scientifically incorrect results; use at your own risk." - ) - current_training_subject_indeces_full = subjectIDs_full - currentTestingDataPickle = "None" - else: - currentTrainingAndValidationDataPickle = os.path.join( - currentOutputFolder, "data_trainAndVal.pkl" + currentValidationOutputFolder = currentTestingOutputFolder + if not singleFoldValidation: + currentValidationOutputFolder = os.path.join( + currentTestingOutputFolder, str(testing_and_valid_indeces[1]) ) - currentTestingDataPickle = os.path.join( - currentOutputFolder, "data_testing.pkl" - ) - - if (not os.path.exists(currentTestingDataPickle)) or reset or resume: - testingData.to_pickle(currentTestingDataPickle) - else: - if os.path.exists(currentTestingDataPickle): - print( - "Using previously saved testing data", - currentTestingDataPickle, - flush=True, - ) - testingData = pd.read_pickle(currentTestingDataPickle) - - if ( - (not os.path.exists(currentTrainingAndValidationDataPickle)) - or reset - or resume - ): - trainingAndValidationData.to_pickle( - currentTrainingAndValidationDataPickle + Path(currentValidationOutputFolder).mkdir(parents=True, exist_ok=True) + + # initialize the dataframes and save them to disk + data_dict = { + "training": trainingData, + "validation": validationData, + "testing": testingData, + } + data_dict_files = {} + for data_type, data in data_dict.items(): + data_dict_files[data_type] = None + if data is not None: + currentDataPickle = os.path.join( + currentValidationOutputFolder, "data_" + data_type + ".pkl" ) - else: - if os.path.exists(currentTrainingAndValidationDataPickle): - print( - "Using previously saved training+validation data", - currentTrainingAndValidationDataPickle, - flush=True, - ) - trainingAndValidationData = pd.read_pickle( - currentTrainingAndValidationDataPickle - ) - - 
current_training_subject_indeces_full = ( - trainingAndValidationData[ - trainingAndValidationData.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - .unique() - .tolist() + data_dict_files[data_type] = currentDataPickle + if (not os.path.exists(currentDataPickle)) or reset or resume: + data.to_pickle(currentDataPickle) + data.to_csv(currentDataPickle.replace(".pkl", ".csv"), index=False) + else: + # read the data from the pickle if present + data_dict[data_type] = get_dataframe(currentDataPickle) + + # parallel_compute_command is an empty string, thus no parallel computing requested + if not parameters["parallel_compute_command"]: + training_loop( + training_data=data_dict["training"], + validation_data=data_dict["validation"], + output_dir=currentValidationOutputFolder, + device=device, + params=parameters, + testing_data=data_dict["testing"], ) - # start the kFold train for validation - for train_index, val_index in kf_validation.split( - current_training_subject_indeces_full - ): - # the output of the current fold is only needed if multi-fold training is happening - if singleFoldValidation: - currentValOutputFolder = currentOutputFolder - else: - currentValOutputFolder = os.path.join( - currentOutputFolder, str(currentValidationFold) - ) - Path(currentValOutputFolder).mkdir(parents=True, exist_ok=True) - - trainingData = pd.DataFrame() # initialize the variable - validationData = pd.DataFrame() # initialize the variable - - # loop over all train_index and construct new dataframe - for subject_idx in train_index: - trainingData = trainingData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) - - # loop over all val_index and construct new dataframe - for subject_idx in val_index: - validationData = validationData._append( - trainingData_full[ - trainingData_full[ - trainingData_full.columns[ - parameters["headers"]["subjectIDHeader"] - ] - ] - == subjectIDs_full[subject_idx] - ] - ) - - # # write parameters to pickle - this should not change for the different folds, so keeping is independent - ## pickle/unpickle data - # pickle the data - currentTrainingDataPickle = os.path.join( - currentValOutputFolder, "data_training.pkl" - ) - currentValidationDataPickle = os.path.join( - currentValOutputFolder, "data_validation.pkl" + else: + # call hpc command here + parallel_compute_command_actual = parameters[ + "parallel_compute_command" + ].replace("${outputDir}", currentValidationOutputFolder) + + assert ( + "python" in parallel_compute_command_actual + ), "The 'parallel_compute_command_actual' needs to have the python from the virtual environment, which is usually '${GANDLF_dir}/venv/bin/python'" + + command = ( + parallel_compute_command_actual + + " -m GANDLF.training_loop -train_loader_pickle " + + data_dict_files["training"] + + " -val_loader_pickle " + + data_dict_files["validation"] + + " -parameter_pickle " + + currentModelConfigPickle + + " -device " + + str(device) + + " -outputDir " + + currentValidationOutputFolder + + " -testing_loader_pickle " + + data_dict_files["testing"] ) - if (not os.path.exists(currentTrainingDataPickle)) or reset or resume: - trainingData.to_pickle(currentTrainingDataPickle) - trainingData.to_csv( - currentTrainingDataPickle.replace(".pkl", ".csv"), index=False - ) - else: - trainingData = get_dataframe(currentTrainingDataPickle) - if (not os.path.exists(currentValidationDataPickle)) or reset or resume: - 
validationData.to_pickle(currentValidationDataPickle) - validationData.to_csv( - currentValidationDataPickle.replace(".pkl", ".csv"), index=False - ) - else: - validationData = get_dataframe(currentValidationDataPickle) - - # parallel_compute_command is an empty string, thus no parallel computing requested - if (not parameters["parallel_compute_command"]) or (singleFoldValidation): - training_loop( - training_data=trainingData, - validation_data=validationData, - output_dir=currentValOutputFolder, - device=device, - params=parameters, - testing_data=testingData, - ) - - else: - # call qsub here - parallel_compute_command_actual = parameters[ - "parallel_compute_command" - ].replace("${outputDir}", currentValOutputFolder) - - if not ("python" in parallel_compute_command_actual): - sys.exit( - "The 'parallel_compute_command_actual' needs to have the python from the virtual environment, which is usually '${GANDLF_dir}/venv/bin/python'" - ) - - command = ( - parallel_compute_command_actual - + " -m GANDLF.training_loop -train_loader_pickle " - + currentTrainingDataPickle - + " -val_loader_pickle " - + currentValidationDataPickle - + " -parameter_pickle " - + currentModelConfigPickle - + " -device " - + str(device) - + " -outputDir " - + currentValOutputFolder - + " -testing_loader_pickle " - + currentTestingDataPickle - ) - - print( - "Submitting job for testing split " - + str(currentTestingFold) - + " and validation split " - + str(currentValidationFold) - ) - subprocess.Popen(command, shell=True).wait() - - if singleFoldValidation: - break - currentValidationFold += 1 # go to next fold - if singleFoldTesting: - break - currentTestingFold += 1 # go to next fold + print("Running command: ", command, flush=True) + os.system(command, flush=True) def TrainingManager_split( diff --git a/GANDLF/utils/__init__.py b/GANDLF/utils/__init__.py index 311edeed8..66d830d3d 100644 --- a/GANDLF/utils/__init__.py +++ b/GANDLF/utils/__init__.py @@ -66,3 +66,5 @@ save_model, optimize_and_save_model, ) + +from .data_splitter import split_data diff --git a/GANDLF/utils/data_splitter.py b/GANDLF/utils/data_splitter.py new file mode 100644 index 000000000..6976d7629 --- /dev/null +++ b/GANDLF/utils/data_splitter.py @@ -0,0 +1,253 @@ +from typing import List, Tuple +import pandas as pd +from sklearn.model_selection import KFold, StratifiedKFold + +from . import parseTrainingCSV, populate_header_in_parameters + + +def split_data( + full_dataset: pd.DataFrame, parameters: dict +) -> List[Tuple[Tuple[int, int], pd.DataFrame, pd.DataFrame, pd.DataFrame]]: + """ + Split the data into training, validation, and testing sets. + + Args: + full_dataset (pd.DataFrame): The full dataset to split. + parameters (dict): The parameters to use for splitting the data, which should contain the "nested_training" key with relevant information. + + Returns: + List[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]: A list of tuples, each containing the a tuple of the testing & validation split indeces, and training, validation, and testing sets. 
+ """ + assert ( + "nested_training" in parameters + ), "`nested_training` key missing in parameters" + # populate the headers + _, parameters["headers"] = ( + parseTrainingCSV(full_dataset) if "headers" not in parameters else full_dataset, + parameters["headers"], + ) + + parameters = ( + populate_header_in_parameters(parameters, parameters["headers"]) + if "problem_type" not in parameters + else parameters + ) + + stratified_splitting = parameters["nested_training"].get("stratified") + + return_data = [] + + # check for single fold training + singleFoldValidation = False + singleFoldTesting = False + # if the user wants a single fold training + testing_folds = parameters["nested_training"]["testing"] + if testing_folds < 0: + testing_folds = abs(testing_folds) + singleFoldTesting = True + + # if the user wants a single fold training + validation_folds = parameters["nested_training"]["validation"] + if validation_folds < 0: + validation_folds = abs(validation_folds) + singleFoldValidation = True + + # this is the condition where testing data is not to be kept + noTestingData = False + if testing_folds == 1: + noTestingData = True + singleFoldTesting = True + # put 2 just so that the first for-loop does not fail + testing_folds = 2 + print( + "WARNING: Testing data is empty, which will result in scientifically incorrect results; use at your own risk." + ) + + # get unique subject IDs + subjectIDs_full = ( + full_dataset[full_dataset.columns[parameters["headers"]["subjectIDHeader"]]] + .unique() + .tolist() + ) + + all_subjects_are_unique = len(subjectIDs_full) == len(full_dataset.index) + + # checks for stratified splitting + if stratified_splitting: + # it can only be done for classification problems + assert ( + parameters["problem_type"] == "classification" + ), "Stratified splitting is only possible for classification problems." + # it can only be done when all subjects are unique + assert ( + all_subjects_are_unique + ), "Stratified splitting is not possible when duplicate subjects IDs are present in the dataset." 
+ + # get the targets for prediction for classification + target_testing = False # initialize this so that the downstream code does not fail - for KFold, this is shuffle + if parameters["problem_type"] == "classification": + target_testing = full_dataset.loc[ + :, full_dataset.columns[parameters["headers"]["predictionHeaders"]] + ] + target_validation = target_testing + + folding_type = KFold + if stratified_splitting: + folding_type = StratifiedKFold + + kf_testing = folding_type(n_splits=testing_folds) + kf_validation = folding_type(n_splits=validation_folds) + + # start StratifiedKFold splitting + currentTestingFold = 0 + if stratified_splitting: + for trainAndVal_index, testing_index in kf_testing.split( + full_dataset, target_testing + ): + # ensure the validation fold is initialized per-testing split + currentValidationFold = 0 + + trainingAndValidationData, testingData = ( + pd.DataFrame(), + pd.DataFrame(), + ) # initialize the variables + # get the current training and testing data + if noTestingData: + # don't consider the split indeces for this case + trainingAndValidationData = full_dataset + # this should be None to ensure downstream code does not fail + testingData = None + else: + trainingAndValidationData = full_dataset.loc[trainAndVal_index, :] + trainingAndValidationData.reset_index(drop=True, inplace=True) + testingData = full_dataset.loc[testing_index, :] + # update the targets after the split + target_validation = trainingAndValidationData.loc[ + :, full_dataset.columns[parameters["headers"]["predictionHeaders"]] + ] + + for train_index, val_index in kf_validation.split( + trainingAndValidationData, target_validation + ): + # get the current training and validation data + trainingData = trainingAndValidationData.loc[train_index, :] + validationData = trainingAndValidationData.loc[val_index, :] + return_data.append( + ( + (currentTestingFold, currentValidationFold), + trainingData, + validationData, + testingData, + ) + ) + currentValidationFold += 1 # increment the validation fold + if singleFoldValidation: + break + + currentTestingFold += 1 # increment the testing fold + if singleFoldTesting: + break + else: + # start the kFold train for testing + for trainAndVal_index, testing_index in kf_testing.split(subjectIDs_full): + # ensure the validation fold is initialized per-testing split + currentValidationFold = 0 + + trainingAndValidationData, testingData = ( + pd.DataFrame(), + pd.DataFrame(), + ) # initialize the variables + # get the current training and testing data + if noTestingData: + # don't consider the split indeces for this case + trainingAndValidationData = full_dataset + # this should be None to ensure downstream code does not fail + testingData = None + else: + # loop over all trainAndVal_index and construct new dataframe + for subject_idx in trainAndVal_index: + trainingAndValidationData = trainingAndValidationData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + # loop over all testing_index and construct new dataframe + for subject_idx in testing_index: + testingData = testingData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + current_training_subject_indeces_full = ( + trainingAndValidationData[ + trainingAndValidationData.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + .unique() + .tolist() + ) + + # start the 
kFold train for validation + for train_index, val_index in kf_validation.split( + current_training_subject_indeces_full + ): + trainingData = pd.DataFrame() # initialize the variable + validationData = pd.DataFrame() # initialize the variable + + # loop over all train_index and construct new dataframe + for subject_idx in train_index: + trainingData = trainingData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + # loop over all val_index and construct new dataframe + for subject_idx in val_index: + validationData = validationData._append( + full_dataset[ + full_dataset[ + full_dataset.columns[ + parameters["headers"]["subjectIDHeader"] + ] + ] + == subjectIDs_full[subject_idx] + ] + ) + + return_data.append( + ( + (currentTestingFold, currentValidationFold), + trainingData, + validationData, + testingData, + ) + ) + + currentValidationFold += 1 # go to next fold + if singleFoldValidation: + break + + currentTestingFold += 1 # go to next fold + if singleFoldTesting: + break + + return return_data diff --git a/GANDLF/utils/generic.py b/GANDLF/utils/generic.py index a7a3a0f37..8604c1cd0 100644 --- a/GANDLF/utils/generic.py +++ b/GANDLF/utils/generic.py @@ -253,7 +253,6 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict Returns: dict: The output metrics dictionary. """ - print(input_metrics_dict) output_metrics_dict = deepcopy(input_metrics_dict) for metric in input_metrics_dict.keys(): if isinstance(input_metrics_dict[metric], list): @@ -265,7 +264,6 @@ def __update_metric_from_list_to_single_string(input_metrics_dict: dict) -> dict .split(",") ) - print(output_metrics_dict) return output_metrics_dict output_metrics_dict = deepcopy(cohort_level_metrics) diff --git a/GANDLF/version.py b/GANDLF/version.py index b0fb39160..d06baf5e4 100644 --- a/GANDLF/version.py +++ b/GANDLF/version.py @@ -2,4 +2,4 @@ # -*- coding: UTF-8 -*- # check GaNDLF wiki for versioning and release guidelines: https://github.com/mlcommons/GaNDLF/wiki -__version__ = "0.0.19-dev" +__version__ = "0.0.20-dev" diff --git a/docs/usage.md b/docs/usage.md index 874dd007e..2dcb79a58 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -106,6 +106,8 @@ SubjectID,Channel_0,Channel_1,...,Channel_X,Label N,/full/path/N/0.nii.gz,/full/path/N/1.nii.gz,...,/full/path/N/X.nii.gz,/full/path/N/segmentation.nii.gz ``` +**Notes:** + - `Channel` can be substituted with `Modality` or `Image` - `Label` can be substituted with `Mask` or `Segmentation`and is used to specify the annotation file for segmentation models - For classification/regression, add a column called `ValueToPredict`. Currently, we are supporting only a single value prediction per model. @@ -162,6 +164,19 @@ The following command shows how the script works: - `SubjectID` or `PatientName` is used to ensure that the randomized split is done per-subject rather than per-image. - For data arrangement different to what is described above, a customized script will need to be written to generate the CSV, or you can enter the data manually into the CSV. +### Using the `gandlf_splitCSV` application + +To split the data CSV into training, validation, and testing CSVs, the `gandlf_splitCSV` script can be used. 
The following command shows how the script works:
+
+```bash
+# continue from previous shell
+(venv_gandlf) $> python gandlf_splitCSV \
+  # -h, --help         Show help message and exit
+  -i ./experiment_0/train_data.csv \ # output CSV from the `gandlf_constructCSV` script
+  -c $gandlf_config \ # the GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed
+  -o $output_dir # the output directory to save the split data
+```
+
 
 ## Customize the Training
diff --git a/gandlf_splitCSV b/gandlf_splitCSV
new file mode 100644
index 000000000..339fec2ac
--- /dev/null
+++ b/gandlf_splitCSV
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os, argparse, sys, yaml
+from GANDLF.cli import copyrightMessage, split_data_and_save_csvs
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="GANDLF_SplitCSV",
+        formatter_class=argparse.RawTextHelpFormatter,
+        description="Split the data into training, validation, and testing sets and save them as csvs in the output directory.\n\n"
+        + copyrightMessage,
+    )
+    parser.add_argument(
+        "-i",
+        "--inputCSV",
+        metavar="",
+        default=None,
+        type=str,
+        required=True,
+        help="Input CSV file which contains the data to be split.",
+    )
+    parser.add_argument(
+        "-c",
+        "--config",
+        metavar="",
+        default=None,
+        required=True,
+        type=str,
+        help="The GaNDLF config (in YAML) with the `nested_training` key specified to the folds needed.",
+    )
+    parser.add_argument(
+        "-o",
+        "--outputDir",
+        metavar="",
+        default=None,
+        type=str,
+        required=True,
+        help="Output directory to save the split data.",
+    )
+
+    args = parser.parse_args()
+
+    # check for required parameters - this is needed here to keep the cli clean
+    for param_none_check in [args.inputCSV, args.outputDir, args.config]:
+        if param_none_check is None:
+            sys.exit("ERROR: Missing required parameter: " + str(param_none_check))
+
+    inputCSV = os.path.normpath(args.inputCSV)
+    outputDir = os.path.normpath(args.outputDir)
+    # initialize default
+    config = {"nested_training": {"testing": 5, "validation": 5}}
+    if os.path.isfile(args.config):
+        config = yaml.safe_load(open(args.config, "r"))
+
+    print("Config used for split:", config)
+
+    split_data_and_save_csvs(inputCSV, outputDir, config)
+
+    print("Finished successfully.")
+
+
+# main function
+if __name__ == "__main__":
+    main()
diff --git a/mlcube/model_mlcube/workspace/config.yml b/mlcube/model_mlcube/workspace/config.yml
index 312f06d93..9b5138fed 100644
--- a/mlcube/model_mlcube/workspace/config.yml
+++ b/mlcube/model_mlcube/workspace/config.yml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 verbose: True
 # Choose the model parameters here
diff --git a/samples/config_all_options.yaml b/samples/config_all_options.yaml
index 7eb85cbd4..3c117aa68 100644
--- a/samples/config_all_options.yaml
+++ b/samples/config_all_options.yaml
@@ -1,8 +1,8 @@
 # affix version
 version:
   {
-    minimum: 0.0.19,
-    maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created
+    minimum: 0.0.20,
+    maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created
   }
 ## Choose the model parameters here
 model:
@@ -138,6 +138,7 @@ optimizer: adam
 # for train on a single fold, use '-' before the fold number to make the number of folds
"negative" -- NOT recommended nested_training: { + stratified: False, # this will perform stratified k-fold cross-validation but only with offline data splitting, see https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html testing: 5, # this controls the number of testing data folds for final model evaluation; [NOT recommended] to disable this, use '1' validation: 5 # this controls the number of validation data folds to be used for model *selection* during training (not used for back-propagation) } diff --git a/samples/config_classification.yaml b/samples/config_classification.yaml index ec8578e82..9795ffca8 100644 --- a/samples/config_classification.yaml +++ b/samples/config_classification.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_getting_started_classification_histo2d.yaml b/samples/config_getting_started_classification_histo2d.yaml index b170d2ef2..e9b4e6208 100644 --- a/samples/config_getting_started_classification_histo2d.yaml +++ b/samples/config_getting_started_classification_histo2d.yaml @@ -94,6 +94,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_classification_rad3d.yaml b/samples/config_getting_started_classification_rad3d.yaml index e0dad1afc..3d5466212 100644 --- a/samples/config_getting_started_classification_rad3d.yaml +++ b/samples/config_getting_started_classification_rad3d.yaml @@ -99,6 +99,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_histo2d.yaml b/samples/config_getting_started_regression_histo2d.yaml index 1e7621fbe..9118263ed 100644 --- a/samples/config_getting_started_regression_histo2d.yaml +++ b/samples/config_getting_started_regression_histo2d.yaml @@ -59,6 +59,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_rad3d.yaml b/samples/config_getting_started_regression_rad3d.yaml index e5f3f03ac..4a98b1a4f 100644 --- a/samples/config_getting_started_regression_rad3d.yaml +++ b/samples/config_getting_started_regression_rad3d.yaml @@ -62,6 +62,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: false diff --git a/samples/config_getting_started_segmentation_histo2d.yaml b/samples/config_getting_started_segmentation_histo2d.yaml index 93cd74531..97deb0e34 100644 --- a/samples/config_getting_started_segmentation_histo2d.yaml +++ b/samples/config_getting_started_segmentation_histo2d.yaml @@ -66,6 +66,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_segmentation_rad3d.yaml b/samples/config_getting_started_segmentation_rad3d.yaml index 986f97fd5..c05256426 100644 --- a/samples/config_getting_started_segmentation_rad3d.yaml +++ b/samples/config_getting_started_segmentation_rad3d.yaml @@ -89,6 
+89,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.0.19 - minimum: 0.0.19 + maximum: 0.0.20 + minimum: 0.0.20 weighted_loss: true diff --git a/samples/config_regression.yaml b/samples/config_regression.yaml index af0df0d4f..ce7b2c806 100644 --- a/samples/config_regression.yaml +++ b/samples/config_regression.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_brats.yaml b/samples/config_segmentation_brats.yaml index 44a2aa9fd..e90d5a92c 100644 --- a/samples/config_segmentation_brats.yaml +++ b/samples/config_segmentation_brats.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_histology.yaml b/samples/config_segmentation_histology.yaml index bee1daf4c..6551b50c9 100644 --- a/samples/config_segmentation_histology.yaml +++ b/samples/config_segmentation_histology.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.0.19, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.0.20, + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/setup.py b/setup.py index b582515c1..5e7d74c4b 100644 --- a/setup.py +++ b/setup.py @@ -108,7 +108,7 @@ def run(self): "segmentation-models-pytorch==0.3.3", "ACSConv==0.1.1", "docker", - "dicom-anonymizer", + "dicom-anonymizer==1.0.12", "twine", "zarr", "keyring", diff --git a/testing/config_classification.yaml b/testing/config_classification.yaml index 6e3f6e517..0482a7371 100644 --- a/testing/config_classification.yaml +++ b/testing/config_classification.yaml @@ -55,7 +55,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: True diff --git a/testing/config_regression.yaml b/testing/config_regression.yaml index 91ee2a015..106caa969 100644 --- a/testing/config_regression.yaml +++ b/testing/config_regression.yaml @@ -38,7 +38,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.0.19 + maximum: 0.0.20 minimum: 0.0.14 weighted_loss: false diff --git a/testing/config_segmentation.yaml b/testing/config_segmentation.yaml index 2bf83eab9..3006e1eb2 100644 --- a/testing/config_segmentation.yaml +++ b/testing/config_segmentation.yaml @@ -3,7 +3,7 @@ version: { minimum: 0.0.14, - maximum: 0.0.19 + maximum: 0.0.20 } model: { diff --git a/testing/test_full.py b/testing/test_full.py index 2c8725c91..8417a0352 100644 --- a/testing/test_full.py +++ b/testing/test_full.py @@ -32,6 +32,7 @@ recover_config, post_training_model_optimization, generate_metrics_dict, + split_data_and_save_csvs, ) from GANDLF.schedulers import global_schedulers_dict from GANDLF.optimizers import global_optimizer_dict @@ -480,7 +481,7 @@ def test_train_regression_brainage_rad_2d(device): parameters["model"]["architecture"] = "brain_age" 
parameters["model"]["onnx_export"] = False parameters["model"]["print_summary"] = False - parameters_temp = copy.deepcopy(parameters) + # parameters_temp = copy.deepcopy(parameters) parameters = populate_header_in_parameters(parameters, parameters["headers"]) sanitize_outputDir() TrainingManager( @@ -752,7 +753,7 @@ def test_train_inference_optimize_classification_rad_3d(device): parameters["model"]["architecture"] = all_models_regression[0] parameters["model"]["onnx_export"] = False parameters["model"]["print_summary"] = False - parameters_temp = copy.deepcopy(parameters) + # parameters_temp = copy.deepcopy(parameters) sanitize_outputDir() TrainingManager( dataframe=training_data, @@ -842,6 +843,13 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): parameters["patch_size"] = patch_size["3D"] parameters["model"]["dimension"] = 3 parameters["model"]["final_layer"] = "logits" + # loop through selected models and train for single epoch + model = all_models_regression[0] + parameters["model"]["architecture"] = model + parameters["model"]["onnx_export"] = False + parameters["model"]["print_summary"] = False + ## add stratified splitting + parameters["nested_training"]["stratified"] = True # read and parse csv training_data, parameters["headers"] = parseTrainingCSV( @@ -849,20 +857,30 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): ) parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"]) parameters = populate_header_in_parameters(parameters, parameters["headers"]) - # loop through selected models and train for single epoch - model = all_models_regression[0] - parameters["model"]["architecture"] = model - parameters["model"]["onnx_export"] = False - parameters["model"]["print_summary"] = False - sanitize_outputDir() - TrainingManager( - dataframe=training_data, - outputDir=outputDir, - parameters=parameters, - device=device, - resume=False, - reset=True, - ) + # duplicate the data to test stratified sampling + training_data_duplicate = training_data._append(training_data) + for _ in range(1): + training_data_duplicate = training_data_duplicate._append( + training_data_duplicate + ) + training_data_duplicate.reset_index(drop=True, inplace=True) + # ensure subjects are not duplicated + training_data_duplicate["SubjectID"] = training_data_duplicate.index + + # ensure every part of the code is tested + for folds in [2, 1, -5]: + ## add stratified folding information + parameters["nested_training"]["testing"] = folds + parameters["nested_training"]["validation"] = folds if folds != 1 else -5 + sanitize_outputDir() + TrainingManager( + dataframe=training_data_duplicate, + outputDir=outputDir, + parameters=parameters, + device=device, + resume=False, + reset=True, + ) ## this is to test if inference can run without having ground truth column training_data.drop("ValueToPredict", axis=1, inplace=True) training_data.drop("Label", axis=1, inplace=True) @@ -874,7 +892,6 @@ def test_train_inference_classification_with_logits_single_fold_rad_3d(device): ) training_data, parameters["headers"] = parseTrainingCSV(temp_infer_csv) parameters["output_dir"] = outputDir # this is in inference mode - parameters["output_dir"] = outputDir # this is in inference mode parameters["modality"] = "rad" parameters["patch_size"] = patch_size["3D"] parameters["model"]["dimension"] = 3 @@ -3098,6 +3115,37 @@ def test_generic_deploy_metrics_docker(): print("passed") + +def test_generic_data_split(): + print("51: Starting test for 
splitting and saving CSVs") + # read and initialize parameters for specific data dimension + parameters = ConfigManager( + testingDir + "/config_classification.yaml", version_check_flag=False + ) + parameters["nested_training"] = {"testing": 5, "validation": 5, "stratified": True} + # read and parse csv + training_data, parameters["headers"] = parseTrainingCSV( + inputDir + "/train_3d_rad_classification.csv" + ) + parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"]) + parameters = populate_header_in_parameters(parameters, parameters["headers"]) + # duplicate the data to test stratified sampling + training_data_duplicate = training_data._append(training_data) + for _ in range(1): + training_data_duplicate = training_data_duplicate._append( + training_data_duplicate + ) + training_data_duplicate.reset_index(drop=True, inplace=True) + # ensure subjects are not duplicated + training_data_duplicate["SubjectID"] = training_data_duplicate.index + + sanitize_outputDir() + + split_data_and_save_csvs(training_data_duplicate, outputDir, parameters) + + files_in_outputDir = os.listdir(outputDir) + assert len(files_in_outputDir) == 15, "CSVs were not split correctly" + sanitize_outputDir() print("passed") diff --git a/tutorials/classification_medmnist_notebook/config.yaml b/tutorials/classification_medmnist_notebook/config.yaml index 66ce42637..309860336 100644 --- a/tutorials/classification_medmnist_notebook/config.yaml +++ b/tutorials/classification_medmnist_notebook/config.yaml @@ -2,7 +2,7 @@ version: { minimum: 0.0.14, - maximum: 0.0.19 # this should NOT be made a variable, but should be tested after every tag is created + maximum: 0.0.20 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here
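
Usage sketch (not part of the patch): the diff introduces the `split_data` utility and the `split_data_and_save_csvs` helper, but the only end-to-end example lives in the test suite. The sketch below shows how the new API is expected to be driven, mirroring `test_generic_data_split` in `testing/test_full.py`; the config, CSV, and output paths are hypothetical placeholders, and the output directory is assumed to already exist. Setting `nested_training.stratified: True` additionally requires a classification problem and unique subject IDs, per the asserts in `GANDLF/utils/data_splitter.py`.

```python
# Hypothetical paths; the call sequence mirrors test_generic_data_split in testing/test_full.py.
from GANDLF.config_manager import ConfigManager
from GANDLF.utils import parseTrainingCSV, populate_header_in_parameters, split_data
from GANDLF.cli import split_data_and_save_csvs

config_file = "./experiment_0/config_classification.yaml"  # full GaNDLF config
data_csv = "./experiment_0/train_data.csv"  # GaNDLF-format data CSV
output_dir = "./experiment_0/splits"  # assumed to already exist

parameters = ConfigManager(config_file, version_check_flag=False)
# 5 testing / 5 validation folds; a negative value requests a single fold,
# and testing: 1 disables the testing split (testing dataframes become None)
parameters["nested_training"] = {"testing": 5, "validation": 5, "stratified": False}

# populate headers and problem type the same way the training path does
full_data, parameters["headers"] = parseTrainingCSV(data_csv)
parameters["model"]["num_channels"] = len(parameters["headers"]["channelHeaders"])
parameters = populate_header_in_parameters(parameters, parameters["headers"])

# in-memory folds: each element is
# ((testing_fold, validation_fold), training_df, validation_df, testing_df)
for (test_fold, val_fold), train_df, val_df, test_df in split_data(full_data, parameters):
    n_test = 0 if test_df is None else len(test_df)
    print(f"fold ({test_fold}, {val_fold}): {len(train_df)} train, {len(val_df)} val, {n_test} test rows")

# or persist the folds as training_*/validation_*/testing_*.csv files
split_data_and_save_csvs(full_data, output_dir, parameters)
```

The `gandlf_splitCSV` entry point added in this diff is a thin CLI wrapper around the same `split_data_and_save_csvs` call, with the config read from YAML instead of being built in memory.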