From 3b990d3f750d7c79111cdc7cdf3d0ac60ddf1cda Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Tue, 20 Feb 2024 15:46:18 +0000
Subject: [PATCH 01/35] Updating README

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index e1e06bb..c9c7949 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,9 @@
+# Overview
+The repository contains the code to train a neural network to emulate the gravity wave drag (GWD) in the WACCM simulation.
+The code trains a PyTorch feed-forward (FF) network.
+
+
+
 # newCAM-Emulation
 
 This is a DNN written with PyTorch to Emulate the gravity wave drag (GWD, both zonal and meridional) in the CAM model.

From 39139bb454fde280fe6e798c73267bc072c68c79 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Thu, 22 Feb 2024 13:13:59 +0000
Subject: [PATCH 02/35] basic linting

---
 newCAM_emulation/Model.py |  1 +
 newCAM_emulation/train.py | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index ccdd690..91cf5be 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -10,6 +10,7 @@
 from torch.utils.data import DataLoader, Dataset
 
 
+
 # Required for feeding the data into NN.
 class myDataset(Dataset):
     """
diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 198097e..f11f53c 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -19,6 +19,17 @@
 print(f"Using device: {DEVICE}")
 
 
+from torch.cuda import is_available
+from torch.backends import mps
+if is_available():
+    DEVICE = "cuda"
+elif mps.is_available():
+    DEVICE = "mps"
+else:
+    DEVICE = "cpu"
+print(f"Using device: {DEVICE}")
+
+
 class EarlyStopper:
     """Class for implementing early stopping during training."""
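
Note: the hunk above introduces a second copy of device-selection logic that train.py already performs a few lines earlier; the duplicate is cleaned up again in PATCH 05. For reference, the selection both copies implement is, as a sketch (imports as in the hunk):

    from torch.backends import mps
    from torch.cuda import is_available

    # prefer CUDA, then Apple Silicon MPS, and fall back to CPU
    if is_available():
        DEVICE = "cuda"
    elif mps.is_available():
        DEVICE = "mps"
    else:
        DEVICE = "cpu"
    print(f"Using device: {DEVICE}")
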
From 3cebc20b3386ac7e9fbcebb42850e78d3a0b9c31 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Thu, 22 Feb 2024 13:54:20 +0000
Subject: [PATCH 03/35] basic linting

---
 newCAM_emulation/train.py | 142 +++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 71 deletions(-)

diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index f11f53c..3d4b4ac 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -3,6 +3,7 @@
 import Model
 import netCDF4 as nc
 import numpy as np
+
 import torch
 from loaddata import data_loader, newnorm
 from torch import nn
@@ -10,6 +11,7 @@
 from torch.cuda import is_available
 from torch.utils.data import DataLoader
+
 if is_available():
     DEVICE = "cuda"
 elif mps.is_available():
@@ -21,6 +23,7 @@
 from torch.cuda import is_available
 from torch.backends import mps
+
 if is_available():
     DEVICE = "cuda"
 elif mps.is_available():
@@ -55,8 +58,8 @@ def early_stop(self, validation_loss):
             self.min_validation_loss = validation_loss
             self.counter = 0
 
-            #save model
-            torch.save(model.state_dict(), 'conv_torch.pth')
+            # save model
+            torch.save(model.state_dict(), "conv_torch.pth")
 
         elif validation_loss > (self.min_validation_loss + self.min_delta):
             self.counter += 1
@@ -65,44 +68,42 @@ def early_stop(self, validation_loss):
     return False
 
 
-
 ## load mean and std for normalization
-fm = np.load('../Demodata/mean_demo_sub.npz')
-fs = np.load('../Demodata/std_demo_sub.npz')
-
-Um = fm['U']
-Vm = fm['V']
-Tm = fm['T']
-DSEm = fm['DSE']
-NMm = fm['NM']
-NETDTm = fm['NETDT']
-Z3m = fm['Z3']
-RHOIm = fm['RHOI']
-PSm = fm['PS']
-latm = fm['lat']
-lonm = fm['lon']
-UTGWSPECm = fm['UTGWSPEC']
-VTGWSPECm = fm['VTGWSPEC']
-
-Us = fs['U']
-Vs = fs['V']
-Ts = fs['T']
-DSEs = fs['DSE']
-NMs = fs['NM']
-NETDTs = fs['NETDT']
-Z3s = fs['Z3']
-RHOIs = fs['RHOI']
-PSs = fs['PS']
-lats = fs['lat']
-lons = fs['lon']
-UTGWSPECs = fs['UTGWSPEC']
-VTGWSPECs = fs['VTGWSPEC']
+fm = np.load("../Demodata/mean_demo_sub.npz")
+fs = np.load("../Demodata/std_demo_sub.npz")
+
+Um = fm["U"]
+Vm = fm["V"]
+Tm = fm["T"]
+DSEm = fm["DSE"]
+NMm = fm["NM"]
+NETDTm = fm["NETDT"]
+Z3m = fm["Z3"]
+RHOIm = fm["RHOI"]
+PSm = fm["PS"]
+latm = fm["lat"]
+lonm = fm["lon"]
+UTGWSPECm = fm["UTGWSPEC"]
+VTGWSPECm = fm["VTGWSPEC"]
+
+Us = fs["U"]
+Vs = fs["V"]
+Ts = fs["T"]
+DSEs = fs["DSE"]
+NMs = fs["NM"]
+NETDTs = fs["NETDT"]
+Z3s = fs["Z3"]
+RHOIs = fs["RHOI"]
+PSs = fs["PS"]
+lats = fs["lat"]
+lons = fs["lon"]
+UTGWSPECs = fs["UTGWSPEC"]
+VTGWSPECs = fs["VTGWSPEC"]
 
 ilev = 93
 
-dim_NN =int(8*ilev+4)
-dim_NNout =int(2*ilev)
+dim_NN = int(8 * ilev + 4)
+dim_NNout = int(2 * ilev)
 
 model = Model.FullyConnected()
 
@@ -111,81 +112,81 @@
 learning_rate = 1e-5
 epochs = 100
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # weight_decay=1e-5
+optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # weight_decay=1e-5
 
 
 s_list = list(range(1, 6))
 
 for iter in s_list:
-    if (iter > 1):
-        model.load_state_dict(torch.load('conv_torch.pth'))
-    print ('data loader iteration',iter)
-    filename = '../Demodata/newCAM_demo_sub_' + str(iter).zfill(1) + '.nc'
-    print('working on: ', filename)
+    if iter > 1:
+        model.load_state_dict(torch.load("conv_torch.pth"))
+    print("data loader iteration", iter)
+    filename = "../Demodata/newCAM_demo_sub_" + str(iter).zfill(1) + ".nc"
+    print("working on: ", filename)
 
     F = nc.Dataset(filename)
 
    PS = np.asarray(F["PS"][0, :])
    PS = newnorm(PS, PSm, PSs)

    Z3 = np.asarray(F["Z3"][0, :, :])
    Z3 = newnorm(Z3, Z3m, Z3s)

    U = np.asarray(F["U"][0, :, :])
    U = newnorm(U, Um, Us)

    V = np.asarray(F["V"][0, :, :])
    V = newnorm(V, Vm, Vs)

    T = np.asarray(F["T"][0, :, :])
    T = newnorm(T, Tm, Ts)

    lat = F["lat"]
    lat = newnorm(lat, np.mean(lat), np.std(lat))

    lon = F["lon"]
    lon = newnorm(lon, np.mean(lon), np.std(lon))

    DSE = np.asarray(F["DSE"][0, :, :])
    DSE = newnorm(DSE, DSEm, DSEs)

    RHOI = np.asarray(F["RHOI"][0, :, :])
    RHOI = newnorm(RHOI, RHOIm, RHOIs)

    NETDT = np.asarray(F["NETDT"][0, :, :])
    NETDT = newnorm(NETDT, NETDTm, NETDTs)

    NM = np.asarray(F["NMBV"][0, :, :])
    NM = newnorm(NM, NMm, NMs)

    UTGWSPEC = np.asarray(F["UTGWSPEC"][0, :, :])
    UTGWSPEC = newnorm(UTGWSPEC, UTGWSPECm, UTGWSPECs)

    VTGWSPEC = np.asarray(F["VTGWSPEC"][0, :, :])
    VTGWSPEC = newnorm(VTGWSPEC, VTGWSPECm, VTGWSPECs)

    x_train, y_train = data_loader(
        U, V, T, DSE, NM, NETDT, Z3, RHOI, PS, lat, lon, UTGWSPEC, VTGWSPEC
    )

    data = Model.myDataset(X=x_train, Y=y_train)

    batch_size = 128

    split_data = torch.utils.data.random_split(
        data, [0.75, 0.25], generator=torch.Generator().manual_seed(42)
    )
    train_dataloader = DataLoader(split_data[0], batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(
        split_data[1], batch_size=len(split_data[1]), shuffle=True
    )

    # training
    early_stopper = EarlyStopper(patience=5, min_delta=0)  # Note the hyper parameters.
    for t in range(epochs):
        if t % 2 == 0:
            print(f"Epoch {t+1}\n-------------------------------")
            print(val_losses[-1])
            print("counter=" + str(early_stopper.counter))
        train_loss = Model.train_loop(train_dataloader, model, nn.MSELoss(), optimizer)

        train_losses.append(train_loss)

        val_loss = Model.val_loop(val_dataloader, model, nn.MSELoss())
        val_losses.append(val_loss)

        if early_stopper.early_stop(val_loss):
            print("BREAK!")
            break
-
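
For reference, the `newnorm` helper imported above standardises each field with the precomputed statistics; a condensed sketch consistent with its definition in loaddata.py (which PATCH 10 later removes):

    import numpy as np

    def newnorm(var, varm, varstd):
        # elementwise (var - mean) / std, with zero standard deviations
        # replaced by 1.0 so constant fields survive the division
        std = np.where(varstd == 0.0, 1.0, varstd)
        return (var - varm) / std
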
From cd22ce8ddb6575bf1c9c17a610f5afc652225994 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 26 Feb 2024 15:57:17 +0000
Subject: [PATCH 04/35] Added ruff linting tool

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 93d8d9e..ae7c3d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,8 @@ extend-include = ["*.ipynb"]
 [tool.ruff.lint]
 # Enable: D: `pydocstyle`, PL: `pylint`, I: `isort`, W: `pycodestyle whitespace`
 # NPY: `numpy`,
-select = ["D", "PL", "I", "E", "W", "NPY" ]
+
+select = ["D", "PL", "I", "E", "W", "NPY"]
 
 # Enable D417 (Missing argument description) on top of the NumPy convention.
 extend-select = ["D417"]

From 2780cd460b5eb167f5df26f53aca9abff846f3b5 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 4 Mar 2024 18:17:54 +0000
Subject: [PATCH 05/35] Updating train.py

---
 newCAM_emulation/train.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 3d4b4ac..4ef75a8 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -10,17 +10,6 @@
 from torch.backends import mps
 from torch.cuda import is_available
 from torch.utils.data import DataLoader
-
-
-if is_available():
-    DEVICE = "cuda"
-elif mps.is_available():
-    DEVICE = "mps"
-else:
-    DEVICE = "cpu"
-print(f"Using device: {DEVICE}")
-
-
 from torch.cuda import is_available
 from torch.backends import mps

From 4292ab9f43cb29f8d4f0a72f282650bb60f8b2e9 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 4 Mar 2024 18:22:06 +0000
Subject: [PATCH 06/35] Updating train.py

---
 newCAM_emulation/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 4ef75a8..3f44029 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -12,7 +12,6 @@
 from torch.utils.data import DataLoader
 from torch.cuda import is_available
 from torch.backends import mps
-
 if is_available():
     DEVICE = "cuda"
 elif mps.is_available():
@@ -57,6 +56,8 @@ def early_stop(self, validation_loss):
     return False
 
 
+
+
 ## load mean and std for normalization
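
The mean_demo_sub.npz / std_demo_sub.npz files referenced in the hunk above are plain NumPy archives; they can be inspected with a snippet like the following (the printed key list is illustrative, inferred from the fields used in train.py):

    import numpy as np

    fm = np.load("../Demodata/mean_demo_sub.npz")
    print(fm.files)   # e.g. ['U', 'V', 'T', 'DSE', 'NM', 'NETDT', 'Z3', ...]
    Um = fm["U"]      # per-level mean of the zonal wind field
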
From d1cf88e3ff3aea942dc831b48331e252a105d05a Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Fri, 8 Mar 2024 10:35:58 +0000
Subject: [PATCH 07/35] Reverted the changes introduced by 9dde06f

---
 README.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/README.md b/README.md
index c9c7949..e1e06bb 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,3 @@
-# Overview
-The repository contains the code to train a neural network to emulate the gravity wave drag (GWD) in the WACCM simulation.
-The code trains a PyTorch feed-forward (FF) network.
-
-
-
 # newCAM-Emulation
 
 This is a DNN written with PyTorch to Emulate the gravity wave drag (GWD, both zonal and meridional) in the CAM model.

From a289544ed15033fc6b01c70d11d38a3cd6ebc0ce Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Sun, 17 Mar 2024 21:47:56 +0000
Subject: [PATCH 08/35] Adding normalisation in the model definition

---
 newCAM_emulation/Model.py | 127 +++++++++-----------------------------
 1 file changed, 30 insertions(+), 97 deletions(-)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index 91cf5be..7077939 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -10,7 +10,6 @@
 from torch.utils.data import DataLoader, Dataset
 
 
-
 # Required for feeding the data into NN.
 class myDataset(Dataset):
     """
@@ -39,6 +38,16 @@ def __getitem__(self, idx):
 
 
 # The NN model.
+class NormalizationLayer(nn.Module):
+    def __init__(self, mean, std):
+        super(NormalizationLayer, self).__init__()
+        self.mean = mean
+        self.std = std
+
+    def forward(self, x):
+        return (x - self.mean) / self.std
+
+
 class FullyConnected(nn.Module):
     """
     Fully connected neural network model.
@@ -50,110 +59,34 @@ class FullyConnected(nn.Module):
     linear_stack (torch.nn.Sequential): Sequential container for layers.
     """
 
-    def __init__(self):
+    def __init__(self, ilev, mean, std):
         """Create an instance of FullyConnected NN model."""
         super(FullyConnected, self).__init__()
-        ilev = 93
-
-        self.linear_stack = nn.Sequential(
-            nn.Linear(8 * ilev + 4, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 2 * ilev, dtype=torch.float64),
-        )
-
-    def forward(self, X):
+        self.normalization = NormalizationLayer(mean, std)
+        self.ilev = ilev
+
+        layers = []
+        layers.append(nn.Linear(8 * ilev + 4, 500))
+        layers.append(nn.SiLU())
+
+        num_layers = 10  # Example: Change this to the desired number of hidden layers
+        for _ in range(num_layers):
+            layers.append(nn.Linear(500, 500))
+            layers.append(nn.SiLU())
+
+        layers.append(nn.Linear(500, 2 * ilev))
+        self.linear_stack = nn.Sequential(*layers)
+
+    def forward(self, x):
         """
         Forward pass through the network.
 
         Args:
-            X (torch.Tensor): Input tensor.
+            x (torch.Tensor): Input tensor.
 
         Returns
         -------
         torch.Tensor: Output tensor.
         """
-        return self.linear_stack(X)
-
-
-# training loop
-def train_loop(dataloader, model, loss_fn, optimizer):
-    """
-    Training loop.
-
-    Args:
-        dataloader (DataLoader): DataLoader for training data.
-        model (nn.Module): Neural network model.
-        loss_fn (torch.nn.Module): Loss function.
-        optimizer (torch.optim.Optimizer): Optimizer.
-
-    Returns
-    -------
-    float: Average training loss.
-    """
-    size = len(dataloader.dataset)
-    avg_loss = 0
-    for batch, (X, Y) in enumerate(dataloader):
-        # Compute prediction and loss
-        pred = model(X)
-        loss = loss_fn(pred, Y)
-
-        # Backpropagation
-        optimizer.zero_grad(set_to_none=True)
-        loss.backward()
-        optimizer.step()
-
-        with torch.no_grad():
-            avg_loss += loss.item()
-
-    avg_loss /= len(dataloader)
-
-    return avg_loss
-
-
-# validating loop
-def val_loop(dataloader, model, loss_fn):
-    """
-    Validation loop.
-
-    Args:
-        dataloader (DataLoader): DataLoader for validation data.
-        model (nn.Module): Neural network model.
-        loss_fn (torch.nn.Module): Loss function.
-
-    Returns
-    -------
-    float: Average validation loss.
-    """
-    avg_loss = 0
-    with torch.no_grad():
-        for batch, (X, Y) in enumerate(dataloader):
-            # Compute prediction and loss
-            pred = model(X)
-            loss = loss_fn(pred, Y)
-            avg_loss += loss.item()
-
-    avg_loss /= len(dataloader)
-
-    return avg_loss
+        x = self.normalization(x)
+        return self.linear_stack(x)
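
With the normalisation layer introduced in PATCH 08, constructing the model now requires the level count and the statistics tensors. A minimal shape sketch (the dummy zero/one statistics are placeholders, not real climatology):

    import torch
    from Model import FullyConnected

    ilev = 93
    mean = torch.zeros(8 * ilev + 4)   # placeholder per-feature means
    std = torch.ones(8 * ilev + 4)     # placeholder per-feature stds
    model = FullyConnected(ilev, mean, std)

    x = torch.randn(16, 8 * ilev + 4)  # a batch of 16 columns
    y = model(x)                       # -> shape (16, 2 * ilev) = (16, 186)
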
From a080959c730fbe59a720f5f13521b0c3b55677cd Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Sun, 17 Mar 2024 22:59:57 +0000
Subject: [PATCH 09/35] created convection subfolder in demodata and moving
 reading of data files to loaddata from train.py

---
 newCAM_emulation/loaddata.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py
index 859bf26..aaa4d0b 100644
--- a/newCAM_emulation/loaddata.py
+++ b/newCAM_emulation/loaddata.py
@@ -77,3 +77,14 @@ def data_loader (U,V,T, DSE, NM, NETDT, Z3, RHOI, PS, lat, lon, UTGWSPEC, VTGWSP
 
     return x_train,y_train
 
+
+"""Read the data and the corresponding mean and std deviation"""
+"""Iterating through the data files"""
+s_list = list(range(1, 6))
+
+for iter in s_list:
+    filename = "Demodata/Convection/newCAM_demo_sub_" + str(iter).zfill(1) + ".nc"  # data file
+    print('working on: ', filename)
+    fm = np.load('Demodata/mean_demo_sub.npz')  # mean file
+    fs = np.load('Demodata/std_demo_sub.npz')  # std deviation file
+
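
One thing to note about the lines added here: they sit at module scope in loaddata.py, so they run at import time. A guarded variant (a suggestion, not part of the patch) would be:

    # only iterate over the demo files when loaddata.py is run directly,
    # not on `import loaddata`
    if __name__ == "__main__":
        for i in range(1, 6):
            filename = f"Demodata/Convection/newCAM_demo_sub_{i}.nc"
            print("working on: ", filename)
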
From 2c1de8bf0f2cd30bf92d273090a02ceb1d278669 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Sun, 17 Mar 2024 23:14:31 +0000
Subject: [PATCH 10/35] changed data_loader function

---
 newCAM_emulation/loaddata.py | 114 ++++++++++------------------------
 1 file changed, 33 insertions(+), 81 deletions(-)

diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py
index aaa4d0b..f2b8cae 100644
--- a/newCAM_emulation/loaddata.py
+++ b/newCAM_emulation/loaddata.py
@@ -3,88 +3,40 @@
 """Implementing data loader for training neural network."""
 
 import numpy as np
 
 ilev = 93
-dim_NN =int(8*ilev+4)
-dim_NNout =int(2*ilev)
+dim_NN = int(8 * ilev + 4)
+dim_NNout = int(2 * ilev)
 
-def newnorm(var, varm, varstd):
-    """Normalizes the input variable(s) using mean and standard deviation.
-
-    Args:
-        var (numpy.ndarray): Input variable(s) to be normalized.
-        varm (numpy.ndarray): Mean of the variable(s).
-        varstd (numpy.ndarray): Standard deviation of the variable(s).
-
-    Returns
-    -------
-    numpy.ndarray: Normalized variable(s).
-    """
-    dim=varm.size
-    if dim > 1 :
-        vara = var - varm[:, :]
-        varstdmax = varstd
-        varstdmax[varstd==0.0] = 1.0
-        tmp = vara / varstdmax[:, :]
-    else:
-        tmp = ( var - varm ) / varstd
-    return tmp
-
-
-def data_loader (U,V,T, DSE, NM, NETDT, Z3, RHOI, PS, lat, lon, UTGWSPEC, VTGWSPEC):
-    """
-    Loads and preprocesses input data for neural network training.
-
-    Args:
-        U (numpy.ndarray): Zonal wind component.
-        V (numpy.ndarray): Meridional wind component.
-        T (numpy.ndarray): Temperature.
-        DSE (numpy.ndarray): Dry static energy.
-        NM (numpy.ndarray): Northward mass flux.
-        NETDT (numpy.ndarray): Net downward total radiation flux.
-        Z3 (numpy.ndarray): Geopotential height.
-        RHOI (numpy.ndarray): Air density.
-        PS (numpy.ndarray): Surface pressure.
-        lat (numpy.ndarray): Latitude.
-        lon (numpy.ndarray): Longitude.
-        UTGWSPEC (numpy.ndarray): Target zonal wind spectral component.
-        VTGWSPEC (numpy.ndarray): Target meridional wind spectral component.
-
-    Returns
-    -------
-    tuple: A tuple containing the input data and target data arrays.
-    """
-    Ncol = U.shape[1]
-    #Nlon = U.shape[2]
-    #Ncol = Nlat*Nlon
-
-    x_train = np.zeros([dim_NN,Ncol])
-    y_train = np.zeros([dim_NNout,Ncol])
-
-
-    x_train [0:ilev, : ] = U.reshape(ilev, Ncol)
-    x_train [ilev:2*ilev, :] = V.reshape(ilev, Ncol)
-    x_train [2*ilev:3*ilev,:] = T.reshape(ilev, Ncol)
-    x_train [3*ilev:4*ilev, :] = DSE.reshape(ilev, Ncol)
-    x_train [4*ilev:5*ilev, :] = NM.reshape(ilev, Ncol)
-    x_train [5*ilev:6*ilev, :] = NETDT.reshape(ilev, Ncol)
-    x_train [6*ilev:7*ilev, :] = Z3.reshape(ilev, Ncol)
-    x_train [7*ilev:8*ilev+1, :] = RHOI.reshape(ilev+1, Ncol)
-    x_train [8*ilev+1:8*ilev+2, :] = PS.reshape(1, Ncol)
-    x_train [8*ilev+2:8*ilev+3, :] = lat.reshape(1, Ncol)
-    x_train [8*ilev+3:ilev*ilev+4, :] = lon.reshape(1, Ncol)
-
-    y_train [0:ilev, :] = UTGWSPEC.reshape(ilev, Ncol)
-    y_train [ilev:2*ilev, :] = VTGWSPEC.reshape(ilev, Ncol)
-
-    return x_train,y_train
-
-
-"""Read the data and the corresponding mean and std deviation"""
-"""Iterating through the data files"""
-s_list = list(range(1, 6))
-
-for iter in s_list:
-    filename = "Demodata/Convection/newCAM_demo_sub_" + str(iter).zfill(1) + ".nc"  # data file
-    print('working on: ', filename)
-    fm = np.load('Demodata/mean_demo_sub.npz')  # mean file
-    fs = np.load('Demodata/std_demo_sub.npz')  # std deviation file
-
+# Iterating through the data files
+s_list = list(range(1, 6))
+for iter in s_list:
+    filename = (
+        "Demodata/Convection/newCAM_demo_sub_" + str(iter).zfill(1) + ".nc"
+    )  # data file
+    print("working on: ", filename)
+    fm = np.load("Demodata/mean_demo_sub.npz")  # mean file
+    fs = np.load("Demodata/std_demo_sub.npz")  # std deviation file
+
+
+def data_loader(U, V, T, DSE, NM, NETDT, Z3, RHOI, PS, lat, lon, UTGWSPEC, VTGWSPEC):
+    """Function to iterate over the data read by the above part of code"""
+    Ncol = U.shape[1]
+    x_train = np.vstack(
+        [
+            U.reshape(-1, Ncol),
+            V.reshape(-1, Ncol),
+            T.reshape(-1, Ncol),
+            DSE.reshape(-1, Ncol),
+            NM.reshape(-1, Ncol),
+            NETDT.reshape(-1, Ncol),
+            Z3.reshape(-1, Ncol),
+            RHOI.reshape(-1, Ncol),
+            PS.reshape(1, Ncol),
+            lat.reshape(1, Ncol),
+            lon.reshape(1, Ncol),
+        ]
+    )
+
+    y_train = np.vstack([UTGWSPEC.reshape(-1, Ncol), VTGWSPEC.reshape(-1, Ncol)])
+
+    return x_train, y_train
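
The vstack-based data_loader above fixes the input/output dimensionality used throughout the repository; the arithmetic, for the demo grid (Ncol = 4419 columns per the notebook that follows):

    ilev = 93
    # U, V, T, DSE, NM, NETDT, Z3 contribute ilev rows each,
    # RHOI is on interface levels (ilev + 1), PS/lat/lon one row each:
    dim_NN = 7 * ilev + (ilev + 1) + 3   # = 8 * ilev + 4 = 748
    dim_NNout = 2 * ilev                 # UTGWSPEC + VTGWSPEC = 186
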
"import torch\n", + "from torch import nn\n", + "from torch.utils.data import Dataset\n", + "from torch.utils.data import DataLoader\n" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "# Define variable names and corresponding mean and std values\n", + "features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC']\n", + "\n", + "directory_path = '../Demodata/Convection'\n", + "file_path_mean = '../Demodata/Convection/mean_demo_sub.npz'\n", + "file_path_std = '../Demodata/Convection/std_demo_sub.npz'" + ] + }, + { + "cell_type": "code", + "execution_count": 267, + "metadata": {}, + "outputs": [], + "source": [ + "def load_variables(directory_path, variable_names, startfile, endfile):\n", + " # Define the variable mapping\n", + " variable_mapping = {\n", + " 'NM': 'NMBV'\n", + " }\n", + "\n", + " # Dictionary to store data for each variable\n", + " variable_data = {}\n", + "\n", + " # Pattern to match file names\n", + " pattern = re.compile(r'^newCAM_demo_sub_\\d{startfile,endfile}$')\n", + "\n", + " # Iterate over each data file in the directory\n", + " for file_name in os.listdir(directory_path):\n", + " # Check if the file starts with 'newCAM_demo_sub_'\n", + " if file_name.startswith('newCAM_demo_sub_'):\n", + " # Construct the full file path\n", + " file_path = os.path.join(directory_path, file_name)\n", + "\n", + " # Load data from the file\n", + " with nc.Dataset(file_path) as dataset:\n", + " # Iterate over each variable name\n", + " for var_name in variable_names:\n", + " # Check if the variable exists in the dataset\n", + " mapped_name = variable_mapping.get(var_name, var_name)\n", + " if mapped_name in dataset.variables:\n", + " # Read the variable data\n", + " var_data = dataset[mapped_name][:]\n", + "\n", + " # Store the variable data in the dictionary\n", + " variable_data[var_name] = var_data\n", + "\n", + " return variable_data\n", + "\n", + "\n", + "def load_mean_std(file_path_mean, file_path_std, variable_names):\n", + " \n", + " # Load mean and standard deviation files\n", + " mean_data = np.load(file_path_mean)\n", + " std_data = np.load(file_path_std)\n", + "\n", + " # Define dictionaries to store mean and std for each variable\n", + " mean_dict = {var_name: mean_data[var_name] for var_name in variable_names}\n", + " std_dict = {var_name: std_data[var_name] for var_name in variable_names}\n", + "\n", + " return mean_dict, std_dict\n", + "\n", + "\n", + "\n", + "def normalize_data(variable_data, mean_values, std_values):\n", + " \n", + " normalized_data = {}\n", + "\n", + " # Iterate over each variable in the variable data\n", + " for var_name, var_data in variable_data.items():\n", + " # Check if variable exists in the mean and std dictionaries\n", + " if var_name in mean_values and var_name in std_values:\n", + " # Extract mean and std for the variable\n", + " mean = mean_values[var_name]\n", + " std = std_values[var_name]\n", + "\n", + " # Perform normalization\n", + " normalized_var_data = (var_data - mean) / std\n", + "\n", + " # Store normalized data\n", + " normalized_data[var_name] = normalized_var_data\n", + "\n", + " return normalized_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 293, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variable: PS Shape: (1, 4419)\n", + "Variable: Z3 Shape: (1, 93, 4419)\n", + "Variable: U Shape: (1, 93, 4419)\n", + "Variable: V Shape: (1, 93, 4419)\n", + 
"Variable: T Shape: (1, 93, 4419)\n", + "Variable: lat Shape: (1, 4419)\n", + "Variable: lon Shape: (1, 4419)\n", + "Variable: DSE Shape: (1, 93, 4419)\n", + "Variable: RHOI Shape: (1, 94, 4419)\n", + "Variable: NETDT Shape: (1, 93, 4419)\n", + "Variable: NM Shape: (1, 93, 4419)\n", + "Variable: UTGWSPEC Shape: (1, 93, 4419)\n", + "Variable: VTGWSPEC Shape: (1, 93, 4419)\n" + ] + } + ], + "source": [ + "variable_data = load_variables(directory_path, features, 1, 5)\n", + "# print(f'Data variables: {variable_data.keys()}')\n", + "mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features)\n", + "# print(f'Mean variables: {mean_dict.keys()}')\n", + "# print(f'Std variables: {std_dict.keys()}')\n", + "normalized_data = normalize_data(variable_data, mean_dict, std_dict)\n", + "# print(f'Normalised variables: {normalized_data.keys()}')\n", + "\n", + "\n", + "for var_name, var_data in normalized_data.items():\n", + " # Get the shape of the variable data\n", + " var_shape = var_data.shape if isinstance(var_data, np.ndarray) else \"Not an array\"\n", + " print(\"Variable:\", var_name, \" Shape:\", var_shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Correct NCol = 4419\n", + "ilev = 93\n", + "\n", + "Points to be considered:\n", + "1. Some variables are different dimensions, varying over different levels (ilev=93/94 here)\n", + "2. These levels cause the input dimensions to become large (8 variables, each with 93 instances (i.e.varying across 93 vertical levels), and 4 variables not varying across the 93 levels.)\n", + "3. Both Input and Ouput variables have 93 levels" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "metadata": {}, + "outputs": [], + "source": [ + "def data_loader(variable_names, normalized_data, ilev):\n", + " # Determine the shape of the data\n", + " Ncol = normalized_data[variable_names[1]].shape[2]\n", + " # print(f'ilev= {ilev} and Ncol={Ncol}')\n", + "\n", + " # Initialize x_train and y_train arrays\n", + " # Calculate dim_NN and dim_NNout\n", + " dim_NN = int(8 * ilev + 4) # 8 variables varying over 93 levels, 4 constant variables (lat, long, PS )\n", + " dim_NNout = int(2 * ilev) #(UTGWSPEC, VTGWSPEC)\n", + "\n", + " # Initialize x_train and y_train arrays\n", + " x_train = np.zeros([dim_NN, Ncol])\n", + " y_train = np.zeros([dim_NNout, Ncol])\n", + "\n", + " # print(f'Set xtrain shape{x_train.shape}')\n", + " # print(f'Set ytrain shape{y_train.shape}')\n", + " target_var = ['UTGWSPEC','VTGWSPEC']\n", + "\n", + " # Assign variables to x_train\n", + " y_index = 0\n", + " x_index = 0\n", + " for var_name, var_data in normalized_data.items():\n", + " var_shape = var_data.shape\n", + "\n", + " if var_name in target_var:\n", + " # print(var_name, y_index\n", + " y_train[y_index * ilev:(y_index + 1) * ilev, :] = var_data.reshape(ilev, Ncol)\n", + " y_index +=1\n", + " \n", + " elif len(var_shape) == 2: # For 2D variables\n", + " # print(var_name, x_index)\n", + " x_train[x_index, :] = var_data\n", + " \n", + " elif len(var_shape) == 3:\n", + " new_ilev = var_shape[1]\n", + " # print(var_name, x_index)\n", + " x_train[x_index:x_index + new_ilev, :] = var_data ### Issue here in extracting variables level-wise because of difference in levels\n", + " x_index+=1\n", + "\n", + " return x_train, y_train\n" + ] + }, + { + "cell_type": "code", + "execution_count": 294, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "(748, 4419) (186, 4419)\n" + ] + } + ], + "source": [ + "xtrain, ytrain = data_loader(features, normalized_data, ilev=93)\n", + "print(xtrain.shape, ytrain.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": {}, + "outputs": [], + "source": [ + "# Required for feeding the data into NN.\n", + "class myDataset(Dataset):\n", + " def __init__(self, X, Y):\n", + " \"\"\"\n", + " Parameters:\n", + " X (tensor): Input data.\n", + " Y (tensor): Output data.\n", + " \"\"\"\n", + " self.features = torch.tensor(X, dtype=torch.float64)\n", + " self.labels = torch.tensor(Y, dtype=torch.float64)\n", + "\n", + " def __len__(self):\n", + " \"\"\"Function that is called when you call len(dataloader)\"\"\"\n", + " return len(self.features.T)\n", + "\n", + " def __getitem__(self, idx):\n", + " \"\"\"Function that is called when you call dataloader\"\"\"\n", + " feature = self.features[:, idx]\n", + " label = self.labels[:, idx]\n", + "\n", + " return feature, label\n" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "metadata": {}, + "outputs": [], + "source": [ + "data = myDataset(X=xtrain, Y=ytrain)\n", + "split_data = torch.utils.data.random_split(data, [0.75, 0.25],generator=torch.Generator().manual_seed(42))\n", + "\n", + "train_dataloader = DataLoader(split_data[0], batch_size=128, shuffle=True)\n", + "val_dataloader = DataLoader(split_data[1], batch_size=len(split_data[1]), shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 260, + "metadata": {}, + "outputs": [], + "source": [ + "class FullyConnected(nn.Module):\n", + " def __init__(self):\n", + " \"\"\"Create an instance of FullyConnected NN model.\"\"\"\n", + " super(FullyConnected, self).__init__()\n", + " ilev = 93\n", + " hidden_layers = 8 # Number of hidden layers\n", + " hidden_size = 500 # Number of neurons in each hidden layer\n", + "\n", + " layers = []\n", + " input_size = 8 * ilev + 4\n", + " for _ in range(hidden_layers):\n", + " layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64))\n", + " layers.append(nn.SiLU())\n", + " input_size = hidden_size\n", + "\n", + " layers.append(nn.Linear(hidden_size, 2 * ilev, dtype=torch.float64))\n", + "\n", + " self.linear_stack = nn.Sequential(*layers)\n", + "\n", + " def forward(self, X):\n", + " return self.linear_stack(X)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 261, + "metadata": {}, + "outputs": [], + "source": [ + "class EarlyStopper:\n", + " def __init__(self, patience=1, min_delta=0):\n", + " \"\"\"Create an instance of EarlyStopper class.\"\"\"\n", + " self.patience = patience\n", + " self.min_delta = min_delta\n", + " self.counter = 0\n", + " self.min_validation_loss = np.inf\n", + "\n", + " def early_stop(self, validation_loss, model=None):\n", + " \"\"\"\n", + " Check if early stopping condition is met.\n", + "\n", + " Args:\n", + " validation_loss (float): Loss value on the validation set.\n", + " model (nn.Module, optional): Model to be saved if early stopping condition is met.\n", + "\n", + " Returns\n", + " -------\n", + " bool: True if early stopping condition is met, False otherwise.\n", + " \"\"\"\n", + " if validation_loss < self.min_validation_loss:\n", + " self.min_validation_loss = validation_loss\n", + " self.counter = 0\n", + "\n", + " # Save model\n", + " if model is not None:\n", + " torch.save(model.state_dict(), 'conv_torch.pth')\n", + "\n", + " elif validation_loss > (self.min_validation_loss + self.min_delta):\n", + " 
self.counter += 1\n", + " if self.counter >= self.patience:\n", + " return True\n", + " return False\n" + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "metadata": {}, + "outputs": [], + "source": [ + "def train_loop(dataloader, model, loss_fn, optimizer):\n", + " size = len(dataloader.dataset)\n", + " avg_loss = 0\n", + " for batch, (X, Y) in enumerate(dataloader):\n", + " # Compute prediction and loss\n", + " pred = model(X)\n", + " loss = loss_fn(pred, Y)\n", + "\n", + " # Backpropagation\n", + " optimizer.zero_grad(set_to_none=True)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " avg_loss += loss.item() # Accumulate loss as a float\n", + "\n", + " avg_loss /= len(dataloader)\n", + "\n", + " return avg_loss\n", + "\n", + "\n", + "\n", + "def val_loop(dataloader, model, loss_fn):\n", + " avg_loss = sum(loss_fn(model(X), Y).item() for X, Y in dataloader) / len(dataloader)\n", + " return avg_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 263, + "metadata": {}, + "outputs": [], + "source": [ + "def train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs=100):\n", + " train_losses = []\n", + " val_losses = [0]\n", + "\n", + " for epoch in range(epochs):\n", + " if epoch % 2 == 0:\n", + " print(f\"Epoch {epoch+1}\\n-------------------------------\")\n", + " print(val_losses[-1])\n", + " print('counter=' + str(early_stopper.counter))\n", + "\n", + " train_loss = train_loop(train_dataloader, model, criterion, optimizer)\n", + " train_losses.append(train_loss)\n", + "\n", + " val_loss = val_loop(val_dataloader, model, criterion)\n", + " val_losses.append(val_loss)\n", + "\n", + " if early_stopper.early_stop(val_loss, model):\n", + " print(\"BREAK!\")\n", + " break\n", + "\n", + " return train_losses, val_losses\n" + ] + }, + { + "cell_type": "code", + "execution_count": 264, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1\n", + "-------------------------------\n", + "0\n", + "counter=0\n", + "Epoch 3\n", + "-------------------------------\n", + "0.8489934648869807\n", + "counter=0\n", + "Epoch 5\n", + "-------------------------------\n", + "0.848900956575377\n", + "counter=0\n", + "Epoch 7\n", + "-------------------------------\n", + "0.8488277427356834\n", + "counter=0\n", + "Epoch 9\n", + "-------------------------------\n", + "0.8487524616774139\n", + "counter=0\n", + "Epoch 11\n", + "-------------------------------\n", + "0.8486892791301094\n", + "counter=0\n", + "Epoch 13\n", + "-------------------------------\n", + "0.8486358109089807\n", + "counter=0\n", + "Epoch 15\n", + "-------------------------------\n", + "0.8485900223018678\n", + "counter=0\n", + "Epoch 17\n", + "-------------------------------\n", + "0.8485513591271571\n", + "counter=0\n", + "Epoch 19\n", + "-------------------------------\n", + "0.8485374125600498\n", + "counter=0\n", + "Epoch 21\n", + "-------------------------------\n", + "0.8485556882976334\n", + "counter=2\n", + "Epoch 23\n", + "-------------------------------\n", + "0.8485675317451268\n", + "counter=4\n", + "BREAK!\n" + ] + } + ], + "source": [ + "learning_rate = 1e-5\n", + "epochs = 100\n", + "\n", + "model = FullyConnected()\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", + "criterion = nn.MSELoss()\n", + "early_stopper = EarlyStopper(patience=5, min_delta=0)\n", + "\n", + "train_losses, val_losses = train_with_early_stopping(train_dataloader, val_dataloader, model, 
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1\n",
      "-------------------------------\n",
      "0\n",
      "counter=0\n",
      "Epoch 3\n",
      "-------------------------------\n",
      "0.8489934648869807\n",
      "counter=0\n",
      "Epoch 5\n",
      "-------------------------------\n",
      "0.848900956575377\n",
      "counter=0\n",
      "Epoch 7\n",
      "-------------------------------\n",
      "0.8488277427356834\n",
      "counter=0\n",
      "Epoch 9\n",
      "-------------------------------\n",
      "0.8487524616774139\n",
      "counter=0\n",
      "Epoch 11\n",
      "-------------------------------\n",
      "0.8486892791301094\n",
      "counter=0\n",
      "Epoch 13\n",
      "-------------------------------\n",
      "0.8486358109089807\n",
      "counter=0\n",
      "Epoch 15\n",
      "-------------------------------\n",
      "0.8485900223018678\n",
      "counter=0\n",
      "Epoch 17\n",
      "-------------------------------\n",
      "0.8485513591271571\n",
      "counter=0\n",
      "Epoch 19\n",
      "-------------------------------\n",
      "0.8485374125600498\n",
      "counter=0\n",
      "Epoch 21\n",
      "-------------------------------\n",
      "0.8485556882976334\n",
      "counter=2\n",
      "Epoch 23\n",
      "-------------------------------\n",
      "0.8485675317451268\n",
      "counter=4\n",
      "BREAK!\n"
     ]
    }
   ],
   "source": [
    "learning_rate = 1e-5\n",
    "epochs = 100\n",
    "\n",
    "model = FullyConnected()\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
    "criterion = nn.MSELoss()\n",
    "early_stopper = EarlyStopper(patience=5, min_delta=0)\n",
    "\n",
    "train_losses, val_losses = train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def predict(input_data, model):\n",
    "    # Convert input data to tensors\n",
    "    input_tensors = {key: torch.tensor(value) for key, value in input_data.items()}\n",
    "\n",
    "    # Ensure model is in evaluation mode\n",
    "    model.eval()\n",
    "\n",
    "    # Forward pass to make predictions\n",
    "    with torch.no_grad():\n",
    "        predictions = model(**input_tensors)\n",
    "\n",
    "    return predictions\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = load_variables(directory_path, features, 5, 6)\n",
    "mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features)\n",
    "normalized_test_data = normalize_data(test_data, mean_dict, std_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<All keys matched successfully>"
      ]
     },
     "execution_count": 284,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_path = 'conv_torch.pth'\n",
    "model = FullyConnected()\n",
    "model.load_state_dict(torch.load(model_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 291,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(748, 4419) (186, 4419)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "x_test, y_test = data_loader(features, normalized_test_data, ilev=93)\n",
    "print(x_test.shape, y_test.shape)\n",
    "\n",
    "test_data = myDataset(X=x_test, Y=y_test)\n",
    "\n",
    "test_loader = DataLoader(data, batch_size=len(data), shuffle=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
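
A minimal inference sketch consistent with the notebook above and with the main.py introduced in the next patch (the transpose makes each row fed to the network one atmospheric column):

    import torch

    model = FullyConnected()
    model.load_state_dict(torch.load("conv_torch.pth"))
    model.eval()
    with torch.no_grad():
        x = torch.tensor(x_test, dtype=torch.float64).T   # (Ncol, 748)
        preds = model(x)                                  # (Ncol, 186)
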
From dd921f09ac26ce26a133d07d99c70d9f54d1d9e0 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 17 Jun 2024 13:01:07 +0100
Subject: [PATCH 12/35] Updated loaddata, Model, train files and removed
 NN_pred, and added a main file

---
 newCAM_emulation/Model.py    | 136 +++++++++--------
 newCAM_emulation/loaddata.py | 238 +++++++++++++++++++++++++-----
 newCAM_emulation/main.py     |  67 +++++++++
 newCAM_emulation/train.py    | 274 +++++++++++++----------------------
 4 files changed, 441 insertions(+), 274 deletions(-)
 create mode 100644 newCAM_emulation/main.py

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index 7077939..096d0cc 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -1,92 +1,100 @@
 """Neural Network model for the CAM-EM."""
 
-import netCDF4 as nc
-import numpy as np
-import scipy.stats as st
 import torch
-import xarray as xr
+import numpy as np
 from torch import nn
-from torch.nn.utils import prune
-from torch.utils.data import DataLoader, Dataset
 
 
-# Required for feeding the data into NN.
-class myDataset(Dataset):
-    """
-    Dataset class for loading features and labels.
-
-    Args:
-        X (numpy.ndarray): Input features.
-        Y (numpy.ndarray): Corresponding labels.
+class FullyConnected(nn.Module):
+    """
+    Fully connected neural network model.
+
+    Attributes
+    ----------
+    linear_stack : nn.Sequential
+        Sequential container of linear layers and activation functions.
     """
 
-    def __init__(self, X, Y):
-        """Create an instance of myDataset class."""
-        self.features = torch.tensor(X, dtype=torch.float64)
-        self.labels = torch.tensor(Y, dtype=torch.float64)
-
-    def __len__(self):
-        """Return the number of samples in the dataset."""
-        return len(self.features.T)
-
-    def __getitem__(self, idx):
-        """Return a sample from the dataset."""
-        feature = self.features[:, idx]
-        label = self.labels[:, idx]
+    def __init__(self):
+        super(FullyConnected, self).__init__()
+        ilev = 93
+        hidden_layers = 8
+        hidden_size = 500
+        layers = []
 
-        return feature, label
+        input_size = 8 * ilev + 4
+        for _ in range(hidden_layers):
+            layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64))
+            layers.append(nn.SiLU())
+            input_size = hidden_size
+        layers.append(nn.Linear(hidden_size, 2 * ilev, dtype=torch.float64))
+        self.linear_stack = nn.Sequential(*layers)
 
+    def forward(self, X):
+        """
+        Forward pass through the network.
 
-# The NN model.
-class NormalizationLayer(nn.Module):
-    def __init__(self, mean, std):
-        super(NormalizationLayer, self).__init__()
-        self.mean = mean
-        self.std = std
+        Parameters
+        ----------
+        X : torch.Tensor
+            Input tensor.
 
-    def forward(self, x):
-        return (x - self.mean) / self.std
+        Returns
+        -------
+        torch.Tensor
+            Output tensor.
+        """
+        return self.linear_stack(X)
 
 
-class FullyConnected(nn.Module):
+class EarlyStopper:
     """
-    Fully connected neural network model.
+    Early stopping utility to stop training when validation loss doesn't improve.
 
-    The model consists of multiple fully connected layers with SiLU activation function.
+    Parameters
+    ----------
+    patience : int, optional
+        Number of epochs to wait before stopping (default is 1).
+    min_delta : float, optional
+        Minimum change in the monitored quantity to qualify as an improvement (default is 0).
 
     Attributes
     ----------
-    linear_stack (torch.nn.Sequential): Sequential container for layers.
+    patience : int
+        Number of epochs to wait before stopping.
+    min_delta : float
+        Minimum change in the monitored quantity to qualify as an improvement.
+    counter : int
+        Counter for the number of epochs without improvement.
+    min_validation_loss : float
+        Minimum validation loss recorded.
     """
 
-    def __init__(self, ilev, mean, std):
-        """Create an instance of FullyConnected NN model."""
-        super(FullyConnected, self).__init__()
-        self.normalization = NormalizationLayer(mean, std)
-        self.ilev = ilev
-
-        layers = []
-        layers.append(nn.Linear(8 * ilev + 4, 500))
-        layers.append(nn.SiLU())
-
-        num_layers = 10  # Example: Change this to the desired number of hidden layers
-        for _ in range(num_layers):
-            layers.append(nn.Linear(500, 500))
-            layers.append(nn.SiLU())
-
-        layers.append(nn.Linear(500, 2 * ilev))
-        self.linear_stack = nn.Sequential(*layers)
+    def __init__(self, patience=1, min_delta=0):
+        self.patience = patience
+        self.min_delta = min_delta
+        self.counter = 0
+        self.min_validation_loss = np.inf
 
-    def forward(self, x):
+    def early_stop(self, validation_loss, model=None):
         """
-        Forward pass through the network.
+        Check if training should be stopped early.
 
-        Args:
-            x (torch.Tensor): Input tensor.
+        Parameters
+        ----------
+        validation_loss : float
+            Current validation loss.
+        model : nn.Module, optional
+            Model to save if validation loss improves (default is None).
 
         Returns
         -------
-        torch.Tensor: Output tensor.
+        bool
+            True if training should be stopped, False otherwise.
         """
-        x = self.normalization(x)
-        return self.linear_stack(x)
+        if validation_loss < self.min_validation_loss:
+            self.min_validation_loss = validation_loss
+            self.counter = 0
+            if model is not None:
+                torch.save(model.state_dict(), "conv_torch.pth")
+        elif validation_loss > (self.min_validation_loss + self.min_delta):
+            self.counter += 1
+            if self.counter >= self.patience:
+                return True
+        return False
diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py
index f2b8cae..f4a6309 100644
--- a/newCAM_emulation/loaddata.py
+++ b/newCAM_emulation/loaddata.py
@@ -1,42 +1,208 @@
 """Implementing data loader for training neural network."""
+import os
+import re
+import torch
 import numpy as np
+import netCDF4 as nc
 
-ilev = 93
-dim_NN = int(8 * ilev + 4)
-dim_NNout = int(2 * ilev)
-
-
-# Iterating through the data files
-s_list = list(range(1, 6))
-for iter in s_list:
-    filename = (
-        "Demodata/Convection/newCAM_demo_sub_" + str(iter).zfill(1) + ".nc"
-    )  # data file
-    print("working on: ", filename)
-    fm = np.load("Demodata/mean_demo_sub.npz")  # mean file
-    fs = np.load("Demodata/std_demo_sub.npz")  # std deviation file
-
-
-def data_loader(U, V, T, DSE, NM, NETDT, Z3, RHOI, PS, lat, lon, UTGWSPEC, VTGWSPEC):
-    """Function to iterate over the data read by the above part of code"""
-    Ncol = U.shape[1]
-    x_train = np.vstack(
-        [
-            U.reshape(-1, Ncol),
-            V.reshape(-1, Ncol),
-            T.reshape(-1, Ncol),
-            DSE.reshape(-1, Ncol),
-            NM.reshape(-1, Ncol),
-            NETDT.reshape(-1, Ncol),
-            Z3.reshape(-1, Ncol),
-            RHOI.reshape(-1, Ncol),
-            PS.reshape(1, Ncol),
-            lat.reshape(1, Ncol),
-            lon.reshape(1, Ncol),
-        ]
-    )
-
-    y_train = np.vstack([UTGWSPEC.reshape(-1, Ncol), VTGWSPEC.reshape(-1, Ncol)])
-
-    return x_train, y_train
+features = [
+    "PS",
+    "Z3",
+    "U",
+    "V",
+    "T",
+    "lat",
+    "lon",
+    "DSE",
+    "RHOI",
+    "NETDT",
+    "NM",
+    "UTGWSPEC",
+    "VTGWSPEC",
+]
+
+
+def load_variables(directory_path, variable_names, startfile, endfile):
+    """
+    Loads specified variables from NetCDF files in the given directory.
+
+    Parameters
+    ----------
+    directory_path : str
+        Path to the directory containing NetCDF files.
+    variable_names : list of str
+        List of variable names to load.
+    startfile : int
+        Starting file number.
+    endfile : int
+        Ending file number.
+
+    Returns
+    -------
+    dict
+        Dictionary containing loaded variables data.
+    """
+    variable_mapping = {"NM": "NMBV"}
+    variable_data = {}
+    # note: pattern is currently unused; files are matched via startswith below
+    pattern = re.compile(rf"^newCAM_demo_sub_[{startfile}-{endfile}]\.nc$")
+
+    for file_name in os.listdir(directory_path):
+        if file_name.startswith("newCAM_demo_sub_"):
+            file_path = os.path.join(directory_path, file_name)
+            with nc.Dataset(file_path) as dataset:
+                for var_name in variable_names:
+                    mapped_name = variable_mapping.get(var_name, var_name)
+                    if mapped_name in dataset.variables:
+                        var_data = dataset[mapped_name][:]
+                        variable_data[var_name] = var_data
+
+    return variable_data
+
+
+def load_mean_std(file_path_mean, file_path_std, variable_names):
+    """
+    Loads mean and standard deviation values for specified variables from files.
+
+    Parameters
+    ----------
+    file_path_mean : str
+        Path to the file containing mean values.
+    file_path_std : str
+        Path to the file containing standard deviation values.
+    variable_names : list of str
+        List of variable names.
+
+    Returns
+    -------
+    tuple of dict
+        Dictionaries containing mean and standard deviation values.
+    """
+    mean_data = np.load(file_path_mean)
+    std_data = np.load(file_path_std)
+    mean_dict = {var_name: mean_data[var_name] for var_name in variable_names}
+    std_dict = {var_name: std_data[var_name] for var_name in variable_names}
+    return mean_dict, std_dict
+
+
+def normalize_data(variable_data, mean_values, std_values):
+    """
+    Normalizes the data using mean and standard deviation values.
+
+    Parameters
+    ----------
+    variable_data : dict
+        Dictionary containing the variable data.
+    mean_values : dict
+        Dictionary containing mean values.
+    std_values : dict
+        Dictionary containing standard deviation values.
+
+    Returns
+    -------
+    dict
+        Dictionary containing normalized data.
+    """
+    normalized_data = {}
+    for var_name, var_data in variable_data.items():
+        if var_name in mean_values and var_name in std_values:
+            mean = mean_values[var_name]
+            std = std_values[var_name]
+            normalized_var_data = (var_data - mean) / std
+            normalized_data[var_name] = normalized_var_data
+    return normalized_data
+
+
+def data_loader(variable_names, normalized_data, ilev):
+    """
+    Prepares the data for training by organizing it into input and output arrays.
+
+    Parameters
+    ----------
+    variable_names : list of str
+        List of variable names.
+    normalized_data : dict
+        Dictionary containing normalized data.
+    ilev : int
+        Number of vertical levels.
+
+    Returns
+    -------
+    tuple of np.ndarray
+        Input and output arrays for training.
+    """
+    Ncol = normalized_data[variable_names[1]].shape[2]
+    dim_NN = int(8 * ilev + 4)
+    dim_NNout = int(2 * ilev)
+    x_train = np.zeros([dim_NN, Ncol])
+    y_train = np.zeros([dim_NNout, Ncol])
+    target_var = ["UTGWSPEC", "VTGWSPEC"]
+    y_index = 0
+    x_index = 0
+    for var_name, var_data in normalized_data.items():
+        var_shape = var_data.shape
+        if var_name in target_var:
+            y_train[y_index * ilev : (y_index + 1) * ilev, :] = var_data.reshape(
+                ilev, Ncol
+            )
+            y_index += 1
+        elif len(var_shape) == 2:
+            x_train[x_index, :] = var_data
+        elif len(var_shape) == 3:
+            new_ilev = var_shape[1]
+            x_train[x_index : x_index + new_ilev, :] = var_data
+            x_index += 1
+
+    return x_train, y_train
+
+
+class MyDataset(torch.utils.data.Dataset):
+    """
+    Custom Dataset for loading features and labels.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        Feature data.
+    Y : np.ndarray
+        Label data.
+
+    Attributes
+    ----------
+    features : torch.Tensor
+        Tensor containing the feature data.
+    labels : torch.Tensor
+        Tensor containing the label data.
+    """
+
+    def __init__(self, X, Y):
+        self.features = torch.tensor(X, dtype=torch.float64)
+        self.labels = torch.tensor(Y, dtype=torch.float64)
+
+    def __len__(self):
+        """
+        Returns the length of the dataset.
+
+        Returns
+        -------
+        int
+            Length of the dataset.
+        """
+        return len(self.features.T)
+
+    def __getitem__(self, idx):
+        """
+        Returns a single sample from the dataset.
+
+        Parameters
+        ----------
+        idx : int
+            Index of the sample.
+
+        Returns
+        -------
+        tuple of torch.Tensor
+            Feature and label tensors for the sample.
+        """
+        feature = self.features[:, idx]
+        label = self.labels[:, idx]
+        return feature, label
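+
+
+# Usage sketch for MyDataset (illustrative, assuming arrays shaped as produced
+# by data_loader; one sample is one atmospheric column):
+#     data = MyDataset(X=x_train, Y=y_train)   # x_train: (748, Ncol)
+#     feature, label = data[0]                 # shapes: (748,), (186,)
+#     loader = DataLoader(data, batch_size=128, shuffle=True)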
diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py
new file mode 100644
index 0000000..d0cb791
--- /dev/null
+++ b/newCAM_emulation/main.py
@@ -0,0 +1,67 @@
+import os
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+import torch.nn as nn
+from loaddata import load_variables, load_mean_std, normalize_data, data_loader, MyDataset, features
+from Model import FullyConnected, EarlyStopper
+from train import train_with_early_stopping
+
+# File paths and parameters
+directory_path = '../Demodata/Convection'
+file_path_mean = '../Demodata/Convection/mean_demo_sub.npz'
+file_path_std = '../Demodata/Convection/std_demo_sub.npz'
+trained_model_path = 'conv_torch.pth'  # Path to save and load the trained model
+
+# Load and preprocess data
+variable_data = load_variables(directory_path, features, 1, 5)
+mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features)
+normalized_data = normalize_data(variable_data, mean_dict, std_dict)
+xtrain, ytrain = data_loader(features, normalized_data, ilev=93)
+
+
+# Print the shapes of xtrain and ytrain
+print(f"xtrain shape: {xtrain.shape}")
+print(f"ytrain shape: {ytrain.shape}")
+
+
+# Prepare dataset and dataloaders
+data = MyDataset(X=xtrain, Y=ytrain)
+split_data = torch.utils.data.random_split(data, [0.75, 0.25], generator=torch.Generator().manual_seed(42))
+train_dataloader = DataLoader(split_data[0], batch_size=128, shuffle=True)
+val_dataloader = DataLoader(split_data[1], batch_size=len(split_data[1]), shuffle=True)
+
+# Model training parameters
+learning_rate = 1e-5
+epochs = 100
+model = FullyConnected()
+optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+criterion = nn.MSELoss()
+early_stopper = EarlyStopper(patience=5, min_delta=0)
+
+# Train the model with early stopping
+train_losses, val_losses = train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs=epochs)
+
+# Save the trained model
+torch.save(model.state_dict(), trained_model_path)
+
+# Load the trained model for prediction
+model.load_state_dict(torch.load(trained_model_path))
+model.eval()
+
+# Prepare input data for prediction
+# For prediction, we need new input data. Here, we use different files for simplicity.
+test_data = load_variables(directory_path, features, 4, 5)
+normalized_test_data = normalize_data(test_data, mean_dict, std_dict)
+x_test, y_test = data_loader(features, normalized_test_data, ilev=93)
+
+# Convert test data to tensors
+x_test_tensor = torch.tensor(x_test, dtype=torch.float64).T
+
+# Make predictions
+with torch.no_grad():
+    predictions = model(x_test_tensor).numpy()
+
+# Print predictions
+print("Predictions Shape:\n", predictions.shape)
+# print("Predictions:\n", predictions)
diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 3f44029..3f084ad 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -1,188 +1,110 @@
 """Training script for the neural network."""
 
-import Model
-import netCDF4 as nc
-import numpy as np
-
 import torch
-from loaddata import data_loader, newnorm
 from torch import nn
-from torch.backends import mps
-from torch.cuda import is_available
-from torch.utils.data import DataLoader
-from torch.cuda import is_available
-from torch.backends import mps
-if is_available():
-    DEVICE = "cuda"
-elif mps.is_available():
-    DEVICE = "mps"
-else:
-    DEVICE = "cpu"
-print(f"Using device: {DEVICE}")
-
-
-class EarlyStopper:
-    """Class for implementing early stopping during training."""
-
-    def __init__(self, patience=1, min_delta=0):
-        """Create an instance of EarlyStopper class."""
-        self.patience = patience
-        self.min_delta = min_delta
-        self.counter = 0
-        self.min_validation_loss = np.inf
-
-    def early_stop(self, validation_loss):
-        """
-        Check if early stopping condition is met.
-
-        Args:
-            validation_loss (float): Loss value on the validation set.
+
+def train_loop(dataloader, model, loss_fn, optimizer):
+    """
+    Training loop for a single epoch.
+
+    Parameters
+    ----------
+    dataloader : torch.utils.data.DataLoader
+        DataLoader for the training data.
+    model : nn.Module
+        Neural network model.
+    loss_fn : callable
+        Loss function.
+    optimizer : torch.optim.Optimizer
+        Optimizer for training.
+
+    Returns
+    -------
+    float
+        Average training loss.
+    """
+    avg_loss = 0
+    for batch, (X, Y) in enumerate(dataloader):
+        pred = model(X)
+        loss = loss_fn(pred, Y)
+        optimizer.zero_grad(set_to_none=True)
+        loss.backward()
+        optimizer.step()
+        avg_loss += loss.item()
+    avg_loss /= len(dataloader)
+    return avg_loss
+
+
+def val_loop(dataloader, model, loss_fn):
+    """
+    Validation loop for a single epoch.
+
+    Parameters
+    ----------
+    dataloader : torch.utils.data.DataLoader
+        DataLoader for the validation data.
+    model : nn.Module
+        Neural network model.
+    loss_fn : callable
+        Loss function.
 
         Returns
         -------
-        bool: True if early stopping condition is met, False otherwise.
-        """
-        if validation_loss < self.min_validation_loss:
-            self.min_validation_loss = validation_loss
-            self.counter = 0
-
-            # save model
-            torch.save(model.state_dict(), "conv_torch.pth")
-
-        elif validation_loss > (self.min_validation_loss + self.min_delta):
-            self.counter += 1
-            if self.counter >= self.patience:
-                return True
-        return False
-
-
-
-
-## load mean and std for normalization
-fm = np.load("../Demodata/mean_demo_sub.npz")
-fs = np.load("../Demodata/std_demo_sub.npz")
-
-Um = fm["U"]
-Vm = fm["V"]
-Tm = fm["T"]
-DSEm = fm["DSE"]
-NMm = fm["NM"]
-NETDTm = fm["NETDT"]
-Z3m = fm["Z3"]
-RHOIm = fm["RHOI"]
-PSm = fm["PS"]
-latm = fm["lat"]
-lonm = fm["lon"]
-UTGWSPECm = fm["UTGWSPEC"]
-VTGWSPECm = fm["VTGWSPEC"]
-
-Us = fs["U"]
-Vs = fs["V"]
-Ts = fs["T"]
-DSEs = fs["DSE"]
-NMs = fs["NM"]
-NETDTs = fs["NETDT"]
-Z3s = fs["Z3"]
-RHOIs = fs["RHOI"]
-PSs = fs["PS"]
-lats = fs["lat"]
-lons = fs["lon"]
-UTGWSPECs = fs["UTGWSPEC"]
-VTGWSPECs = fs["VTGWSPEC"]
-
-ilev = 93
-
-dim_NN = int(8 * ilev + 4)
-dim_NNout = int(2 * ilev)
-
-model = Model.FullyConnected()
-
-train_losses = []
-val_losses = [0]
-
-learning_rate = 1e-5
-epochs = 100
-optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # weight_decay=1e-5
-
-
-s_list = list(range(1, 6))
-
-for iter in s_list:
-    if iter > 1:
-        model.load_state_dict(torch.load("conv_torch.pth"))
-    print("data loader iteration", iter)
-    filename = "../Demodata/newCAM_demo_sub_" + str(iter).zfill(1) + ".nc"
-    print("working on: ", filename)
-
-    F = nc.Dataset(filename)
-
-    PS = np.asarray(F["PS"][0, :])
-    PS = newnorm(PS, PSm, PSs)
-
-    Z3 = np.asarray(F["Z3"][0, :, :])
-    Z3 = newnorm(Z3, Z3m, Z3s)
-
-    U = np.asarray(F["U"][0, :, :])
-    U = newnorm(U, Um, Us)
-
-    V = np.asarray(F["V"][0, :, :])
-    V = newnorm(V, Vm, Vs)
-
-    T = np.asarray(F["T"][0, :, :])
-    T = newnorm(T, Tm, Ts)
-
-    lat = F["lat"]
-    lat = newnorm(lat, np.mean(lat), np.std(lat))
-
-    lon = F["lon"]
-    lon = newnorm(lon, np.mean(lon), np.std(lon))
-
-    DSE = np.asarray(F["DSE"][0, :, :])
-    DSE = newnorm(DSE, DSEm, DSEs)
-
-    RHOI = np.asarray(F["RHOI"][0, :, :])
-    RHOI = newnorm(RHOI, RHOIm, RHOIs)
-
-    NETDT = np.asarray(F["NETDT"][0, :, :])
-    NETDT = newnorm(NETDT, NETDTm, NETDTs)
-
-    NM = np.asarray(F["NMBV"][0, :, :])
-    NM = newnorm(NM, NMm, NMs)
-
-    UTGWSPEC = np.asarray(F["UTGWSPEC"][0, :, :])
-    UTGWSPEC = newnorm(UTGWSPEC, UTGWSPECm, UTGWSPECs)
-
-    VTGWSPEC = np.asarray(F["VTGWSPEC"][0, :, :])
-    VTGWSPEC = newnorm(VTGWSPEC, VTGWSPECm, VTGWSPECs)
-
-    x_train, y_train = data_loader(
-        U, V, T, DSE, NM, NETDT, Z3, RHOI, PS, lat, lon, UTGWSPEC, VTGWSPEC
-    )
-
-    data = Model.myDataset(X=x_train, Y=y_train)
-
-    batch_size = 128
-
-    split_data = torch.utils.data.random_split(
-        data, [0.75, 0.25], generator=torch.Generator().manual_seed(42)
-    )
-    train_dataloader = DataLoader(split_data[0], batch_size=batch_size, shuffle=True)
-    val_dataloader = DataLoader(
-        split_data[1], batch_size=len(split_data[1]), shuffle=True
-    )
-
-    # training
-    early_stopper = EarlyStopper(patience=5, min_delta=0)  # Note the hyper parameters.
-    for t in range(epochs):
-        if t % 2 == 0:
-            print(f"Epoch {t+1}\n-------------------------------")
-            print(val_losses[-1])
-            print("counter=" + str(early_stopper.counter))
-        train_loss = Model.train_loop(train_dataloader, model, nn.MSELoss(), optimizer)
-
-        train_losses.append(train_loss)
-
-        val_loss = Model.val_loop(val_dataloader, model, nn.MSELoss())
-        val_losses.append(val_loss)
-
-        if early_stopper.early_stop(val_loss):
-            print("BREAK!")
-            break
+    float
+        Average validation loss.
+    """
+    avg_loss = sum(loss_fn(model(X), Y).item() for X, Y in dataloader) / len(dataloader)
+    return avg_loss
+
+
+def train_with_early_stopping(
+    train_dataloader,
+    val_dataloader,
+    model,
+    optimizer,
+    criterion,
+    early_stopper,
+    epochs=100,
+):
+    """
+    Train the model with early stopping.
+
+    Parameters
+    ----------
+    train_dataloader : torch.utils.data.DataLoader
+        DataLoader for the training data.
+    val_dataloader : torch.utils.data.DataLoader
+        DataLoader for the validation data.
+    model : nn.Module
+        Neural network model.
+    optimizer : torch.optim.Optimizer
+        Optimizer for training.
+    criterion : callable
+        Loss function.
+    early_stopper : EarlyStopper
+        Early stopping utility.
+    epochs : int, optional
+        Number of epochs to train (default is 100).
+
+    Returns
+    -------
+    tuple of list of float
+        Training losses and validation losses for each epoch.
+    """
+    train_losses = []
+    val_losses = [0]
+    for epoch in range(epochs):
+        # if epoch % 2 == 0:
+        #     print(f"Epoch {epoch + 1}\n-------------------------------")
+        #     print(val_losses[-1])
+        #     print('counter=' + str(early_stopper.counter))
+        train_loss = train_loop(train_dataloader, model, criterion, optimizer)
+        train_losses.append(train_loss)
+        val_loss = val_loop(val_dataloader, model, criterion)
+        val_losses.append(val_loss)
+        if early_stopper.early_stop(val_loss, model):
+            # print("BREAK!")
+            break
+    return train_losses, val_losses
""" - def __init__(self): + def __init__(self, ilev=93, hidden_layers=8, hidden_size=500): super(FullyConnected, self).__init__() - ilev = 93 - hidden_layers = 8 - hidden_size = 500 layers = [] - - input_size = 8 * ilev + 4 + input_size = 8 * ilev + 4 ### Correct this hard coded part for _ in range(hidden_layers): layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64)) layers.append(nn.SiLU()) From 9bb671cb72636eb2d3aeb2158694aae929d2b0fb Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 24 Jun 2024 20:47:17 +0100 Subject: [PATCH 14/35] removed a side comment --- newCAM_emulation/Model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py index d39b6d0..05f7f40 100644 --- a/newCAM_emulation/Model.py +++ b/newCAM_emulation/Model.py @@ -18,7 +18,7 @@ class FullyConnected(nn.Module): def __init__(self, ilev=93, hidden_layers=8, hidden_size=500): super(FullyConnected, self).__init__() layers = [] - input_size = 8 * ilev + 4 ### Correct this hard coded part + input_size = 8 * ilev + 4 for _ in range(hidden_layers): layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64)) layers.append(nn.SiLU()) From b578031ac472f857a0c0d01dd332004c4ad4a1c1 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 24 Jun 2024 20:51:05 +0100 Subject: [PATCH 15/35] removed redundant NN_pred.py, replaced with main.py --- newCAM_emulation/NN_pred.py | 170 ------------------------------------ 1 file changed, 170 deletions(-) delete mode 100644 newCAM_emulation/NN_pred.py diff --git a/newCAM_emulation/NN_pred.py b/newCAM_emulation/NN_pred.py deleted file mode 100644 index 8b20304..0000000 --- a/newCAM_emulation/NN_pred.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Prediction module for the neural network.""" - -import matplotlib.pyplot as plt -import Model -import netCDF4 as nc -import numpy as np -import torch -import torch.nn.functional as nnF -import torchvision -from loaddata import data_loader, newnorm -from torch import nn -from torch.utils.data import DataLoader -from torchvision import datasets, transforms -from torchvision.utils import save_image - -""" -Determine if any GPUs are available -""" -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -print(device) - - -""" -Initialize Hyperparameters -""" -ilev = 93 -dim_NN = 8*ilev + 4 -dim_NNout = 2*ilev - -batch_size = 8 -learning_rate = 1e-4 -num_epochs = 1 - - - - - -## load mean and std for normalization -fm = np.load('Demodata/mean_demo.npz') -fs = np.load('Demodata/std_demo.npz') - -Um = fm['U'] -Vm = fm['V'] -Tm = fm['T'] -DSEm = fm['DSE'] -NMm = fm['NM'] -NETDTm = fm['NETDT'] -Z3m = fm['Z3'] -RHOIm = fm['RHOI'] -PSm = fm['PS'] -latm = fm['lat'] -lonm = fm['lon'] -UTGWSPECm = fm['UTGWSPEC'] -VTGWSPECm = fm['VTGWSPEC'] - -Us = fs['U'] -Vs = fs['V'] -Ts = fs['T'] -DSEs = fs['DSE'] -NMs = fs['NM'] -NETDTs = fs['NETDT'] -Z3s = fs['Z3'] -RHOIs = fs['RHOI'] -PSs = fs['PS'] -lats = fs['lat'] -lons = fs['lon'] -UTGWSPECs = fs['UTGWSPEC'] -VTGWSPECs = fs['VTGWSPEC'] - - - -""" -Initialize the network and the Adam optimizer -""" -GWnet = Model.FullyConnected() - -optimizer = torch.optim.Adam(GWnet.parameters(), lr=learning_rate) - - -s_list = list(range(5,6)) - -for iter in s_list: - if (iter > 0): - GWnet.load_state_dict(torch.load('./conv_torch.pth')) - GWnet.eval() - print ('data loader iteration',iter) - filename = './Demodata/Demo_timestep_' + str(iter).zfill(3) + '.nc' - - F = nc.Dataset(filename) - PS = np.asarray(F['PS'][0,:]) - PS = newnorm(PS, 
PSm, PSs) - - Z3 = np.asarray(F['Z3'][0,:,:]) - Z3 = newnorm(Z3, Z3m, Z3s) - - U = np.asarray(F['U'][0,:,:]) - U = newnorm(U, Um, Us) - - V = np.asarray(F['V'][0,:,:]) - V = newnorm(V, Vm, Vs) - - T = np.asarray(F['T'][0,:,:]) - T = newnorm(T, Tm, Ts) - - lat = F['lat'] - lat = newnorm(lat, np.mean(lat), np.std(lat)) - - lon = F['lon'] - lon = newnorm(lon, np.mean(lon), np.std(lon)) - - DSE = np.asarray(F['DSE'][0,:,:]) - DSE = newnorm(DSE, DSEm, DSEs) - - RHOI = np.asarray(F['RHOI'][0,:,:]) - RHOI = newnorm(RHOI, RHOIm, RHOIs) - - NETDT = np.asarray(F['NETDT'][0,:,:]) - NETDT = newnorm(NETDT, NETDTm, NETDTs) - - NM = np.asarray(F['NMBV'][0,:,:]) - NM = newnorm(NM, NMm, NMs) - - UTGWSPEC = np.asarray(F['BUTGWSPEC'][0,:,:]) - UTGWSPEC = newnorm(UTGWSPEC, UTGWSPECm, UTGWSPECs) - - VTGWSPEC = np.asarray(F['BVTGWSPEC'][0,:,:]) - VTGWSPEC = newnorm(VTGWSPEC, VTGWSPECm, VTGWSPECs) - - - - print('shape of PS',np.shape(PS)) - print('shape of Z3',np.shape(Z3)) - print('shape of U',np.shape(U)) - print('shape of V',np.shape(V)) - print('shape of T',np.shape(T)) - print('shape of DSE',np.shape(DSE)) - print('shape of RHOI',np.shape(RHOI)) - print('shape of NETDT',np.shape(NETDT)) - print('shape of NM',np.shape(NM)) - print('shape of UTGWSPEC',np.shape(UTGWSPEC)) - print('shape of VTGWSPEC',np.shape(VTGWSPEC)) - - x_test,y_test = data_loader (U,V,T, DSE, NM, NETDT, Z3, - RHOI, PS,lat,lon,UTGWSPEC, VTGWSPEC) - - print('shape of x_test', np.shape(x_test)) - print('shape of y_test', np.shape(y_test)) - - - data = Model.myDataset(X=x_test, Y=y_test) - test_loader = DataLoader(data, batch_size=len(data), shuffle=False) - print(test_loader) - - - for batch, (X, Y) in enumerate(test_loader): - print(np.shape(Y)) - pred = GWnet(X) - truth = Y.cpu().detach().numpy() - predict = pred.cpu().detach().numpy() - - print(np.corrcoef(truth.flatten(), predict.flatten())[0, 1]) - print('shape of truth ',np.shape(truth)) - print('shape of prediction',np.shape(predict)) - - np.save('./pred_data_' + str(iter) + '.npy', predict) - - - - - From 7167611dad674f4fa67ce7bac6b387e85343fd40 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 24 Jun 2024 20:52:08 +0100 Subject: [PATCH 16/35] removed --- newCAM_emulation/GW.ipynb | 581 -------------------------------------- 1 file changed, 581 deletions(-) delete mode 100644 newCAM_emulation/GW.ipynb diff --git a/newCAM_emulation/GW.ipynb b/newCAM_emulation/GW.ipynb deleted file mode 100644 index 119be08..0000000 --- a/newCAM_emulation/GW.ipynb +++ /dev/null @@ -1,581 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 252, - "metadata": {}, - "outputs": [], - "source": [ - "import netCDF4 as nc\n", - "import numpy as np\n", - "import os\n", - "import re\n", - "import torch\n", - "from torch import nn\n", - "from torch.utils.data import Dataset\n", - "from torch.utils.data import DataLoader\n" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "# Define variable names and corresponding mean and std values\n", - "features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC']\n", - "\n", - "directory_path = '../Demodata/Convection'\n", - "file_path_mean = '../Demodata/Convection/mean_demo_sub.npz'\n", - "file_path_std = '../Demodata/Convection/std_demo_sub.npz'" - ] - }, - { - "cell_type": "code", - "execution_count": 267, - "metadata": {}, - "outputs": [], - "source": [ - "def load_variables(directory_path, variable_names, startfile, endfile):\n", - " # 
Define the variable mapping\n", - " variable_mapping = {\n", - " 'NM': 'NMBV'\n", - " }\n", - "\n", - " # Dictionary to store data for each variable\n", - " variable_data = {}\n", - "\n", - " # Pattern to match file names\n", - " pattern = re.compile(r'^newCAM_demo_sub_\\d{startfile,endfile}$')\n", - "\n", - " # Iterate over each data file in the directory\n", - " for file_name in os.listdir(directory_path):\n", - " # Check if the file starts with 'newCAM_demo_sub_'\n", - " if file_name.startswith('newCAM_demo_sub_'):\n", - " # Construct the full file path\n", - " file_path = os.path.join(directory_path, file_name)\n", - "\n", - " # Load data from the file\n", - " with nc.Dataset(file_path) as dataset:\n", - " # Iterate over each variable name\n", - " for var_name in variable_names:\n", - " # Check if the variable exists in the dataset\n", - " mapped_name = variable_mapping.get(var_name, var_name)\n", - " if mapped_name in dataset.variables:\n", - " # Read the variable data\n", - " var_data = dataset[mapped_name][:]\n", - "\n", - " # Store the variable data in the dictionary\n", - " variable_data[var_name] = var_data\n", - "\n", - " return variable_data\n", - "\n", - "\n", - "def load_mean_std(file_path_mean, file_path_std, variable_names):\n", - " \n", - " # Load mean and standard deviation files\n", - " mean_data = np.load(file_path_mean)\n", - " std_data = np.load(file_path_std)\n", - "\n", - " # Define dictionaries to store mean and std for each variable\n", - " mean_dict = {var_name: mean_data[var_name] for var_name in variable_names}\n", - " std_dict = {var_name: std_data[var_name] for var_name in variable_names}\n", - "\n", - " return mean_dict, std_dict\n", - "\n", - "\n", - "\n", - "def normalize_data(variable_data, mean_values, std_values):\n", - " \n", - " normalized_data = {}\n", - "\n", - " # Iterate over each variable in the variable data\n", - " for var_name, var_data in variable_data.items():\n", - " # Check if variable exists in the mean and std dictionaries\n", - " if var_name in mean_values and var_name in std_values:\n", - " # Extract mean and std for the variable\n", - " mean = mean_values[var_name]\n", - " std = std_values[var_name]\n", - "\n", - " # Perform normalization\n", - " normalized_var_data = (var_data - mean) / std\n", - "\n", - " # Store normalized data\n", - " normalized_data[var_name] = normalized_var_data\n", - "\n", - " return normalized_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 293, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variable: PS Shape: (1, 4419)\n", - "Variable: Z3 Shape: (1, 93, 4419)\n", - "Variable: U Shape: (1, 93, 4419)\n", - "Variable: V Shape: (1, 93, 4419)\n", - "Variable: T Shape: (1, 93, 4419)\n", - "Variable: lat Shape: (1, 4419)\n", - "Variable: lon Shape: (1, 4419)\n", - "Variable: DSE Shape: (1, 93, 4419)\n", - "Variable: RHOI Shape: (1, 94, 4419)\n", - "Variable: NETDT Shape: (1, 93, 4419)\n", - "Variable: NM Shape: (1, 93, 4419)\n", - "Variable: UTGWSPEC Shape: (1, 93, 4419)\n", - "Variable: VTGWSPEC Shape: (1, 93, 4419)\n" - ] - } - ], - "source": [ - "variable_data = load_variables(directory_path, features, 1, 5)\n", - "# print(f'Data variables: {variable_data.keys()}')\n", - "mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features)\n", - "# print(f'Mean variables: {mean_dict.keys()}')\n", - "# print(f'Std variables: {std_dict.keys()}')\n", - "normalized_data = normalize_data(variable_data, mean_dict, std_dict)\n", - "# 
print(f'Normalised variables: {normalized_data.keys()}')\n", - "\n", - "\n", - "for var_name, var_data in normalized_data.items():\n", - " # Get the shape of the variable data\n", - " var_shape = var_data.shape if isinstance(var_data, np.ndarray) else \"Not an array\"\n", - " print(\"Variable:\", var_name, \" Shape:\", var_shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Correct NCol = 4419\n", - "ilev = 93\n", - "\n", - "Points to be considered:\n", - "1. Some variables are different dimensions, varying over different levels (ilev=93/94 here)\n", - "2. These levels cause the input dimensions to become large (8 variables, each with 93 instances (i.e.varying across 93 vertical levels), and 4 variables not varying across the 93 levels.)\n", - "3. Both Input and Ouput variables have 93 levels" - ] - }, - { - "cell_type": "code", - "execution_count": 256, - "metadata": {}, - "outputs": [], - "source": [ - "def data_loader(variable_names, normalized_data, ilev):\n", - " # Determine the shape of the data\n", - " Ncol = normalized_data[variable_names[1]].shape[2]\n", - " # print(f'ilev= {ilev} and Ncol={Ncol}')\n", - "\n", - " # Initialize x_train and y_train arrays\n", - " # Calculate dim_NN and dim_NNout\n", - " dim_NN = int(8 * ilev + 4) # 8 variables varying over 93 levels, 4 constant variables (lat, long, PS )\n", - " dim_NNout = int(2 * ilev) #(UTGWSPEC, VTGWSPEC)\n", - "\n", - " # Initialize x_train and y_train arrays\n", - " x_train = np.zeros([dim_NN, Ncol])\n", - " y_train = np.zeros([dim_NNout, Ncol])\n", - "\n", - " # print(f'Set xtrain shape{x_train.shape}')\n", - " # print(f'Set ytrain shape{y_train.shape}')\n", - " target_var = ['UTGWSPEC','VTGWSPEC']\n", - "\n", - " # Assign variables to x_train\n", - " y_index = 0\n", - " x_index = 0\n", - " for var_name, var_data in normalized_data.items():\n", - " var_shape = var_data.shape\n", - "\n", - " if var_name in target_var:\n", - " # print(var_name, y_index\n", - " y_train[y_index * ilev:(y_index + 1) * ilev, :] = var_data.reshape(ilev, Ncol)\n", - " y_index +=1\n", - " \n", - " elif len(var_shape) == 2: # For 2D variables\n", - " # print(var_name, x_index)\n", - " x_train[x_index, :] = var_data\n", - " \n", - " elif len(var_shape) == 3:\n", - " new_ilev = var_shape[1]\n", - " # print(var_name, x_index)\n", - " x_train[x_index:x_index + new_ilev, :] = var_data ### Issue here in extracting variables level-wise because of difference in levels\n", - " x_index+=1\n", - "\n", - " return x_train, y_train\n" - ] - }, - { - "cell_type": "code", - "execution_count": 294, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(748, 4419) (186, 4419)\n" - ] - } - ], - "source": [ - "xtrain, ytrain = data_loader(features, normalized_data, ilev=93)\n", - "print(xtrain.shape, ytrain.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "metadata": {}, - "outputs": [], - "source": [ - "# Required for feeding the data into NN.\n", - "class myDataset(Dataset):\n", - " def __init__(self, X, Y):\n", - " \"\"\"\n", - " Parameters:\n", - " X (tensor): Input data.\n", - " Y (tensor): Output data.\n", - " \"\"\"\n", - " self.features = torch.tensor(X, dtype=torch.float64)\n", - " self.labels = torch.tensor(Y, dtype=torch.float64)\n", - "\n", - " def __len__(self):\n", - " \"\"\"Function that is called when you call len(dataloader)\"\"\"\n", - " return 
len(self.features.T)\n", - "\n", - " def __getitem__(self, idx):\n", - " \"\"\"Function that is called when you call dataloader\"\"\"\n", - " feature = self.features[:, idx]\n", - " label = self.labels[:, idx]\n", - "\n", - " return feature, label\n" - ] - }, - { - "cell_type": "code", - "execution_count": 259, - "metadata": {}, - "outputs": [], - "source": [ - "data = myDataset(X=xtrain, Y=ytrain)\n", - "split_data = torch.utils.data.random_split(data, [0.75, 0.25],generator=torch.Generator().manual_seed(42))\n", - "\n", - "train_dataloader = DataLoader(split_data[0], batch_size=128, shuffle=True)\n", - "val_dataloader = DataLoader(split_data[1], batch_size=len(split_data[1]), shuffle=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 260, - "metadata": {}, - "outputs": [], - "source": [ - "class FullyConnected(nn.Module):\n", - " def __init__(self):\n", - " \"\"\"Create an instance of FullyConnected NN model.\"\"\"\n", - " super(FullyConnected, self).__init__()\n", - " ilev = 93\n", - " hidden_layers = 8 # Number of hidden layers\n", - " hidden_size = 500 # Number of neurons in each hidden layer\n", - "\n", - " layers = []\n", - " input_size = 8 * ilev + 4\n", - " for _ in range(hidden_layers):\n", - " layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64))\n", - " layers.append(nn.SiLU())\n", - " input_size = hidden_size\n", - "\n", - " layers.append(nn.Linear(hidden_size, 2 * ilev, dtype=torch.float64))\n", - "\n", - " self.linear_stack = nn.Sequential(*layers)\n", - "\n", - " def forward(self, X):\n", - " return self.linear_stack(X)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 261, - "metadata": {}, - "outputs": [], - "source": [ - "class EarlyStopper:\n", - " def __init__(self, patience=1, min_delta=0):\n", - " \"\"\"Create an instance of EarlyStopper class.\"\"\"\n", - " self.patience = patience\n", - " self.min_delta = min_delta\n", - " self.counter = 0\n", - " self.min_validation_loss = np.inf\n", - "\n", - " def early_stop(self, validation_loss, model=None):\n", - " \"\"\"\n", - " Check if early stopping condition is met.\n", - "\n", - " Args:\n", - " validation_loss (float): Loss value on the validation set.\n", - " model (nn.Module, optional): Model to be saved if early stopping condition is met.\n", - "\n", - " Returns\n", - " -------\n", - " bool: True if early stopping condition is met, False otherwise.\n", - " \"\"\"\n", - " if validation_loss < self.min_validation_loss:\n", - " self.min_validation_loss = validation_loss\n", - " self.counter = 0\n", - "\n", - " # Save model\n", - " if model is not None:\n", - " torch.save(model.state_dict(), 'conv_torch.pth')\n", - "\n", - " elif validation_loss > (self.min_validation_loss + self.min_delta):\n", - " self.counter += 1\n", - " if self.counter >= self.patience:\n", - " return True\n", - " return False\n" - ] - }, - { - "cell_type": "code", - "execution_count": 262, - "metadata": {}, - "outputs": [], - "source": [ - "def train_loop(dataloader, model, loss_fn, optimizer):\n", - " size = len(dataloader.dataset)\n", - " avg_loss = 0\n", - " for batch, (X, Y) in enumerate(dataloader):\n", - " # Compute prediction and loss\n", - " pred = model(X)\n", - " loss = loss_fn(pred, Y)\n", - "\n", - " # Backpropagation\n", - " optimizer.zero_grad(set_to_none=True)\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " avg_loss += loss.item() # Accumulate loss as a float\n", - "\n", - " avg_loss /= len(dataloader)\n", - "\n", - " return avg_loss\n", - "\n", - "\n", - "\n", - "def 
val_loop(dataloader, model, loss_fn):\n", - " avg_loss = sum(loss_fn(model(X), Y).item() for X, Y in dataloader) / len(dataloader)\n", - " return avg_loss" - ] - }, - { - "cell_type": "code", - "execution_count": 263, - "metadata": {}, - "outputs": [], - "source": [ - "def train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs=100):\n", - " train_losses = []\n", - " val_losses = [0]\n", - "\n", - " for epoch in range(epochs):\n", - " if epoch % 2 == 0:\n", - " print(f\"Epoch {epoch+1}\\n-------------------------------\")\n", - " print(val_losses[-1])\n", - " print('counter=' + str(early_stopper.counter))\n", - "\n", - " train_loss = train_loop(train_dataloader, model, criterion, optimizer)\n", - " train_losses.append(train_loss)\n", - "\n", - " val_loss = val_loop(val_dataloader, model, criterion)\n", - " val_losses.append(val_loss)\n", - "\n", - " if early_stopper.early_stop(val_loss, model):\n", - " print(\"BREAK!\")\n", - " break\n", - "\n", - " return train_losses, val_losses\n" - ] - }, - { - "cell_type": "code", - "execution_count": 264, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1\n", - "-------------------------------\n", - "0\n", - "counter=0\n", - "Epoch 3\n", - "-------------------------------\n", - "0.8489934648869807\n", - "counter=0\n", - "Epoch 5\n", - "-------------------------------\n", - "0.848900956575377\n", - "counter=0\n", - "Epoch 7\n", - "-------------------------------\n", - "0.8488277427356834\n", - "counter=0\n", - "Epoch 9\n", - "-------------------------------\n", - "0.8487524616774139\n", - "counter=0\n", - "Epoch 11\n", - "-------------------------------\n", - "0.8486892791301094\n", - "counter=0\n", - "Epoch 13\n", - "-------------------------------\n", - "0.8486358109089807\n", - "counter=0\n", - "Epoch 15\n", - "-------------------------------\n", - "0.8485900223018678\n", - "counter=0\n", - "Epoch 17\n", - "-------------------------------\n", - "0.8485513591271571\n", - "counter=0\n", - "Epoch 19\n", - "-------------------------------\n", - "0.8485374125600498\n", - "counter=0\n", - "Epoch 21\n", - "-------------------------------\n", - "0.8485556882976334\n", - "counter=2\n", - "Epoch 23\n", - "-------------------------------\n", - "0.8485675317451268\n", - "counter=4\n", - "BREAK!\n" - ] - } - ], - "source": [ - "learning_rate = 1e-5\n", - "epochs = 100\n", - "\n", - "model = FullyConnected()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", - "criterion = nn.MSELoss()\n", - "early_stopper = EarlyStopper(patience=5, min_delta=0)\n", - "\n", - "train_losses, val_losses = train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 275, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def predict(input_data, model):\n", - " # Convert input data to tensors\n", - " input_tensors = {key: torch.tensor(value) for key, value in input_data.items()}\n", - "\n", - " # Ensure model is in evaluation mode\n", - " model.eval()\n", - "\n", - " # Forward pass to make predictions\n", - " with torch.no_grad():\n", - " predictions = model(**input_tensors)\n", - "\n", - " return predictions\n" - ] - }, - { - "cell_type": "code", - "execution_count": 287, - "metadata": {}, - "outputs": [], - "source": [ - "test_data = load_variables(directory_path, features, 5,6)\n", - "mean_dict, std_dict = 
load_mean_std(file_path_mean, file_path_std, features)\n", - "normalized_test_data = normalize_data(test_data, mean_dict, std_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 284, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 284, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_path = 'conv_torch.pth'\n", - "model = FullyConnected()\n", - "model.load_state_dict(torch.load(model_path))" - ] - }, - { - "cell_type": "code", - "execution_count": 291, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(748, 4419) (186, 4419)\n" - ] - } - ], - "source": [ - "\n", - "x_test, y_test = data_loader(features, normalized_test_data, ilev=93)\n", - "print(x_test.shape, y_test.shape)\n", - "\n", - "test_data = myDataset(X=x_test, Y=y_test)\n", - "\n", - "test_loader = DataLoader(data, batch_size=len(data), shuffle=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From dcd75858982324fa31a317cec9f75cb92e13ad52 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 24 Jun 2024 20:54:29 +0100 Subject: [PATCH 17/35] Updated name of the file Model.py to model.py --- newCAM_emulation/modelrun.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 newCAM_emulation/modelrun.py diff --git a/newCAM_emulation/modelrun.py b/newCAM_emulation/modelrun.py new file mode 100644 index 0000000..3833b5c --- /dev/null +++ b/newCAM_emulation/modelrun.py @@ -0,0 +1,34 @@ +import torch +from torch import nn +import numpy as np + +import netCDF4 as nc + +# Path to the .nc file +file_path = 'Demodata/Convection/newCAM_demo_sub_5.nc' + + +# Open the netCDF file +with nc.Dataset(file_path, "r") as f: + # List all variables in the file along with their shapes + print("Variables in the file:") + for var_name, var in f.variables.items(): + print(f"Variable '{var_name}' shape: {var.shape}") + + + +# # import numpy as np + +# def get_variable_names(npz_file_path): +# # Load the .npz file +# npz_data = np.load(npz_file_path) + +# # Get the list of variable names +# variable_names = list(npz_data.keys()) + +# return variable_names + +# # # Example usage: +# # npz_file_path = 'Demodata/Convection/std_demo_sub.npz' # Replace 'your_file_path.npz' with the path to your .npz file +# # variable_names = get_variable_names(npz_file_path) +# # print("Variable names:", variable_names) From 1a8c86f7a60e99d1ab57b0606d2ceefd9f5661be Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 24 Jun 2024 20:55:45 +0100 Subject: [PATCH 18/35] removed modelrun file, unused here --- newCAM_emulation/modelrun.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 newCAM_emulation/modelrun.py diff --git a/newCAM_emulation/modelrun.py b/newCAM_emulation/modelrun.py deleted file mode 100644 index 3833b5c..0000000 --- a/newCAM_emulation/modelrun.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -from torch import nn -import numpy as np - -import netCDF4 as nc - -# Path to the .nc file -file_path = 'Demodata/Convection/newCAM_demo_sub_5.nc' - - -# Open the netCDF 
file -with nc.Dataset(file_path, "r") as f: - # List all variables in the file along with their shapes - print("Variables in the file:") - for var_name, var in f.variables.items(): - print(f"Variable '{var_name}' shape: {var.shape}") - - - -# # import numpy as np - -# def get_variable_names(npz_file_path): -# # Load the .npz file -# npz_data = np.load(npz_file_path) - -# # Get the list of variable names -# variable_names = list(npz_data.keys()) - -# return variable_names - -# # # Example usage: -# # npz_file_path = 'Demodata/Convection/std_demo_sub.npz' # Replace 'your_file_path.npz' with the path to your .npz file -# # variable_names = get_variable_names(npz_file_path) -# # print("Variable names:", variable_names) From 8b2a8a2659327d43a52fddcd01e46024a9401888 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Tue, 25 Jun 2024 21:35:31 +0100 Subject: [PATCH 19/35] Changed hard coded input output values to variables --- newCAM_emulation/Model.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py index 05f7f40..fe73724 100644 --- a/newCAM_emulation/Model.py +++ b/newCAM_emulation/Model.py @@ -15,15 +15,24 @@ class FullyConnected(nn.Module): Sequential container of linear layers and activation functions. """ - def __init__(self, ilev=93, hidden_layers=8, hidden_size=500): + def __init__( + self, ilev=93, in_ver=8, in_nover=4, out_ver=2, hidden_layers=8, hidden_size=500 + ): super(FullyConnected, self).__init__() + self.ilev = ilev + self.in_ver = in_ver + self.in_nover = in_nover + self.out_ver = out_ver + self.hidden_layers = hidden_layers + self.hidden_size = hidden_size + layers = [] - input_size = 8 * ilev + 4 + input_size = in_ver * ilev + in_nover for _ in range(hidden_layers): layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64)) layers.append(nn.SiLU()) input_size = hidden_size - layers.append(nn.Linear(hidden_size, 2 * ilev, dtype=torch.float64)) + layers.append(nn.Linear(hidden_size, out_ver * ilev, dtype=torch.float64)) self.linear_stack = nn.Sequential(*layers) def forward(self, X): From 9a63f3fa10d4d0a06240eb7c1ad494e5d4f0a062 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Tue, 25 Jun 2024 21:41:29 +0100 Subject: [PATCH 20/35] Removed hard-coded values in data_loader --- newCAM_emulation/loaddata.py | 23 +++-------------------- newCAM_emulation/main.py | 20 +++++++++++++++----- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py index f4a6309..f790993 100644 --- a/newCAM_emulation/loaddata.py +++ b/newCAM_emulation/loaddata.py @@ -7,23 +7,6 @@ import netCDF4 as nc -features = [ - "PS", - "Z3", - "U", - "V", - "T", - "lat", - "lon", - "DSE", - "RHOI", - "NETDT", - "NM", - "UTGWSPEC", - "VTGWSPEC", -] - - def load_variables(directory_path, variable_names, startfile, endfile): """ Loads specified variables from NetCDF files in the given directory. @@ -114,7 +97,7 @@ def normalize_data(variable_data, mean_values, std_values): return normalized_data -def data_loader(variable_names, normalized_data, ilev): +def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver): """ Prepares the data for training by organizing it into input and output arrays. @@ -133,8 +116,8 @@ def data_loader(variable_names, normalized_data, ilev): Input and output arrays for training. 
""" Ncol = normalized_data[variable_names[1]].shape[2] - dim_NN = int(8 * ilev + 4) - dim_NNout = int(2 * ilev) + dim_NN = int(in_ver * ilev + in_nover) + dim_NNout = int(out_ver * ilev) x_train = np.zeros([dim_NN, Ncol]) y_train = np.zeros([dim_NNout, Ncol]) target_var = ["UTGWSPEC", "VTGWSPEC"] diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py index d0cb791..60771d9 100644 --- a/newCAM_emulation/main.py +++ b/newCAM_emulation/main.py @@ -3,8 +3,8 @@ import torch from torch.utils.data import DataLoader import torch.nn as nn -from loaddata import load_variables, load_mean_std, normalize_data, data_loader, MyDataset, features -from Model import FullyConnected, EarlyStopper +from loaddata import load_variables, load_mean_std, normalize_data, data_loader, MyDataset +from model import FullyConnected, EarlyStopper from train import train_with_early_stopping # File paths and parameters @@ -13,11 +13,18 @@ file_path_std = '../Demodata/Convection/std_demo_sub.npz' trained_model_path = 'conv_torch.pth' # Path to save and load the trained model +#variable information +features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC'] +ilev = 93 +in_ver = 8 +in_nover = 4 +out_ver = 2 + # Load and preprocess data variable_data = load_variables(directory_path, features, 1, 5) mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features) normalized_data = normalize_data(variable_data, mean_dict, std_dict) -xtrain, ytrain = data_loader(features, normalized_data, ilev=93) +xtrain, ytrain = data_loader(features, normalized_data, ilev=ilev, in_ver=in_ver, in_nover=in_nover, out_ver=out_ver) # Print the shapes of xtrain and ytrain @@ -34,7 +41,10 @@ # Model training parameters learning_rate = 1e-5 epochs = 100 -model = FullyConnected() +hidden_layers = 8 +hidden_size = 500 + +model = FullyConnected(ilev, in_ver,in_nover,out_ver, hidden_layers, hidden_size) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) criterion = nn.MSELoss() early_stopper = EarlyStopper(patience=5, min_delta=0) @@ -53,7 +63,7 @@ # For prediction, we need new input data. Here, we use different files for simplicity. 
test_data = load_variables(directory_path, features, 4, 5) normalized_test_data = normalize_data(test_data, mean_dict, std_dict) -x_test, y_test = data_loader(features, normalized_test_data, ilev=93) +x_test, y_test = data_loader(features, normalized_test_data, ilev=ilev, in_ver=in_ver, in_nover=in_nover, out_ver=out_ver) # Convert test data to tensors x_test_tensor = torch.tensor(x_test, dtype=torch.float64).T From e041e3badf6d8df6c50d4270295e2c7e1170635d Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 29 Jul 2024 15:56:23 +0100 Subject: [PATCH 21/35] Updated model saving to trained_models folder --- newCAM_emulation/main.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py index 60771d9..05800f8 100644 --- a/newCAM_emulation/main.py +++ b/newCAM_emulation/main.py @@ -8,10 +8,10 @@ from train import train_with_early_stopping # File paths and parameters -directory_path = '../Demodata/Convection' -file_path_mean = '../Demodata/Convection/mean_demo_sub.npz' -file_path_std = '../Demodata/Convection/std_demo_sub.npz' -trained_model_path = 'conv_torch.pth' # Path to save and load the trained model +directory_path = 'Demodata/Convection' +file_path_mean = 'Demodata/Convection/mean_demo_sub.npz' +file_path_std = 'Demodata/Convection/std_demo_sub.npz' +trained_model_path = 'trained_models/weights_conv' # Path to save and load the trained model #variable information features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC'] @@ -21,12 +21,11 @@ out_ver = 2 # Load and preprocess data -variable_data = load_variables(directory_path, features, 1, 5) +variable_data = load_variables(directory_path, features, 1, 5) mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features) normalized_data = normalize_data(variable_data, mean_dict, std_dict) xtrain, ytrain = data_loader(features, normalized_data, ilev=ilev, in_ver=in_ver, in_nover=in_nover, out_ver=out_ver) - # Print the shapes of xtrain and ytrain print(f"xtrain shape: {xtrain.shape}") print(f"ytrain shape: {ytrain.shape}") @@ -51,6 +50,8 @@ # Train the model with early stopping train_losses, val_losses = train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs=epochs) +print(f'Train Loss: {train_losses}') +print(f'Valid Loss: {val_losses}') # Save the trained model torch.save(model.state_dict(), trained_model_path) @@ -58,6 +59,7 @@ # Load the trained model for prediction model.load_state_dict(torch.load(trained_model_path)) model.eval() +print() # Prepare input data for prediction # For prediction, we need new input data. Here, we use different files for simplicity. 
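The patch above moves the checkpoint from `conv_torch.pth` into the `trained_models` folder. A minimal sketch of the save/load round trip `main.py` performs with such a checkpoint — the random tensor stands in for real input columns, and everything below is illustrative rather than code from any commit:

```python
import torch
from Model import FullyConnected  # the module defined in this series

trained_model_path = "trained_models/weights_conv"

# Save: state_dict() holds only the learned parameters, not the class itself.
model = FullyConnected()  # defaults: ilev=93, in_ver=8, in_nover=4, out_ver=2
torch.save(model.state_dict(), trained_model_path)

# Load: rebuild the same architecture, then restore the parameters into it.
model = FullyConnected()
model.load_state_dict(torch.load(trained_model_path))
model.eval()

with torch.no_grad():
    # 16 columns of inputs, 8 * 93 + 4 = 748 features each (in_ver * ilev + in_nover)
    x = torch.randn(16, 8 * 93 + 4, dtype=torch.float64)
    pred = model(x)  # shape (16, 186), i.e. out_ver * ilev outputs per column
```

Since only the `state_dict` is saved, the loading side must first construct `FullyConnected` with the same hyperparameters before calling `load_state_dict`.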
From 19576f2bf7ed628e934c10daafc4065e10b4aa2b Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Tue, 30 Jul 2024 11:45:14 +0100
Subject: [PATCH 22/35] Updated reshaping in loaddata dataloader, included
 print messages in train and deleted commented out code in model

---
 newCAM_emulation/Model.py    | 5 +++--
 newCAM_emulation/loaddata.py | 6 +++---
 newCAM_emulation/train.py    | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index fe73724..da6049d 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -100,8 +100,9 @@ def early_stop(self, validation_loss, model=None):
         if validation_loss < self.min_validation_loss:
             self.min_validation_loss = validation_loss
             self.counter = 0
-            if model is not None:
-                torch.save(model.state_dict(), "conv_torch.pth")
+            # if model is not None:
+            #     # torch.save(model.state_dict(), 'conv_torch.pth')
+            #     torch.save(model.state_dict(), 'trained_models/weights_conv')
         elif validation_loss > (self.min_validation_loss + self.min_delta):
             self.counter += 1
             if self.counter >= self.patience:
diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py
index f790993..2860b31 100644
--- a/newCAM_emulation/loaddata.py
+++ b/newCAM_emulation/loaddata.py
@@ -126,9 +126,9 @@ def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver
     for var_name, var_data in normalized_data.items():
         var_shape = var_data.shape
         if var_name in target_var:
-            y_train[y_index * ilev : (y_index + 1) * ilev, :] = var_data.reshape(
-                ilev, Ncol
-            )
+            # y_train[y_index * ilev:(y_index + 1) * ilev, :] = var_data.reshape(ilev, Ncol)
+            y_train[y_index * ilev : (y_index + 1) * ilev, :] = var_data
+
             y_index += 1
         elif len(var_shape) == 2:
             x_train[x_index, :] = var_data
diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 3f084ad..cb38e9c 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -96,10 +96,10 @@ def train_with_early_stopping(
     train_losses = []
     val_losses = [0]
     for epoch in range(epochs):
-        # if epoch % 2 == 0:
-        #     print(f"Epoch {epoch + 1}\n-------------------------------")
-        #     print(val_losses[-1])
-        #     print('counter=' + str(early_stopper.counter))
+        if epoch % 2 == 0:
+            print(f"Epoch {epoch + 1}\n-------------------------------")
+            print(val_losses[-1])
+            print("counter=" + str(early_stopper.counter))
         train_loss = train_loop(train_dataloader, model, criterion, optimizer)
         train_losses.append(train_loss)
         val_loss = val_loop(val_dataloader, model, criterion)

From 61e6fbaf04e498205cb1909345a1a96cb8298cfd Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 5 Aug 2024 12:26:17 +0100
Subject: [PATCH 23/35] Added a comment to clarify the looping through
 identical layers of the net

---
 newCAM_emulation/Model.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index da6049d..9947f2d 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -27,7 +27,10 @@ def __init__(
         layers = []
+
         input_size = in_ver * ilev + in_nover
+
+        # The following for loop provides the sequential layer by layer flow of data in the model as the layers used in our model are identical.
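Unrolled for the defaults (`ilev=93`, `in_ver=8`, `in_nover=4`, `out_ver=2`, `hidden_layers=8`, `hidden_size=500`), the loop documented by the comment above assembles the following stack — a sketch for illustration, not code from any commit:

```python
import torch
from torch import nn

layers = []
input_size = 8 * 93 + 4  # in_ver * ilev + in_nover = 748
for _ in range(8):  # hidden_layers identical blocks
    layers.append(nn.Linear(input_size, 500, dtype=torch.float64))
    layers.append(nn.SiLU())
    input_size = 500  # after the first block every Linear maps 500 -> 500
layers.append(nn.Linear(500, 2 * 93, dtype=torch.float64))  # output head, 186 wide

net = nn.Sequential(*layers)
print(net)  # Linear(748->500)+SiLU, 7 x (Linear(500->500)+SiLU), Linear(500->186)
```

Only the first `Linear` sees the full 748-feature input; reassigning `input_size = hidden_size` inside the loop is what makes the remaining blocks identical.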
    for _ in range(hidden_layers):
         layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64))
         layers.append(nn.SiLU())

From c22b64f184413d1877b183c15586cb4301bb86ed Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 5 Aug 2024 15:30:29 +0100
Subject: [PATCH 24/35] Updated the documentation to add latest code changes

---
 README.md | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index e1e06bb..cab0759 100644
--- a/README.md
+++ b/README.md
@@ -48,13 +48,23 @@ The model is trained using the script `train.py` using the demo data. The optimi
 The `Demodata` folder contains the demo data used to train and test the model
 
 The `newCAM_emulation` folder contains the code that is required to load data, train the model and make predictions, which is structured as follows:
-> `train.py` - train the model
 
-> `NN-pred.py` - predict the GWD using the trained model
-
-> `loaddata.py` - load the data and reshape it to the NN input
+> `loaddata.py` - loads the data from the source .nc files and normalises it before feeding it to the neural network.
 
-> `model.py` - define the NN model
+> `model.py` - defines the NN class and the early stopping mechanism.
+
+> `train.py` - trains the model for a given number of epochs using the training and validation loops.
+
+> `main.py` - uses the above three modules to sequentially (a condensed sketch follows this list):
+1. Read the features list (this varies with the GW source; currently it is convection).
+2. Take information about the data, such as ilev and the number of variables varying across vertical levels.
+3. Use `loaddata.py` to load the variables in the feature list, normalise them, build the `xtrain` and `ytrain` arrays, and create a custom dataset for easy iteration over them.
+4. Take model hyperparameters such as the learning rate, epochs and hidden layers, and pass them to `model.py`.
+5. Take the loss function, optimiser and early-stopping parameters and pass them to `train.py` along with the defined model and the custom dataset.
+6. Train the model and save the weights in the
+`trained_models` folder.
+7. The saved model can be loaded and tested on any dataset here.
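Condensed into code, the sequence described in the list above looks roughly like this (a sketch using the demo paths and values that appear elsewhere in this series; `main.py` in the repository remains the authoritative version):

```python
from loaddata import (MyDataset, data_loader, load_mean_std,
                      load_variables, normalize_data)
from Model import FullyConnected

features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE',
            'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC']           # 1. feature list

raw = load_variables('Demodata', features, 1, 5)                     # 3. load ...
mean_d, std_d = load_mean_std('Demodata/mean_demo_sub.npz',
                              'Demodata/std_demo_sub.npz', features)
norm = normalize_data(raw, mean_d, std_d)                            # ... and normalise
xtrain, ytrain = data_loader(features, norm, ilev=93,                # 2. data information
                             in_ver=8, in_nover=4, out_ver=2)
dataset = MyDataset(X=xtrain, Y=ytrain)                              # ... custom dataset

model = FullyConnected(93, 8, 4, 2, 8, 500)                          # 4. hyperparameters
# 5./6. hand model, dataset, loss, optimiser and early stopper to
# train_with_early_stopping, then torch.save the weights into trained_models/.
```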
+
 ## Usage Instructions
 To use the repository, the following steps are required:
 
From 6a8d1eaa41a0920fe8aedda71bc5cfb05d318df8 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Tue, 13 Aug 2024 10:56:52 +0100
Subject: [PATCH 25/35] Changed the directory path to load data from Demodata

---
 newCAM_emulation/main.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py
index 05800f8..bc15714 100644
--- a/newCAM_emulation/main.py
+++ b/newCAM_emulation/main.py
@@ -4,13 +4,13 @@
 from torch.utils.data import DataLoader
 import torch.nn as nn
 from loaddata import load_variables, load_mean_std, normalize_data, data_loader, MyDataset
-from model import FullyConnected, EarlyStopper
+from Model import FullyConnected, EarlyStopper
 from train import train_with_early_stopping
 
 # File paths and parameters
-directory_path = 'Demodata/Convection'
-file_path_mean = 'Demodata/Convection/mean_demo_sub.npz'
-file_path_std = 'Demodata/Convection/std_demo_sub.npz'
+directory_path = 'Demodata'
+file_path_mean = 'Demodata/mean_demo_sub.npz'
+file_path_std = 'Demodata/std_demo_sub.npz'
 trained_model_path = 'trained_models/weights_conv' # Path to save and load the trained model
 
 #variable information

From 53e9069a88eb7a0d3acf412f4da559d3fd1158cf Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 19 Aug 2024 10:29:43 +0100
Subject: [PATCH 26/35] Rebased and added 2 ruff warning suppressors PLR0913 and PLR2004

---
 newCAM_emulation/Model.py    | 11 ++++++----
 newCAM_emulation/__init__.py |  1 +
 newCAM_emulation/loaddata.py | 29 ++++++++++++++++---------
 newCAM_emulation/main.py     | 42 +++++++++++++++++++++++++++---------
 newCAM_emulation/train.py    | 28 ++++++++++++------------
 5 files changed, 73 insertions(+), 38 deletions(-)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index 9947f2d..be01c30 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -1,9 +1,10 @@
 """Neural Network model for the CAM-EM."""
 
-import torch
 import numpy as np
+import torch
 from torch import nn
 
+# ruff: noqa: PLR0913
 
 class FullyConnected(nn.Module):
     """
@@ -16,7 +17,8 @@ class FullyConnected(nn.Module):
     """
 
     def __init__(
-        self, ilev=93, in_ver=8, in_nover=4, out_ver=2, hidden_layers=8, hidden_size=500
+        self, ilev=93, in_ver=8, in_nover=4, out_ver=2,
+        hidden_layers=8, hidden_size=500
     ):
         super(FullyConnected, self).__init__()
         self.ilev = ilev
@@ -30,7 +32,8 @@ def __init__(
 
         input_size = in_ver * ilev + in_nover
 
-        # The following for loop provides the sequential layer by layer flow of data in the model as the layers used in our model are identical.
+        # The following for loop provides the sequential layer by layer flow
+        # of data in the model as the layers used in our model are identical.
         for _ in range(hidden_layers):
             layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64))
             layers.append(nn.SiLU())
@@ -64,7 +67,7 @@ class EarlyStopper:
     patience : int, optional
         Number of epochs to wait before stopping (default is 1).
     min_delta : float, optional
-        Minimum change in the monitored quantity to qualify as an improvement (default is 0).
+        Minimum change in the loss to qualify as an improvement (default is 0).
    Attributes
     ----------
     patience : int
         Number of epochs to wait before stopping.
     min_delta : float
         Minimum change in the monitored quantity to qualify as an improvement.
     counter : int
         Counter for the number of epochs without improvement.
     min_validation_loss : float
         Minimum validation loss recorded.
diff --git a/newCAM_emulation/__init__.py b/newCAM_emulation/__init__.py
index e69de29..2cf3fc8 100644
--- a/newCAM_emulation/__init__.py
+++ b/newCAM_emulation/__init__.py
@@ -0,0 +1 @@
+"""Instantiate the Emulation."""
diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py
index 2860b31..cb0b4f5 100644
--- a/newCAM_emulation/loaddata.py
+++ b/newCAM_emulation/loaddata.py
@@ -2,14 +2,17 @@
 
 import os
 import re
-import torch
-import numpy as np
+
 import netCDF4 as nc
+import numpy as np
+import torch
 
+# ruff: noqa: PLR0913
+# ruff: noqa: PLR2004
 
 def load_variables(directory_path, variable_names, startfile, endfile):
     """
-    Loads specified variables from NetCDF files in the given directory.
+    Load specified variables from NetCDF files in the given directory.
 
     Parameters
     ----------
@@ -46,7 +49,7 @@ def load_variables(directory_path, variable_names, startfile, endfile):
 
 def load_mean_std(file_path_mean, file_path_std, variable_names):
     """
-    Loads mean and standard deviation values for specified variables from files.
+    Load mean and standard deviation values for specified variables from files.
 
     Parameters
     ----------
@@ -71,7 +74,7 @@ def load_mean_std(file_path_mean, file_path_std, variable_names):
 
 def normalize_data(variable_data, mean_values, std_values):
     """
-    Normalizes the data using mean and standard deviation values.
+    Normalize the data using mean and standard deviation values.
 
     Parameters
     ----------
@@ -99,7 +102,7 @@ def normalize_data(variable_data, mean_values, std_values):
 
 def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver):
     """
-    Prepares the data for training by organizing it into input and output arrays.
+    Prepare the data for training by organizing it into input and output arrays.
 
     Parameters
     ----------
@@ -109,6 +112,13 @@ def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver
         Dictionary containing normalized data.
     ilev : int
         Number of vertical levels.
+    in_ver : int
+        Number of input variables that vary across vertical levels.
+    in_nover : int
+        Number of input variables that do not vary across vertical levels.
+    out_ver : int
+        Number of output variables that vary across vertical levels.
+
 
     Returns
     -------
@@ -126,9 +136,7 @@ def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver
     for var_name, var_data in normalized_data.items():
         var_shape = var_data.shape
         if var_name in target_var:
-            # y_train[y_index * ilev:(y_index + 1) * ilev, :] = var_data.reshape(ilev, Ncol)
             y_train[y_index * ilev : (y_index + 1) * ilev, :] = var_data
-
             y_index += 1
         elif len(var_shape) == 2:
             x_train[x_index, :] = var_data
@@ -164,7 +172,8 @@ def __init__(self, X, Y):
 
     def __len__(self):
         """
-        Returns the length of the dataset.
+        Return the length of the dataset.
+
         Returns
         -------
         int
@@ -174,7 +183,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         """
-        Returns a single sample from the dataset.
+        Return a single sample from the dataset.
Parameters ---------- diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py index bc15714..a3c6a7d 100644 --- a/newCAM_emulation/main.py +++ b/newCAM_emulation/main.py @@ -1,30 +1,42 @@ +"""Script to load data and train the neural network.""" + import os + import numpy as np import torch +from loaddata import ( + MyDataset, + data_loader, + load_mean_std, + load_variables, + normalize_data, +) +from Model import EarlyStopper, FullyConnected +from torch import nn from torch.utils.data import DataLoader -import torch.nn as nn -from loaddata import load_variables, load_mean_std, normalize_data, data_loader, MyDataset -from Model import FullyConnected, EarlyStopper from train import train_with_early_stopping # File paths and parameters directory_path = 'Demodata' file_path_mean = 'Demodata/mean_demo_sub.npz' file_path_std = 'Demodata/std_demo_sub.npz' -trained_model_path = 'trained_models/weights_conv' # Path to save and load the trained model +trained_model_path = 'trained_models/weights_conv' #variable information -features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC'] +features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', + 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC'] ilev = 93 in_ver = 8 in_nover = 4 out_ver = 2 # Load and preprocess data -variable_data = load_variables(directory_path, features, 1, 5) +variable_data = load_variables(directory_path, features, 1, 5) mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features) normalized_data = normalize_data(variable_data, mean_dict, std_dict) -xtrain, ytrain = data_loader(features, normalized_data, ilev=ilev, in_ver=in_ver, in_nover=in_nover, out_ver=out_ver) +xtrain, ytrain = data_loader( + features, normalized_data, ilev=ilev, in_ver=in_ver, + in_nover=in_nover, out_ver=out_ver) # Print the shapes of xtrain and ytrain print(f"xtrain shape: {xtrain.shape}") @@ -33,7 +45,8 @@ # Prepare dataset and dataloaders data = MyDataset(X=xtrain, Y=ytrain) -split_data = torch.utils.data.random_split(data, [0.75, 0.25], generator=torch.Generator().manual_seed(42)) +split_data = torch.utils.data.random_split( + data, [0.75, 0.25], generator=torch.Generator().manual_seed(42)) train_dataloader = DataLoader(split_data[0], batch_size=128, shuffle=True) val_dataloader = DataLoader(split_data[1], batch_size=len(split_data[1]), shuffle=True) @@ -49,7 +62,14 @@ early_stopper = EarlyStopper(patience=5, min_delta=0) # Train the model with early stopping -train_losses, val_losses = train_with_early_stopping(train_dataloader, val_dataloader, model, optimizer, criterion, early_stopper, epochs=epochs) +train_losses, val_losses = train_with_early_stopping( + train_dataloader, + val_dataloader, + model, + optimizer, + criterion, + early_stopper, + epochs=epochs) print(f'Train Loss: {train_losses}') print(f'Valid Loss: {val_losses}') @@ -65,7 +85,9 @@ # For prediction, we need new input data. Here, we use different files for simplicity. 
test_data = load_variables(directory_path, features, 4, 5)
 normalized_test_data = normalize_data(test_data, mean_dict, std_dict)
-x_test, y_test = data_loader(features, normalized_test_data, ilev=ilev, in_ver=in_ver, in_nover=in_nover, out_ver=out_ver)
+x_test, y_test = data_loader(
+    features, normalized_test_data, ilev=ilev, in_ver=in_ver,
+    in_nover=in_nover, out_ver=out_ver)
 
 # Convert test data to tensors
 x_test_tensor = torch.tensor(x_test, dtype=torch.float64).T
diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index cb38e9c..733d90b 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -3,6 +3,7 @@
 import torch
 from torch import nn
 
+# ruff: noqa: PLR0913
 
 def train_loop(dataloader, model, loss_fn, optimizer):
     """
@@ -38,22 +39,21 @@ def train_loop(dataloader, model, loss_fn, optimizer):
 
 def val_loop(dataloader, model, loss_fn):
     """
-    Validation loop for a single epoch.
-    >>>>>>> 70b2c64 (Updated loaddata, Model, train files and removed NN_pred, and added a main file)
+    Run the validation loop for a single epoch.
 
-    Parameters
-    ----------
-    dataloader : torch.utils.data.DataLoader
-        DataLoader for the validation data.
-    model : nn.Module
-        Neural network model.
-    loss_fn : callable
-        Loss function.
+    Parameters
+    ----------
+    dataloader : torch.utils.data.DataLoader
+        DataLoader for the validation data.
+    model : nn.Module
+        Neural network model.
+    loss_fn : callable
+        Loss function.
 
-    Returns
-    -------
-    float
-        Average validation loss.
+    Returns
+    -------
+    float
+        Average validation loss.
     """
     avg_loss = sum(loss_fn(model(X), Y).item() for X, Y in dataloader) / len(dataloader)
     return avg_loss

From 4775e45482f732172f107f45fee104b2d3e680ea Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 19 Aug 2024 10:55:45 +0100
Subject: [PATCH 27/35] Removed ruff warning suppressor ruff: noqa: PLR0913

---
 newCAM_emulation/train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 733d90b..aeb3631 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -3,7 +3,6 @@
 import torch
 from torch import nn
 
-# ruff: noqa: PLR0913
 
 def train_loop(dataloader, model, loss_fn, optimizer):
     """

From e76ab22b5dd9b4ce7656a3cf862a7040d19f10d7 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 19 Aug 2024 11:14:13 +0100
Subject: [PATCH 28/35] Added ruff warning suppressor ruff: noqa: PLR0913

---
 newCAM_emulation/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index aeb3631..733d90b 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -3,6 +3,7 @@
 import torch
 from torch import nn
 
+# ruff: noqa: PLR0913
 
 def train_loop(dataloader, model, loss_fn, optimizer):
     """

From 122253c06f4adc60ed0cdbf91c7867bf7b9d6e61 Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 19 Aug 2024 11:21:07 +0100
Subject: [PATCH 29/35] Reformatted using Ruff

---
 newCAM_emulation/Model.py    |  4 +--
 newCAM_emulation/loaddata.py |  1 +
 newCAM_emulation/main.py     | 59 +++++++++++++++++++++++++-----------
 newCAM_emulation/train.py    |  1 +
 4 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index be01c30..1b16cba 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -6,6 +6,7 @@
 
 # ruff: noqa: PLR0913
 
+
 class FullyConnected(nn.Module):
     """
     Fully connected neural network model.
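For reference, the two Ruff rules that PATCH 26–28 suppress police argument counts and bare numeric literals. A small sketch of what each would flag without the suppressions (the second function is hypothetical; the first mirrors `data_loader`'s signature):

```python
# Module-wide suppressions, placed at the top of the file as in loaddata.py:
# ruff: noqa: PLR0913
# ruff: noqa: PLR2004


# PLR0913 ("too many arguments") would otherwise flag this six-argument signature.
def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver):
    ...


# PLR2004 ("magic value used in comparison") would otherwise flag the bare 2,
# the shape-length check loaddata.py uses for variables without a level axis.
def varies_with_level(var_shape):
    return len(var_shape) != 2
```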
@@ -17,8 +18,7 @@ class FullyConnected(nn.Module): """ def __init__( - self, ilev=93, in_ver=8, in_nover=4, out_ver=2, - hidden_layers=8, hidden_size=500 + self, ilev=93, in_ver=8, in_nover=4, out_ver=2, hidden_layers=8, hidden_size=500 ): super(FullyConnected, self).__init__() self.ilev = ilev diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py index cb0b4f5..2764cc5 100644 --- a/newCAM_emulation/loaddata.py +++ b/newCAM_emulation/loaddata.py @@ -10,6 +10,7 @@ # ruff: noqa: PLR0913 # ruff: noqa: PLR2004 + def load_variables(directory_path, variable_names, startfile, endfile): """ Load specified variables from NetCDF files in the given directory. diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py index a3c6a7d..9cce3bd 100644 --- a/newCAM_emulation/main.py +++ b/newCAM_emulation/main.py @@ -17,14 +17,27 @@ from train import train_with_early_stopping # File paths and parameters -directory_path = 'Demodata' -file_path_mean = 'Demodata/mean_demo_sub.npz' -file_path_std = 'Demodata/std_demo_sub.npz' -trained_model_path = 'trained_models/weights_conv' - -#variable information -features = ['PS', 'Z3', 'U', 'V', 'T', 'lat', 'lon', 'DSE', - 'RHOI', 'NETDT', 'NM', 'UTGWSPEC', 'VTGWSPEC'] +directory_path = "Demodata" +file_path_mean = "Demodata/mean_demo_sub.npz" +file_path_std = "Demodata/std_demo_sub.npz" +trained_model_path = "trained_models/weights_conv" + +# variable information +features = [ + "PS", + "Z3", + "U", + "V", + "T", + "lat", + "lon", + "DSE", + "RHOI", + "NETDT", + "NM", + "UTGWSPEC", + "VTGWSPEC", +] ilev = 93 in_ver = 8 in_nover = 4 @@ -35,8 +48,13 @@ mean_dict, std_dict = load_mean_std(file_path_mean, file_path_std, features) normalized_data = normalize_data(variable_data, mean_dict, std_dict) xtrain, ytrain = data_loader( - features, normalized_data, ilev=ilev, in_ver=in_ver, - in_nover=in_nover, out_ver=out_ver) + features, + normalized_data, + ilev=ilev, + in_ver=in_ver, + in_nover=in_nover, + out_ver=out_ver, +) # Print the shapes of xtrain and ytrain print(f"xtrain shape: {xtrain.shape}") @@ -46,7 +64,8 @@ # Prepare dataset and dataloaders data = MyDataset(X=xtrain, Y=ytrain) split_data = torch.utils.data.random_split( - data, [0.75, 0.25], generator=torch.Generator().manual_seed(42)) + data, [0.75, 0.25], generator=torch.Generator().manual_seed(42) +) train_dataloader = DataLoader(split_data[0], batch_size=128, shuffle=True) val_dataloader = DataLoader(split_data[1], batch_size=len(split_data[1]), shuffle=True) @@ -56,7 +75,7 @@ hidden_layers = 8 hidden_size = 500 -model = FullyConnected(ilev, in_ver,in_nover,out_ver, hidden_layers, hidden_size) +model = FullyConnected(ilev, in_ver, in_nover, out_ver, hidden_layers, hidden_size) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) criterion = nn.MSELoss() early_stopper = EarlyStopper(patience=5, min_delta=0) @@ -69,9 +88,10 @@ optimizer, criterion, early_stopper, - epochs=epochs) -print(f'Train Loss: {train_losses}') -print(f'Valid Loss: {val_losses}') + epochs=epochs, +) +print(f"Train Loss: {train_losses}") +print(f"Valid Loss: {val_losses}") # Save the trained model torch.save(model.state_dict(), trained_model_path) @@ -86,8 +106,13 @@ test_data = load_variables(directory_path, features, 4, 5) normalized_test_data = normalize_data(test_data, mean_dict, std_dict) x_test, y_test = data_loader( - features, normalized_test_data, ilev=ilev, in_ver=in_ver, - in_nover=in_nover, out_ver=out_ver) + features, + normalized_test_data, + ilev=ilev, + in_ver=in_ver, + 
+    in_nover=in_nover,
+    out_ver=out_ver,
+)
 
 # Convert test data to tensors
 x_test_tensor = torch.tensor(x_test, dtype=torch.float64).T
diff --git a/newCAM_emulation/train.py b/newCAM_emulation/train.py
index 733d90b..690ed27 100644
--- a/newCAM_emulation/train.py
+++ b/newCAM_emulation/train.py
@@ -5,6 +5,7 @@
 
 # ruff: noqa: PLR0913
 
+
 def train_loop(dataloader, model, loss_fn, optimizer):
     """
     Training loop for a single epoch.

From 3bb2324d88d00c7c867bd36b7f3174220edc2c1a Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 19 Aug 2024 12:08:23 +0100
Subject: [PATCH 30/35] Rearranged sequence of variables in the list

---
 newCAM_emulation/main.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/newCAM_emulation/main.py b/newCAM_emulation/main.py
index 9cce3bd..dbc02b9 100644
--- a/newCAM_emulation/main.py
+++ b/newCAM_emulation/main.py
@@ -24,17 +24,17 @@
 
 # variable information
 features = [
-    "PS",
-    "Z3",
     "U",
     "V",
     "T",
-    "lat",
-    "lon",
     "DSE",
-    "RHOI",
-    "NETDT",
     "NM",
+    "NETDT",
+    "Z3",
+    "RHOI",
+    "PS",
+    "lat",
+    "lon",
     "UTGWSPEC",
     "VTGWSPEC",
 ]

From dac7400553c1840bd88b4f30d2eafb7b4d73f6ee Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 16 Sep 2024 13:05:42 +0100
Subject: [PATCH 31/35] Removed hard-coded target variables in data loader

---
 newCAM_emulation/loaddata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/newCAM_emulation/loaddata.py b/newCAM_emulation/loaddata.py
index 2764cc5..17b203a 100644
--- a/newCAM_emulation/loaddata.py
+++ b/newCAM_emulation/loaddata.py
@@ -131,7 +131,7 @@ def data_loader(variable_names, normalized_data, ilev, in_ver, in_nover, out_ver
     dim_NNout = int(out_ver * ilev)
     x_train = np.zeros([dim_NN, Ncol])
     y_train = np.zeros([dim_NNout, Ncol])
-    target_var = ["UTGWSPEC", "VTGWSPEC"]
+    target_var = variable_names[-2:]
     y_index = 0
     x_index = 0
     for var_name, var_data in normalized_data.items():

From ff53367b9047cbd30c512ab4274f283596eae7ca Mon Sep 17 00:00:00 2001
From: Surbhi Goel
Date: Mon, 16 Sep 2024 13:08:20 +0100
Subject: [PATCH 32/35] Moved Early Stopper class from Model

---
 newCAM_emulation/Model.py | 56 ---------------------------------------
 1 file changed, 56 deletions(-)

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
index 1b16cba..2f1da62 100644
--- a/newCAM_emulation/Model.py
+++ b/newCAM_emulation/Model.py
@@ -58,59 +58,3 @@ def forward(self, X):
         return self.linear_stack(X)
 
 
-class EarlyStopper:
-    """
-    Early stopping utility to stop training when validation loss doesn't improve.
-
-    Parameters
-    ----------
-    patience : int, optional
-        Number of epochs to wait before stopping (default is 1).
-    min_delta : float, optional
-        Minimum change in the loss to qualify as an improvement (default is 0).
-
-    Attributes
-    ----------
-    patience : int
-        Number of epochs to wait before stopping.
-    min_delta : float
-        Minimum change in the monitored quantity to qualify as an improvement.
-    counter : int
-        Counter for the number of epochs without improvement.
-    min_validation_loss : float
-        Minimum validation loss recorded.
-    """
-
-    def __init__(self, patience=1, min_delta=0):
-        self.patience = patience
-        self.min_delta = min_delta
-        self.counter = 0
-        self.min_validation_loss = np.inf
-
-    def early_stop(self, validation_loss, model=None):
-        """
-        Check if training should be stopped early.
-
-        Parameters
-        ----------
-        validation_loss : float
-            Current validation loss.
-        model : nn.Module, optional
-            Model to save if validation loss improves (default is None).
- - Returns - ------- - bool - True if training should be stopped, False otherwise. - """ - if validation_loss < self.min_validation_loss: - self.min_validation_loss = validation_loss - self.counter = 0 - # if model is not None: - # # torch.save(model.state_dict(), 'conv_torch.pth') - # torch.save(model.state_dict(), 'trained_models/weights_conv') - elif validation_loss > (self.min_validation_loss + self.min_delta): - self.counter += 1 - if self.counter >= self.patience: - return True - return False From 773b32e148830356cc1fabb4edbbd6ff49beecd7 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 16 Sep 2024 13:12:49 +0100 Subject: [PATCH 33/35] Moved Early Stopper class from Model --- newCAM_emulation/Model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py index 2f1da62..9d2c44d 100644 --- a/newCAM_emulation/Model.py +++ b/newCAM_emulation/Model.py @@ -56,5 +56,3 @@ def forward(self, X): Output tensor. """ return self.linear_stack(X) - - From 1a07f0915c9778d0b61ee5400b38f69b8095e976 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 16 Sep 2024 16:30:58 +0100 Subject: [PATCH 34/35] Added pre-commit-config file --- .pre-commit-config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..a3f1f16 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.1.0 + hooks: + - id: ruff + args: [--fix] # This will auto-fix issues if possible \ No newline at end of file From aaa1d2c978379e979b0bd6151fdc9185f2255636 Mon Sep 17 00:00:00 2001 From: Surbhi Goel Date: Mon, 16 Sep 2024 16:33:59 +0100 Subject: [PATCH 35/35] Added pre-commit-config file --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a3f1f16..086021b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,5 @@ repos: rev: v0.1.0 hooks: - id: ruff - args: [--fix] # This will auto-fix issues if possible \ No newline at end of file + args: [--fix] # This will auto-fix issues if possible + \ No newline at end of file
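
Two of the later patches are connected in a way that is easy to miss: PATCH 30 reorders the feature list so that the two output variables come last, and PATCH 31 then lets the data loader slice the targets off the end of that list instead of hard-coding their names. A minimal Python sketch of the resulting convention; the list contents are taken from the main.py diff above and the slice mirrors data_loader in loaddata.py:

    # Feature ordering after PATCH 30: inputs first, the two gravity wave
    # drag targets (UTGWSPEC, VTGWSPEC) last.
    features = [
        "U", "V", "T", "DSE", "NM", "NETDT", "Z3", "RHOI",
        "PS", "lat", "lon",
        "UTGWSPEC", "VTGWSPEC",
    ]

    # PATCH 31's replacement for the hard-coded target list: the last two
    # names are always treated as targets, so renaming or swapping the
    # outputs only requires editing the feature list in one place.
    target_var = features[-2:]  # == ["UTGWSPEC", "VTGWSPEC"]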
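PATCH 32 moves EarlyStopper out of Model.py; main.py already instantiates it and passes it to train_with_early_stopping together with the optimizer and loss. For reference, the pieces compose roughly as below. This is a hedged sketch built only from the signatures visible in the diffs above (train_loop, val_loop, EarlyStopper.early_stop) and the names defined in main.py; the actual body of train_with_early_stopping may differ:

    # Sketch of one possible epoch loop behind train_with_early_stopping.
    # train_loop/val_loop come from train.py; model, criterion, optimizer,
    # early_stopper and the dataloaders are as set up in main.py.
    for epoch in range(epochs):
        train_loop(train_dataloader, model, criterion, optimizer)
        val_loss = val_loop(val_dataloader, model, criterion)
        if early_stopper.early_stop(val_loss, model=model):
            print(f"Early stopping after epoch {epoch + 1}")
            break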
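Finally, PATCH 34 and PATCH 35 wire Ruff into pre-commit via .pre-commit-config.yaml. Once that file is in the repository, contributors enable the hook with the standard pre-commit workflow: install the tool (pip install pre-commit), register the git hook once per clone (pre-commit install), and optionally lint the whole tree with pre-commit run --all-files. The hook then enforces the same rule set referenced by the inline suppressions earlier in the series: PLR0913 is the pylint-derived "too many arguments" check, and PLR2004 flags comparisons against magic values.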