From e72be0487f173a0ef562873603cb8500a79ab942 Mon Sep 17 00:00:00 2001
From: Abhijith Gandrakota
Date: Wed, 19 Jul 2023 11:01:25 -0400
Subject: [PATCH] Part 2 Notebooks

---
 part2/2.JetRegressionMLP.ipynb | 356 ++++++++++++++++++++++++++++
 part2/2.JetTaggingMLP.ipynb    | 434 ++++++++++++++++++++++++++++++++++++
 2 files changed, 790 insertions(+)
 create mode 100644 part2/2.JetRegressionMLP.ipynb
 create mode 100644 part2/2.JetTaggingMLP.ipynb

diff --git a/part2/2.JetRegressionMLP.ipynb b/part2/2.JetRegressionMLP.ipynb
new file mode 100644
index 0000000..a828df2
--- /dev/null
+++ b/part2/2.JetRegressionMLP.ipynb
@@ -0,0 +1,356 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Rf258xT0XIwV"
+   },
+   "source": [
+    "# Training a Jet Regression model with a **DNN** \n",
+    "\n",
+    "---\n",
+    "In this notebook, we perform a jet regression task using a Dense Neural Network (DNN), \n",
+    "also called a multi-layer perceptron (MLP). The problem consists of regressing the jet \n",
+    "substructure variable $\\tau_{32}$, given $\\tau_3$ and $\\tau_2$.\n",
+    "\n",
+    "For details on the physics problem, see https://arxiv.org/pdf/1804.06913.pdf \n",
+    "\n",
+    "For details on the dataset, see Notebook1\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "4OMAZgtyXIwY"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import h5py\n",
+    "import glob\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2lbB-J3hXIwb"
+   },
+   "source": [
+    "# Preparation of the training and validation samples\n",
+    "\n",
+    "---\n",
+    "In order to import the dataset, we now\n",
+    "- download and unpack the dataset archive (to import the data in Colab)\n",
+    "- load the h5 files in the data/ repository\n",
+    "- extract the data we need: a target and the jet features \n",
+    "\n",
+    "To type shell commands, we start the command line with !\n",
+    "\n",
+    "**nb, if you are running locally and you have already downloaded the datasets you can skip the cell below and, if needed, change the paths later to point to the folder with your previous download of the datasets.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "jWjxFaRPXIwb"
+   },
+   "outputs": [],
+   "source": [
+    "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
+    "! tar -xvzf Data-MLtutorial.tar.gz \n",
+    "! ls Data-MLtutorial/JetDataset/\n",
+    "! rm Data-MLtutorial.tar.gz "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "cCGhrKdwXIwc"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\n",
+      "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\n",
+      "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\n",
+      "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\n",
+      "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\n",
+      "(50000,) (50000, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "target = np.array([])\n",
+    "features = np.array([])\n",
+    "ptype = np.array([])\n",
+    "# we cannot load all data on Colab. So we just take a few files\n",
+    "datafiles = ['Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5',\n",
+    "             'Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5',\n",
+    "             'Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5',\n",
+    "             'Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5',\n",
+    "             'Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5']\n",
+    "# if you are running locally, you can use the full dataset doing\n",
+    "# for fileIN in glob.glob(\"tutorials/HiggsSchool/data/*h5\"):\n",
+    "for fileIN in datafiles:\n",
+    "    print(\"Appending %s\" %fileIN)\n",
+    "    f = h5py.File(fileIN)\n",
+    "    # columns 5 and 6 are tau2 and tau3, column 10 is tau32 (cf. 'jetFeatureNames');\n",
+    "    # the one-hot jet-type labels are stored in the last columns\n",
+    "    myFeatures = np.array(f.get(\"jets\")[:,[5,6]])\n",
+    "    myptype = np.array(f.get('jets')[0:,-6:-1])\n",
+    "    mytarget = np.array(f.get('jets')[0:,10])\n",
+    "    features = np.concatenate([features, myFeatures], axis=0) if features.size else myFeatures\n",
+    "    target = np.concatenate([target, mytarget], axis=0) if target.size else mytarget\n",
+    "    ptype = np.concatenate([ptype, myptype], axis=0) if ptype.size else myptype\n",
+    "    f.close()\n",
+    "print(target.shape, features.shape)"
+   ]
+  },
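+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Optional sanity check (added aside):* the feature names suggest that the target $\\tau_{32}$ is the N-subjettiness ratio $\\tau_3/\\tau_2$. If that is the case, the ratio of the two input features should already track the target closely, which tells us how hard this regression problem really is. This assumes the column assignment used in the loading cell above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional sanity check: compare tau3/tau2 with the stored tau32 target\n",
+    "# (assumes features[:,0] = tau2 and features[:,1] = tau3, as loaded above)\n",
+    "ratio = features[:,1] / np.clip(features[:,0], 1e-6, None)  # protect against division by zero\n",
+    "plt.hist(ratio - target, bins=100, range=(-0.5, 0.5), histtype='step')\n",
+    "plt.xlabel(r'$\\tau_3/\\tau_2 - \\tau_{32}$')\n",
+    "plt.ylabel('Number of jets')\n",
+    "plt.show()"
+   ]
+  },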
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "6a333RYPXIwe"
+   },
+   "source": [
+    "The dataset consists of 50000 jets, each represented here by 2 features ($\\tau_2$ and $\\tau_3$) and the regression target $\\tau_{32}$\n",
+    "\n",
+    "---\n",
+    "\n",
+    "We now keep only one jet class (the last one-hot label, i.e. top jets), shuffle the data, and split them into a training and a test dataset with an 80%/20% split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZBqFs1eBXIwf"
+   },
+   "outputs": [],
+   "source": [
+    "features = features[ptype[:,-1]>0]\n",
+    "target = target[ptype[:,-1]>0]\n",
+    "ptype = ptype[ptype[:,-1]>0]\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n",
+    "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n",
+    "del features, target"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GkNz5UAhXIwg"
+   },
+   "source": [
+    "# DNN model building"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tTSDOiEHXIwh"
+   },
+   "outputs": [],
+   "source": [
+    "# keras imports\n",
+    "from tensorflow.keras.models import Model\n",
+    "from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Activation\n",
+    "from tensorflow.keras.utils import plot_model\n",
+    "from tensorflow.keras import backend as K\n",
+    "from tensorflow.keras import metrics\n",
+    "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "rAl0DZTxXIwi"
+   },
+   "outputs": [],
+   "source": [
+    "input_shape = X_train.shape[1]\n",
+    "dropoutRate = 0.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2l492G8BXIwj"
+   },
+   "outputs": [],
+   "source": [
+    "####\n",
+    "inputArray = Input(shape=(input_shape,))\n",
+    "#\n",
+    "x = Dense(40, activation='relu')(inputArray)\n",
+    "x = Dropout(dropoutRate)(x)\n",
+    "#\n",
+    "x = Dense(20)(x)\n",
+    "x = Activation('relu')(x)\n",
+    "x = Dropout(dropoutRate)(x)\n",
+    "#\n",
+    "x = Dense(10, activation='relu')(x)\n",
+    "x = Dropout(dropoutRate)(x)\n",
+    "#\n",
+    "x = Dense(5, activation='relu')(x)\n",
+    "#\n",
+    "output = Dense(1)(x)\n",
+    "####\n",
+    "model = Model(inputs=inputArray, outputs=output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "xu8rRUkhXIwj"
+   },
+   "outputs": [],
+   "source": [
+    "model.compile(loss='mse', optimizer='adam')\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2HfKWoOtXIwk"
+   },
+   "source": [
+    "We now train the model with these settings:\n",
+    "\n",
+    "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n",
+    "    - batch size = 1 results in fast computation but noisy training that is slow to converge\n",
+    "    - batch size = dataset size results in slow computation but faster convergence\n",
+    "\n",
+    "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini-batches and model internal parameters are updated.\n",
+    "\n",
+    "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n",
+    "    - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+    "    - *ReduceLROnPlateau*: reduce the learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+    "    - *TerminateOnNaN*: terminates training when a NaN loss is encountered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "KzO-lyLEXIwk"
+   },
+   "outputs": [],
+   "source": [
+    "batch_size = 128\n",
+    "n_epochs = 100\n",
+    "\n",
+    "# train \n",
+    "history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose = 2,\n",
+    "                    validation_split=0.2,\n",
+    "                    callbacks = [EarlyStopping(monitor='val_loss', patience=10, verbose=1),\n",
+    "                                 ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),\n",
+    "                                 TerminateOnNaN()]\n",
+    "                    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "044bCLqVXIwl"
+   },
+   "outputs": [],
+   "source": [
+    "# plot training history\n",
+    "plt.plot(history.history['loss'])\n",
+    "plt.plot(history.history['val_loss'])\n",
+    "plt.yscale('log')\n",
+    "plt.title('Training History')\n",
+    "plt.ylabel('loss')\n",
+    "plt.xlabel('epoch')\n",
+    "plt.legend(['training', 'validation'], loc='upper right')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predict = model.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# scatter plot of predicted vs true values\n",
+    "plt.scatter(y_test, predict.flatten(), s=0.1)\n",
+    "plt.xlabel('true')\n",
+    "plt.ylabel('predicted')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(y_test, bins=100, range=(0,1), histtype='step', label='true')\n",
+    "plt.hist(predict.flatten(), bins=100, range=(0,1), histtype='step', label='predict')\n",
+    "plt.legend()"
+   ]
+  },
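+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Optional aside:* besides the visual comparison above, it can help to quantify the regression performance with a couple of standard metrics. A minimal sketch using scikit-learn (already used above for the train/test split):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# simple quantitative summary of the regression performance on the test set\n",
+    "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
+    "print('MSE : %.5f' % mean_squared_error(y_test, predict.flatten()))\n",
+    "print('MAE : %.5f' % mean_absolute_error(y_test, predict.flatten()))"
+   ]
+  },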
"model.compile(loss='mse', optimizer='adam')\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2HfKWoOtXIwk" + }, + "source": [ + "We now train the model with these settings:\n", + "\n", + "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n", + " - batch size = 1 results in fast computation but noisy training that is slow to converge\n", + " - batch size = dataset size results in slow computation but faster convergence)\n", + "\n", + "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini batches and model internal parameters are updated.\n", + "\n", + "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n", + " - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n", + " - *ReduceLROnPlateau*: reduce learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n", + " - *TerminateOnNaN*: terminates training when a NaN loss is encountered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KzO-lyLEXIwk" + }, + "outputs": [], + "source": [ + "batch_size = 128\n", + "n_epochs = 100\n", + "\n", + "# train \n", + "history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose = 2,\n", + " validation_split=0.2,\n", + " callbacks = [EarlyStopping(monitor='val_loss', patience=10, verbose=1),\n", + " ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),\n", + " TerminateOnNaN()]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "044bCLqVXIwl" + }, + "outputs": [], + "source": [ + "# plot training history\n", + "plt.plot(history.history['loss'])\n", + "plt.plot(history.history['val_loss'])\n", + "plt.yscale('log')\n", + "plt.title('Training History')\n", + "plt.ylabel('loss')\n", + "plt.xlabel('epoch')\n", + "plt.legend(['training', 'validation'], loc='upper right')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predict = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.scatter(y_test,predict.flatten(),s=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(y_test,bins=100,range=(0,1),histtype='step',label='true')\n", + "plt.hist(predict.flatten(),bins=100,range=(0,1),histtype='step',label='predict')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oESSmNLxXIwm" + }, + "source": [ + "# Plot performce with 2D histograms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lzbQ-d0RKVmV" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "name": "Notebook2_JetID_DNN.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/part2/2.JetTaggingMLP.ipynb b/part2/2.JetTaggingMLP.ipynb new file mode 100644 index 0000000..66f68df --- /dev/null +++ b/part2/2.JetTaggingMLP.ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Rf258xT0XIwV" + }, + "source": [ + "# Training a Jet Tagging with **DNN** \n", + "\n", + "---\n", + "In this notebook, we perform a Jet identification task using a multiclass classifier based on a \n", + "Dense Neural Network (DNN), also called multi-layer perceptron (MLP). The problem consists on identifying a given jet as a quark, a gluon, a W, a Z, or a top,\n", + "based on set of physics-motivated high-level features.\n", + "\n", + "For details on the physics problem, see https://arxiv.org/pdf/1804.06913.pdf \n", + "\n", + "For details on the dataset, see Notebook1\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "id": "4OMAZgtyXIwY" + }, + "outputs": [], + "source": [ + "import os\n", + "import h5py\n", + "import glob\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2lbB-J3hXIwb" + }, + "source": [ + "# Preparation of the training and validation samples\n", + "\n", + "---\n", + "In order to import the dataset, we now\n", + "- clone the dataset repository (to import the data in Colab)\n", + "- load the h5 files in the data/ repository\n", + "- extract the data we need: a target and jet features\n", + "\n", + "To type shell commands, we start the command line with !\n", + "\n", + "**nb, if you are running locally and you have already downloaded the datasets you can skip the cell below and, if needed, change the paths later to point to the folder with your previous download of the datasets.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jWjxFaRPXIwb" + }, + "outputs": [], + "source": [ + "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n", + "! tar -xvzf Data-MLtutorial.tar.gz \n", + "! ls Data-MLtutorial/JetDataset/\n", + "! rm Data-MLtutorial.tar.gz " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's open the file\n", + "data_dir = 'Data-MLtutorial/JetDataset/'\n", + "fileIN = data_dir+'jetImage_7_100p_30000_40000.h5'\n", + "f = h5py.File(fileIN)\n", + "# and see what it contains\n", + "print(list(f.keys()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* 'jetImage' ,' jetImageECAL' and 'jetImageHCAL' contains the image representation of the jets . We will not use them today but build our point cloud from the other information.\n", + "* 'jetConstituentList' is the list of particles cointained in the jet. For each particle, a list of relevant quantities is stored. This is the dataset we will consider in this notebook.\n", + "* 'particleFeatureNames' is the list of the names corresponding to the quantities contained in 'jetConstituentList'\n", + "* 'jets' is the list of jets with the high-level jet features stored. We will only use jet ID from it, indecies [-6:-1]\n", + "* 'jetFeatureNames' is the list of the names corresponding to the quantities contained in 'jets'. 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# data exploration: compare one feature across the five jet classes\n",
+    "fig = plt.figure(figsize=(10, 10))\n",
+    "for i, label in enumerate(labels):\n",
+    "    plt.hist(AllFeatures[target[:,i]==1][:,30], bins=50, range=(0,0.5), histtype='step', label=label)\n",
+    "plt.legend()\n",
+    "plt.xlabel(features_list[30])\n",
+    "plt.ylabel('Number of jets')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# feature selection: keep 16 of the high-level features as network inputs\n",
+    "features = AllFeatures[:,[8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 48]]"
+   ]
+  },
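+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*Optional aside:* the selected features span rather different numerical ranges, and MLPs usually train more easily on standardized inputs. The tutorial proceeds with the raw features, but a minimal sketch of how you could standardize them (and then pass `features_scaled` to the split below instead) is:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional: standardize each feature to zero mean and unit variance\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "scaler = StandardScaler()\n",
+    "features_scaled = scaler.fit_transform(features)\n",
+    "print(features_scaled.mean(axis=0).round(3))\n",
+    "print(features_scaled.std(axis=0).round(3))"
+   ]
+  },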
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "6a333RYPXIwe"
+   },
+   "source": [
+    "The dataset consists of 50000 jets, each represented by 16 features\n",
+    "\n",
+    "---\n",
+    "\n",
+    "We now shuffle the data, splitting them into a training and a validation dataset with a 2:1 ratio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZBqFs1eBXIwf"
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.33)\n",
+    "print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)\n",
+    "del features, target"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GkNz5UAhXIwg"
+   },
+   "source": [
+    "# DNN model building"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tTSDOiEHXIwh"
+   },
+   "outputs": [],
+   "source": [
+    "# keras imports\n",
+    "from tensorflow.keras.models import Model\n",
+    "from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Activation\n",
+    "from tensorflow.keras.utils import plot_model\n",
+    "from tensorflow.keras import backend as K\n",
+    "from tensorflow.keras import metrics\n",
+    "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "rAl0DZTxXIwi"
+   },
+   "outputs": [],
+   "source": [
+    "input_shape = X_train.shape[1]\n",
+    "dropoutRate = 0.1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2l492G8BXIwj"
+   },
+   "outputs": [],
+   "source": [
+    "####\n",
+    "inputArray = Input(shape=(input_shape,))\n",
+    "#\n",
+    "x = Dense(40, activation='relu')(inputArray)\n",
+    "x = Dropout(dropoutRate)(x)\n",
+    "#\n",
+    "x = Dense(20)(x)\n",
+    "x = Activation('relu')(x)\n",
+    "x = Dropout(dropoutRate)(x)\n",
+    "#\n",
+    "x = Dense(10, activation='relu')(x)\n",
+    "x = Dropout(dropoutRate)(x)\n",
+    "#\n",
+    "x = Dense(5, activation='relu')(x)\n",
+    "#\n",
+    "output = Dense(5, activation='softmax')(x)\n",
+    "####\n",
+    "model = Model(inputs=inputArray, outputs=output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "xu8rRUkhXIwj"
+   },
+   "outputs": [],
+   "source": [
+    "model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2HfKWoOtXIwk"
+   },
+   "source": [
+    "We now train the model with these settings:\n",
+    "\n",
+    "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n",
+    "    - batch size = 1 results in fast computation but noisy training that is slow to converge\n",
+    "    - batch size = dataset size results in slow computation but faster convergence\n",
+    "\n",
+    "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini-batches and model internal parameters are updated.\n",
+    "\n",
+    "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n",
+    "    - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+    "    - *ReduceLROnPlateau*: reduce the learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+    "    - *TerminateOnNaN*: terminates training when a NaN loss is encountered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "KzO-lyLEXIwk"
+   },
+   "outputs": [],
+   "source": [
+    "batch_size = 1024\n",
+    "n_epochs = 50\n",
+    "\n",
+    "# train \n",
+    "# note: the learning rate is a property of the optimizer (e.g. Adam(learning_rate=...)),\n",
+    "# it is not an argument of model.fit\n",
+    "history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose = 2,\n",
+    "                    validation_data=(X_val, y_val),\n",
+    "                    # callbacks = [\n",
+    "                    #     EarlyStopping(monitor='val_loss', patience=10, verbose=1),\n",
+    "                    #     ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),\n",
+    "                    #     TerminateOnNaN()]\n",
+    "                    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "044bCLqVXIwl"
+   },
+   "outputs": [],
+   "source": [
+    "# plot training history\n",
+    "plt.plot(history.history['loss'])\n",
+    "plt.plot(history.history['val_loss'])\n",
+    "plt.yscale('log')\n",
+    "plt.title('Training History')\n",
+    "plt.ylabel('loss')\n",
+    "plt.xlabel('epoch')\n",
+    "plt.legend(['training', 'validation'], loc='upper right')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oESSmNLxXIwm"
+   },
+   "source": [
+    "# Building the ROC Curves"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "gjKT7EjUXIwn"
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.metrics import roc_curve, auc\n",
+    "predict_val = model.predict(X_val)\n",
+    "df = pd.DataFrame()\n",
+    "fpr = {}\n",
+    "tpr = {}\n",
+    "auc1 = {}\n",
+    "\n",
+    "plt.figure()\n",
+    "for i, label in enumerate(labels):\n",
+    "    df[label] = y_val[:,i]\n",
+    "    df[label + '_pred'] = predict_val[:,i]\n",
+    "\n",
+    "    fpr[label], tpr[label], threshold = roc_curve(df[label], df[label+'_pred'])\n",
+    "\n",
+    "    auc1[label] = auc(fpr[label], tpr[label])\n",
+    "\n",
+    "    plt.plot(tpr[label], fpr[label], label='%s tagger, auc = %.1f%%'%(label, auc1[label]*100.))\n",
+    "plt.semilogy()\n",
+    "plt.xlabel(\"sig. efficiency\")\n",
+    "plt.ylabel(\"bkg. mistag rate\")\n",
+    "plt.ylim(0.000001, 1)\n",
+    "plt.grid(True)\n",
+    "plt.legend(loc='lower right')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How do you build a QCD vs Top / W / Z ROC curve?"
+   ]
+  },
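+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One possible approach (a sketch, not the only answer -- try your own version in the empty cell below): merge the quark and gluon outputs into a single 'QCD' score, keep only the jets that are truly QCD or truly the signal class of interest, and build a binary ROC curve for each signal hypothesis against QCD."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sketch of a QCD (quark+gluon) vs W / Z / top ROC curve\n",
+    "qcd_score = predict_val[:,0] + predict_val[:,1]  # gluon + quark outputs\n",
+    "qcd_true = (y_val[:,0] + y_val[:,1]) > 0         # true QCD jets\n",
+    "plt.figure()\n",
+    "for i, label in enumerate(['W', 'Z', 'top']):\n",
+    "    idx = i + 2                                  # column of this class in y_val / predict_val\n",
+    "    mask = qcd_true | (y_val[:,idx] > 0)         # keep only QCD jets and this signal\n",
+    "    disc = predict_val[mask,idx] / (predict_val[mask,idx] + qcd_score[mask])\n",
+    "    fpr_b, tpr_b, _ = roc_curve(y_val[mask,idx], disc)\n",
+    "    plt.plot(tpr_b, fpr_b, label='%s vs QCD, auc = %.1f%%' % (label, auc(fpr_b, tpr_b)*100.))\n",
+    "plt.semilogy()\n",
+    "plt.xlabel('sig. efficiency')\n",
+    "plt.ylabel('QCD mistag rate')\n",
+    "plt.ylim(0.000001, 1)\n",
+    "plt.grid(True)\n",
+    "plt.legend(loc='lower right')\n",
+    "plt.show()"
+   ]
+  },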
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "lzbQ-d0RKVmV"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "Notebook2_JetID_DNN.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}