comet-ml · Lothiraldan · Nov 12, 2024 · Nov 4, 2024
diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml
@@ -35,6 +35,7 @@ jobs:
           - integrations/model-training/pytorch/notebooks/Histogram_Logging_Pytorch.ipynb
           - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb
           - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb
+          - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb
           - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb
           - integrations/model-training/tensorflow/notebooks/Comet_and_Tensorflow.ipynb
           - integrations/model-training/transformers/notebooks/Comet_with_Hugging_Face_Trainer.ipynb

diff --git a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"https://cdn.comet.ml/img/notebook_logo.png\">"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=ray_train&utm_medium=colab) is an MLOps platform that is designed to help data scientists and teams build better models faster! Comet provides tooling to track, explain, manage, and monitor your models in a single place! It works with Jupyter notebooks and scripts and, most importantly, it's 100% free to get started!\n",
+    "\n",
+    "[Ray Train](https://docs.ray.io/en/latest/train/train.html) abstracts away the complexity of setting up a distributed training system.\n",
+    "\n",
+    "Instrument your runs with Comet to start managing experiments, create dataset versions and track hyperparameters for faster and easier reproducibility and collaboration.\n",
+    "\n",
+    "[Find more information about our integration with Ray Train](https://www.comet.ml/docs/v2/integrations/ml-frameworks/ray/)\n",
+    "\n",
+    "Get a preview for what's to come. Check out a completed experiment created from this notebook [here](https://www.comet.com/examples/comet-example-ray-train-keras/99d169308c854be7ac222c995a2bfa26?experiment-tab=systemMetrics).\n",
+    "\n",
+    "This example is based on the [following Ray Train Lightning example](https://docs.ray.io/en/latest/train/getting-started-pytorch-lightning.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZYchV5RWwdv5"
+   },
+   "source": [
+    "# Install Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DJnmqphuY2eI"
+   },
+   "outputs": [],
+   "source": [
+    "%pip install \"comet_ml>=3.47.1\" \"ray[air]>=2.7.0\" \"lightning\" \"torchvision\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "crOcPHobwhGL"
+   },
+   "source": [
+    "# Initialize Comet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HNQRM0U3caiY"
+   },
+   "outputs": [],
+   "source": [
+    "import comet_ml\n",
+    "import comet_ml.integration.ray\n",
+    "\n",
+    "comet_ml.login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "cgqwGSwtzVWD"
+   },
+   "source": [
+    "# Import Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "e-5rRYaUw5AF"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import tempfile\n",
+    "\n",
+    "import torch\n",
+    "from torch.utils.data import DataLoader\n",
+    "from torchvision.models import resnet18\n",
+    "from torchvision.datasets import FashionMNIST\n",
+    "from torchvision.transforms import ToTensor, Normalize, Compose\n",
+    "import lightning.pytorch as pl\n",
+    "\n",
+    "import ray.train.lightning\n",
+    "from ray.train.torch import TorchTrainer\n",
+    "from ray.train import ScalingConfig, RunConfig"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prepare your model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model, Loss, Optimizer\n",
+    "class ImageClassifier(pl.LightningModule):\n",
+    "    \"\"\"ResNet-18 classifier wrapped as a LightningModule.\n",
+    "\n",
+    "    The stock first convolution is swapped for one that takes a single\n",
+    "    input channel, and the network is built with 10 output classes.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.model = resnet18(num_classes=10)\n",
+    "        # Replace the first layer so the model accepts 1-channel images\n",
+    "        self.model.conv1 = torch.nn.Conv2d(\n",
+    "            1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False\n",
+    "        )\n",
+    "        self.criterion = torch.nn.CrossEntropyLoss()\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Run the wrapped ResNet on a batch and return its outputs.\"\"\"\n",
+    "        return self.model(x)\n",
+    "\n",
+    "    def training_step(self, batch, batch_idx):\n",
+    "        \"\"\"Compute, log and return the cross-entropy loss for one batch.\"\"\"\n",
+    "        x, y = batch\n",
+    "        outputs = self.forward(x)\n",
+    "        loss = self.criterion(outputs, y)\n",
+    "        self.log(\"lightning_loss\", loss, on_step=True, prog_bar=True)\n",
+    "        return loss\n",
+    "\n",
+    "    def configure_optimizers(self):\n",
+    "        \"\"\"Return an Adam optimizer (lr=0.001) over the ResNet parameters.\"\"\"\n",
+    "        return torch.optim.Adam(self.model.parameters(), lr=0.001)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TJuThf1TxP_G"
+   },
+   "source": [
+    "# Define your distributed training function\n",
+    "\n",
+    "This function will be sent to, and executed on, each distributed worker."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_func(config):\n",
+    "    \"\"\"Training loop executed on every Ray Train worker.\n",
+    "\n",
+    "    Opens a Comet worker experiment from ``config``, downloads the\n",
+    "    FashionMNIST training split, and fits an ``ImageClassifier`` for\n",
+    "    ``config[\"epochs\"]`` epochs with a Lightning trainer set up for Ray.\n",
+    "    \"\"\"\n",
+    "    from comet_ml.integration.ray import comet_worker_logger\n",
+    "    from lightning.pytorch.loggers import CometLogger\n",
+    "\n",
+    "    with comet_worker_logger(config) as experiment:\n",
+    "        # Data: FashionMNIST training split stored under the temp directory\n",
+    "        dataset_root = os.path.join(tempfile.gettempdir(), \"data\")\n",
+    "        preprocessing = Compose([ToTensor(), Normalize((0.5,), (0.5,))])\n",
+    "        train_dataloader = DataLoader(\n",
+    "            FashionMNIST(\n",
+    "                root=dataset_root, train=True, download=True, transform=preprocessing\n",
+    "            ),\n",
+    "            batch_size=128,\n",
+    "            shuffle=True,\n",
+    "        )\n",
+    "\n",
+    "        # Model\n",
+    "        classifier = ImageClassifier()\n",
+    "\n",
+    "        # Lightning logger bound to the experiment opened for this worker\n",
+    "        comet_logger = CometLogger()\n",
+    "\n",
+    "        # Temporary workaround, can be removed once\n",
+    "        # https://github.com/Lightning-AI/pytorch-lightning/pull/20275 has\n",
+    "        # been merged and released\n",
+    "        comet_logger._experiment = experiment\n",
+    "\n",
+    "        # [1] Configure PyTorch Lightning Trainer.\n",
+    "        lightning_trainer = pl.Trainer(\n",
+    "            max_epochs=config[\"epochs\"],\n",
+    "            devices=\"auto\",\n",
+    "            accelerator=\"auto\",\n",
+    "            strategy=ray.train.lightning.RayDDPStrategy(),\n",
+    "            plugins=[ray.train.lightning.RayLightningEnvironment()],\n",
+    "            callbacks=[ray.train.lightning.RayTrainReportCallback()],\n",
+    "            logger=comet_logger,\n",
+    "            # [1a] Optionally, disable the default checkpointing behavior\n",
+    "            # in favor of the `RayTrainReportCallback` above.\n",
+    "            enable_checkpointing=False,\n",
+    "            log_every_n_steps=2,\n",
+    "        )\n",
+    "        ray.train.lightning.prepare_trainer(lightning_trainer).fit(\n",
+    "            classifier, train_dataloaders=train_dataloader\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Define the function that schedules the distributed job"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train(num_workers: int = 2, use_gpu: bool = False, epochs: int = 1):\n",
+    "    \"\"\"Schedule the distributed training job and wait for it to finish.\n",
+    "\n",
+    "    Args:\n",
+    "        num_workers: number of Ray Train workers to request.\n",
+    "        use_gpu: whether the workers should request a GPU.\n",
+    "        epochs: value passed to the workers as ``config[\"epochs\"]``.\n",
+    "\n",
+    "    Returns:\n",
+    "        The object returned by ``TorchTrainer.fit()``.\n",
+    "    \"\"\"\n",
+    "    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)\n",
+    "    config = {\"use_gpu\": use_gpu, \"epochs\": epochs}\n",
+    "\n",
+    "    # The same config is given to the Comet callback and to every worker\n",
+    "    callback = comet_ml.integration.ray.CometTrainLoggerCallback(\n",
+    "        config, project_name=\"comet-example-ray-train-pytorch-lightning\"\n",
+    "    )\n",
+    "\n",
+    "    ray_trainer = TorchTrainer(\n",
+    "        train_func,\n",
+    "        scaling_config=scaling_config,\n",
+    "        train_loop_config=config,\n",
+    "        run_config=RunConfig(callbacks=[callback]),\n",
+    "    )\n",
+    "    # Hand the result back so callers can inspect the run\n",
+    "    return ray_trainer.fit()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train the model\n",
+    "\n",
+    "Ray will wait indefinitely if we request more workers (`num_workers`) than the available resources allow. The code below ensures we never request more CPUs than are available locally."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ideal_num_workers = 2\n",
+    "\n",
+    "# os.cpu_count() returns None when the CPU count cannot be determined;\n",
+    "# fall back to a single CPU in that case instead of raising a TypeError.\n",
+    "local_cpu_count = os.cpu_count() or 1\n",
+    "\n",
+    "# Leave one CPU unrequested (presumably for the Ray trainer/driver process)\n",
+    "available_local_cpu_count = local_cpu_count - 1\n",
+    "\n",
+    "# Never request more workers than spare CPUs, and never fewer than one\n",
+    "num_workers = max(1, min(ideal_num_workers, available_local_cpu_count))\n",
+    "\n",
+    "train(num_workers, use_gpu=False, epochs=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comet_ml.end()"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}