From d318e0b81255a319f1302073c1ebeb5488074fda Mon Sep 17 00:00:00 2001 From: Francesco Paissan <46992226+fpaissan@users.noreply.github.com> Date: Thu, 12 Oct 2023 12:04:17 +0200 Subject: [PATCH] Version 0.1.0 (#41) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Extended unit tests to classifier and fixed pooling (#17) * Extended unit tests to classifier and fixed pooling * Changed trigger of doctest workflow * Fixing issue #18 * fixed linters * Add pre-commit hooks * Doctest only on PRs * Fixed network conversion from GPU Also tested on Windows machine. * Create python_versions.yml * Update and rename python_versions.yml to tests.yml * Update export.yaml * CI fix (#21) * Create pre-commit.yaml * remove code.yaml * fixing pre-commit * Doctest with pytest * change trigger * change trigger * Delete LICENSE * checkpoint from filesystem (#20) * checkpoint from filesystem * fixed deps * Update README.md * Update LICENSE * Updating LICENSE --------- Co-authored-by: fpaissan Co-authored-by: Francesco Paissan <46992226+fpaissan@users.noreply.github.com> * Create LICENSE (#22) * Update README.md (#23) * new min python version to 3.8 * 🐛 extra_requirements now have a version - fixed CI (#24) * 🐛 extra_requirements now have a version * fixed linter errors * testing actions * fixed linter * removing tf_probability * fixed tf prob version --------- Co-authored-by: fpaissan * Documentation upgrade - guide for contribution (#25) * add contribution guide to docs * documentation with contribution guide * cosmetic * bump version 0.0.4 -> 0.0.5 * Bump requests from 2.28.2 to 2.31.0 (#27) Bumps [requests](https://github.com/psf/requests) from 2.28.2 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.28.2...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * fix pypi release * Update README.md (#29) * Patch for faster GPU inference (#35) * Patch for faster GPU inference * remove unused zeropad def --------- Co-authored-by: fpaissan * initial commit * add eval loop * add acceleration * modules as dict * add checkpointer * minor * load best checkpoint * restore epoch, optimizer, lr sched * fix logging on multi-gpu * minor fixes * working on single gpu * fix checkpointer + multi-gpu * fp16 might not be ok yet * load_modules and unwrap_model * fixed convert and export * cosmetic on export * add argparse * add metrics -- check something is off with acc * its print strange * fixed checkpointer viz * fix checkpointers and metrics * cosmetic * linters * add credits * fix requirements * fix unittest * remove recipes * remove unused files * remove unused fuctions from networks * fix tests * hot fix * onnx conversion without convert * fix requirements * add default class config and temp folder for debug mode * add doc for class Metric * finish doc MicroMind * update docs * linters fix * new initial page * bump version 0.0.5 -> 0.1.0 * final touches and bumpver --------- Signed-off-by: dependabot[bot] Co-authored-by: Matteo Beltrami <71525176+matteobeltrami@users.noreply.github.com> Co-authored-by: SebastianCavada Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matteo Tremonti <102596472+Tremo8@users.noreply.github.com> Co-authored-by: Matteo Beltrami --- README.md | 8 + docs/source/index.rst | 68 + docs/source/micromind.conversion.rst | 21 - docs/source/micromind.networks.rst | 8 - docs/source/micromind.rst | 27 +- docs/source/micromind.utils.rst | 20 +- examples/mind.py | 67 + micromind/__init__.py | 6 +- micromind/conversion/__init__.py | 1 - micromind/{conversion => }/convert.py | 124 +- micromind/core.py | 561 ++++++ micromind/networks/phinet.py | 352 +--- micromind/utils/checkpointer.py | 99 ++ micromind/utils/configlib.py | 46 - micromind/utils/helpers.py | 43 + micromind/utils/parse.py | 28 + pyproject.toml | 5 +- recipes/image_classification/README.md | 61 - .../image_classification/classification.py | 1551 ----------------- .../image_classification/distributed_train.sh | 4 - .../extra_requirements.txt | 1 - .../image_classification/launch_training.sh | 22 - tests/test_networks.py | 12 +- 23 files changed, 1097 insertions(+), 2038 deletions(-) delete mode 100644 docs/source/micromind.conversion.rst create mode 100644 examples/mind.py delete mode 100644 micromind/conversion/__init__.py rename micromind/{conversion => }/convert.py (50%) create mode 100644 micromind/core.py create mode 100644 micromind/utils/checkpointer.py delete mode 100644 micromind/utils/configlib.py create mode 100644 micromind/utils/helpers.py create mode 100644 micromind/utils/parse.py delete mode 100644 recipes/image_classification/README.md delete mode 100644 recipes/image_classification/classification.py delete mode 100755 recipes/image_classification/distributed_train.sh delete mode 100644 recipes/image_classification/extra_requirements.txt delete mode 100755 recipes/image_classification/launch_training.sh diff --git a/README.md b/README.md index b3e0fd2..fd39001 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,14 @@ for the basic install. 
To install `micromind` with the full exportability features, run
```
pip install -e .[conversion]
```
+### Training networks
+
+After the installation, get started by looking at the examples and the docs!
+
+### Export your model and run it on your MCU
+Check out [this](https://docs.google.com/document/d/1zt5urvNtI9VSJcoJdIeo10YrdH-tZNcS4JHbT1z5udI/edit?usp=sharing)
+tutorial and have fun deploying your network on an MCU!
+
 ---------------------------------------------------------------------------------------------------------

 ## 📧 Contact
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9f114f7..742fcc8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -6,6 +6,74 @@
 Welcome to micromind's documentation!
 =====================================

+.. image:: https://img.shields.io/badge/python-3.9%20|%203.10-blue
+   :target: https://www.python.org/downloads/
+
+.. image:: https://img.shields.io/badge/License-Apache_2.0-blue.svg
+   :target: https://github.com/fpaissan/micromind/blob/main/LICENSE
+
+.. image:: https://img.shields.io/pypi/v/micromind
+
+This is the official repository of `micromind`, a toolkit that aims to bridge two communities: artificial intelligence and embedded systems. `micromind` is based on `PyTorch <https://pytorch.org/>`_ and provides exportability for the supported models in ONNX, Intel OpenVINO, and TFLite.
+
+Key Features
+------------
+
+- Smooth flow from research to deployment;
+- Support for multimedia analytics recipes (image classification, sound event detection, etc.);
+- Detailed API documentation;
+- Tutorials for embedded deployment.
+
+Installation
+------------
+
+Using Pip
+~~~~~~~~~
+
+First of all, install `Python 3.8 or later <https://www.python.org/downloads/>`_. Open a terminal and run:
+
+.. code:: shell
+
+    pip install micromind
+
+for the basic install. To install `micromind` with the full exportability features, run
+
+.. code:: shell
+
+    pip install micromind[conversion]
+
+
+Basic how-to
+------------
+
+To launch a simple training on an image classification model, you only need to define a class that extends `MicroMind `_: declare the modules you want to use (such as a `PhiNet`), the forward method of the model, and the way your loss function is computed. micromind takes care of the rest for you.
+
+.. code-block:: python
+
+    class ImageClassification(MicroMind):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+            self.modules["classifier"] = PhiNet(
+                (3, 32, 32), include_top=True, num_classes=10
+            )
+
+        def forward(self, batch):
+            return self.modules["classifier"](batch[0])
+
+        def compute_loss(self, pred, batch):
+            return nn.CrossEntropyLoss()(pred, batch[1])
+
+Afterwards, you can export the model to the format you like best among **ONNX**, **TFLite**, and **OpenVINO**; just run this simple code:
+
+.. code-block:: python
+
+    m = ImageClassification()
+    m.export("output_onnx", "onnx", (3, 32, 32))
+
+
+Here is the link to the Python `file `_ inside our repository that illustrates how to use the MicroMind class.
+
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
diff --git a/docs/source/micromind.conversion.rst b/docs/source/micromind.conversion.rst
deleted file mode 100644
index 30195cc..0000000
--- a/docs/source/micromind.conversion.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-micromind.conversion package
-============================
-
-Submodules
-----------
-
-micromind.conversion.convert module
------------------------------------
-
-..
automodule:: micromind.conversion.convert - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: micromind.conversion - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/micromind.networks.rst b/docs/source/micromind.networks.rst index f9eb3b6..327a2b4 100644 --- a/docs/source/micromind.networks.rst +++ b/docs/source/micromind.networks.rst @@ -11,11 +11,3 @@ micromind.networks.phinet module :members: :undoc-members: :show-inheritance: - -Module contents ---------------- - -.. automodule:: micromind.networks - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/micromind.rst b/docs/source/micromind.rst index 9825874..d85c6b5 100644 --- a/docs/source/micromind.rst +++ b/docs/source/micromind.rst @@ -1,20 +1,27 @@ micromind package ================= +micromind.core module +--------------------- + +.. automodule:: micromind.core + :members: + :undoc-members: + :show-inheritance: + +micromind.convert module +------------------------ + +.. automodule:: micromind.convert + :members: + :undoc-members: + :show-inheritance: + Subpackages ----------- .. toctree:: - :maxdepth: 4 + :maxdepth: 2 - micromind.conversion micromind.networks micromind.utils - -Module contents ---------------- - -.. automodule:: micromind - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/micromind.utils.rst b/docs/source/micromind.utils.rst index 0692030..fd2a7b7 100644 --- a/docs/source/micromind.utils.rst +++ b/docs/source/micromind.utils.rst @@ -4,18 +4,26 @@ micromind.utils package Submodules ---------- -micromind.utils.configlib module --------------------------------- +micromind.utils.checkpointer module +----------------------------------- -.. automodule:: micromind.utils.configlib +.. automodule:: micromind.utils.checkpointer :members: :undoc-members: :show-inheritance: -Module contents ---------------- +micromind.utils.helpers module +------------------------------ -.. automodule:: micromind.utils +.. automodule:: micromind.utils.helpers + :members: + :undoc-members: + :show-inheritance: + +micromind.utils.parse module +---------------------------- + +.. 
automodule:: micromind.utils.parse :members: :undoc-members: :show-inheritance: diff --git a/examples/mind.py b/examples/mind.py new file mode 100644 index 0000000..958d612 --- /dev/null +++ b/examples/mind.py @@ -0,0 +1,67 @@ +from micromind import MicroMind, Metric +from micromind.networks import PhiNet +from micromind.utils.parse import parse_arguments + +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + +batch_size = 128 + + +class ImageClassification(MicroMind): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.modules["classifier"] = PhiNet( + (3, 32, 32), include_top=True, num_classes=10 + ) + + def forward(self, batch): + return self.modules["classifier"](batch[0]) + + def compute_loss(self, pred, batch): + return nn.CrossEntropyLoss()(pred, batch[1]) + + +if __name__ == "__main__": + hparams = parse_arguments() + m = ImageClassification(hparams) + + def compute_accuracy(pred, batch): + tmp = (pred.argmax(1) == batch[1]).float() + return tmp + + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + ) + + trainset = torchvision.datasets.CIFAR10( + root="data/cifar-10", train=True, download=True, transform=transform + ) + trainloader = torch.utils.data.DataLoader( + trainset, batch_size=batch_size, shuffle=True, num_workers=1 + ) + + testset = torchvision.datasets.CIFAR10( + root="data/cifar-10", train=False, download=True, transform=transform + ) + testloader = torch.utils.data.DataLoader( + testset, batch_size=batch_size, shuffle=False, num_workers=1 + ) + + acc = Metric(name="accuracy", fn=compute_accuracy) + + m.train( + epochs=10, + datasets={"train": trainloader, "val": testloader, "test": testloader}, + metrics=[acc], + debug=hparams.debug, + ) + + m.test( + datasets={"test": testloader}, + ) + + m.export("output_onnx", "onnx", (3, 32, 32)) diff --git a/micromind/__init__.py b/micromind/__init__.py index 3711e9f..7ec91f3 100644 --- a/micromind/__init__.py +++ b/micromind/__init__.py @@ -1,9 +1,7 @@ -from .networks.phinet import PhiNet -from .utils import configlib - +from .core import MicroMind, Metric, Stage # Package version -__version__ = "0.0.5" +__version__ = "0.1.0" """datasets_info is a dictionary that contains information about the attributes of the datasets. 
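For a quick smoke test of the example above: `examples/mind.py` is driven by the CLI defined in `micromind/utils/parse.py` (added later in this patch). A minimal launch sketch, assuming a local install (`pip install -e .`) and the repository root as the working directory; the experiment name is illustrative:

```shell
# Full training run; checkpoints land in results/cifar10_phinet/save/
# and the log in results/cifar10_phinet/train_log.txt.
python examples/mind.py --lr 0.001 --optimizer adam \
    --experiment_name cifar10_phinet --output_folder results

# Debug mode: only a handful of batches per epoch, and the temporary
# experiment folder is removed at the end of training.
python examples/mind.py --debug
```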
diff --git a/micromind/conversion/__init__.py b/micromind/conversion/__init__.py deleted file mode 100644 index ca0a917..0000000 --- a/micromind/conversion/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .convert import convert_to_onnx, convert_to_openvino, convert_to_tflite diff --git a/micromind/conversion/convert.py b/micromind/convert.py similarity index 50% rename from micromind/conversion/convert.py rename to micromind/convert.py index f8be0aa..25878fc 100644 --- a/micromind/conversion/convert.py +++ b/micromind/convert.py @@ -6,60 +6,89 @@ - Francesco Paissan, 2023 - Alberto Ancilotto, 2023 """ -try: - import os - import shutil - import sys - from pathlib import Path - - import numpy as np - import onnx - import onnxsim - import tensorflow as tf - import torch - import torch.nn as nn - from onnx_tf.backend import prepare - from openvino.tools.mo import main as mo_main -except Exception as e: - print(str(e)) - print("Did you install micromind with conversion capabilities?") - print("Please try again after pip install micromind[conversion].") - quit() +from pathlib import Path +from loguru import logger +from typing import Union +import torch.nn as nn +import torch +import os @torch.no_grad() -def convert_to_onnx(net: nn.Module, save_path: Path, simplify: bool = True): +def convert_to_onnx( + net: nn.Module, + save_path: Union[Path, str] = "model.onnx", + simplify: bool = False, + replace_forward: bool = False, +): """Converts nn.Module to onnx and saves it to save_path. Optionally simplifies it.""" + save_path = Path(save_path) + os.makedirs(save_path.parent, exist_ok=True) x = torch.zeros([1] + list(net.input_shape)) + if replace_forward: + # add forward to ModuleDict + bound_method = net.forward.__get__(net.modules, net.modules.__class__) + setattr(net.modules, "forward", bound_method) + + net.modules.input_shape = net.input_shape + net = net.modules + x = [torch.zeros([1] + list(net.input_shape)), None] + torch.onnx.export( net.cpu(), x, save_path, verbose=False, - input_names=["input"], + input_names=["input", "labels"], output_names=["output"], opset_version=11, ) if simplify: + import onnx + import onnxsim + onnx_model = onnx.load(save_path) onnx_model, check = onnxsim.simplify(onnx_model) onnx.save(onnx_model, save_path) - return onnx.load(save_path) + logger.info(f"Saved converted ONNX model to {save_path}.") + + return save_path @torch.no_grad() -def convert_to_openvino(net: nn.Module, save_dir: Path) -> str: +def convert_to_openvino( + net: nn.Module, save_path: Path, replace_forward: bool = False +) -> str: """Converts nn.Module to OpenVINO.""" - os.makedirs(save_dir, exist_ok=True) - if not isinstance(save_dir, Path): - save_dir = Path(save_dir) + try: + import os + + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + import sys + from pathlib import Path + from loguru import logger + + import onnx + from onnx_tf.backend import prepare + from openvino.tools.mo import main as mo_main + + except Exception as e: + print(str(e)) + print("Did you install micromind with conversion capabilities?") + print("Please try again after pip install micromind[conversion].") + exit(0) + os.makedirs(save_path, exist_ok=True) + if not isinstance(save_path, Path): + save_path = Path(save_path) - onnx_path = save_dir.joinpath("model.onnx") - onnx_model = convert_to_onnx(net, onnx_path, simplify=True) + onnx_path = save_path.joinpath("model.onnx") + onnx_model = onnx.load( + convert_to_onnx(net, onnx_path, simplify=True, replace_forward=replace_forward) + ) tf_rep = prepare(onnx_model) @@ -76,21 +105,45 
@@ def convert_to_openvino(net: nn.Module, save_dir: Path) -> str: "--input_shape", input_shape_str, "--output_dir", - str(save_dir), + str(save_path), "--data_type", "FP32", + "--silent", + "True", ] - os.system(" ".join(cmd)) + os.popen(" ".join(cmd)).read() - return str(save_dir.joinpath("model.xml")) + logger.info(f"Saved converted OpenVINO model to {save_path}.") + + return str(save_path.joinpath("model.xml")) @torch.no_grad() def convert_to_tflite( - net: nn.Module, save_path: Path, batch_quant: torch.Tensor = None + net: nn.Module, + save_path: Path, + batch_quant: torch.Tensor = None, + replace_forward: bool = False, ) -> None: """Converts nn.Module to tf_lite, optionally quantizes it.""" + try: + import os + + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + import shutil + import sys + from pathlib import Path + from loguru import logger + + import numpy as np + import tensorflow as tf + + except Exception as e: + print(str(e)) + print("Did you install micromind with conversion capabilities?") + print("Please try again after pip install micromind[conversion].") + exit(0) if not isinstance(save_path, Path): save_path = Path(save_path) @@ -99,7 +152,7 @@ def convert_to_tflite( vino_sub = save_path.joinpath("vino") os.makedirs(vino_sub, exist_ok=True) - vino_path = convert_to_openvino(net, vino_sub) + vino_path = convert_to_openvino(net, vino_sub, replace_forward=replace_forward) if os.name == "nt": openvino2tensorflow_exe_cmd = [ sys.executable, @@ -117,9 +170,10 @@ def convert_to_tflite( str(save_path), "--output_saved_model", "--output_no_quant_float32_tflite", + "--non_verbose", ] - os.system(" ".join(cmd)) + os.popen(" ".join(cmd)).read() shutil.rmtree(vino_sub) @@ -140,3 +194,5 @@ def representative_dataset(): with open(save_path.joinpath("model.int8.tflite"), "wb") as f: f.write(tflite_quant_model) + + logger.info(f"Saved converted TFLite model to {save_path}.") diff --git a/micromind/core.py b/micromind/core.py new file mode 100644 index 0000000..9c50a62 --- /dev/null +++ b/micromind/core.py @@ -0,0 +1,561 @@ +""" +Core class for micromind. Supports helper function for exports. Out-of-the-box +multi-gpu and FP16 training with HF Accelerate and much more. + +Authors: + - Francesco Paissan, 2023 +""" +from typing import Dict, Union, Tuple, Callable, List +from abc import ABC, abstractmethod +from dataclasses import dataclass +from argparse import Namespace +from pathlib import Path +from loguru import logger +from tqdm import tqdm +import shutil + +from accelerate import Accelerator +import torch +import os + +from .utils.helpers import select_and_load_checkpoint, get_random_string +from .utils.checkpointer import Checkpointer + +# This is used ONLY if you are not using argparse to get the hparams +default_cfg = { + "output_folder": "results", + "experiment_name": "micromind_exp", + "opt": "adam", # this is ignored if you are overriding the configure_optimizers + "lr": 0.001, # this is ignored if you are overriding the configure_optimizers + "debug": False, +} + + +@dataclass +class Stage: + """enum to track training stage""" + + train: int = 0 + val: int = 1 + test: int = 2 + + +class Metric: + """ + Class for tracking evaluation metrics during training. + + This class allows you to create custom evaluation metrics by providing a + function to compute the metric and specifying a reduction method. + + Arguments + --------- + name : str + The name of the metric. + fn : Callable + A function that computes the metric given predictions and batch data. 
+    reduction : Optional[str]
+        The reduction method for the metric ('sum' or 'mean'). Default is 'mean'.
+
+    Returns
+    -------
+    Reduced metric. Optionally, you can access the metric history
+    before calling reduce(clear=True) : torch.Tensor
+
+    Example
+    -------
+    .. doctest::
+
+        >>> from micromind import Metric, Stage
+        >>> import torch
+
+        >>> def custom_metric(pred, batch):
+        ...     # Replace this with your custom metric calculation
+        ...     return pred - batch
+
+        >>> metric = Metric("Custom Metric", custom_metric, reduction="mean")
+        >>> pred = torch.tensor([1.0, 2.0, 3.0])
+        >>> batch = torch.tensor([0.5, 1.5, 2.5])
+        >>> metric(pred, batch, stage=Stage.train)
+        >>> metric.history
+        {0: [tensor([0.5000, 0.5000, 0.5000])], 1: [], 2: []}
+        >>> metric.reduce(Stage.train)
+        0.5
+    """
+
+    def __init__(self, name: str, fn: Callable, reduction="mean"):
+        self.name = name
+        self.fn = fn
+        self.reduction = reduction
+        self.history = {s: [] for s in [Stage.train, Stage.val, Stage.test]}
+
+    def __call__(self, pred, batch, stage, device="cpu"):
+        if pred.device != device:
+            pred = pred.to(device)
+        dat = self.fn(pred, batch)
+        if dat.ndim == 0:
+            dat = dat.unsqueeze(0)
+
+        self.history[stage].append(dat)
+
+    def reduce(self, stage, clear=False):
+        """
+        Reduces the stored metric history for a given stage and returns
+        the result.
+
+        Arguments
+        ---------
+        stage : Stage
+            The stage (e.g., Stage.train) whose history should be reduced.
+        clear : Optional[bool]
+            Whether to clear the history for the stage after reducing.
+            Default is False.
+        """
+
+        if self.reduction == "mean":
+            if clear or (
+                self.history[stage][-1].shape[0] != self.history[stage][0].shape[0]
+            ):
+                tmp = torch.stack(self.history[stage][:-1]).mean()
+            else:
+                tmp = torch.stack(self.history[stage]).mean()
+        elif self.reduction == "sum":
+            if (
+                clear
+                or self.history[stage][-1].shape[0] != self.history[stage][0].shape[0]
+            ):
+                tmp = torch.stack(self.history[stage][:-1]).sum()
+            else:
+                tmp = torch.stack(self.history[stage]).sum()
+
+        if clear:
+            self.history[stage] = []
+        return tmp.item()
+
+
+class MicroMind(ABC):
+    """
+    MicroMind is an abstract base class for creating and training deep learning
+    models. It handles multi-GPU training via Accelerate (using DDP and other
+    distributed training strategies), automatically manages devices during
+    training, and provides export capabilities to ONNX, OpenVINO, and TFLite.
+
+    Arguments
+    ---------
+    hparams : Optional[Namespace]
+        Hyperparameters for the model. Default is None.
+
+    """
+
+    def __init__(self, hparams=None):
+        if hparams is None:
+            hparams = Namespace(**default_cfg)
+
+        # here we should handle devices etc.
+        self.modules = torch.nn.ModuleDict({})  # init empty modules dict
+        self.hparams = hparams
+        self.input_shape = None
+
+        self.device = "cpu"  # used just to init the models
+        self.accelerator = Accelerator()
+
+    @abstractmethod
+    def forward(self, batch):
+        """
+        Forward step of the class. It gets called during inference and optimization.
+        This method should be overwritten for specific applications.
+
+        Arguments
+        ---------
+        batch : torch.Tensor
+            Batch as output from the defined DataLoader.
+
+        Returns
+        -------
+        pred : Union[torch.Tensor, Tuple]
+            Predictions - this depends on the task.
+        """
+        pass
+
+    @abstractmethod
+    def compute_loss(self, pred, batch):
+        """
+        Computes the cost function for the optimization process.
+        It returns a tensor on which backward() is called. This method should
+        be overwritten for the specific application.
+
+        Arguments
+        ---------
+        pred : Union[torch.Tensor, Tuple]
+            Output of the forward() function.
+        batch : torch.Tensor
+            Batch as defined from the DataLoader.
+
+        Returns
+        -------
+        loss : torch.Tensor
+            The computed cost function.
+        """
+        pass
+
+    def set_input_shape(self, input_shape: Tuple = (3, 224, 224)):
+        """Setter function for input_shape.
+
+        Arguments
+        ---------
+        input_shape : Tuple
+            Input shape of the forward step.
+
+        """
+        self.input_shape = input_shape
+
+    def load_modules(self, checkpoint_path: Union[Path, str]):
+        """Loads modules from a checkpoint path.
+
+        Arguments
+        ---------
+        checkpoint_path : Union[Path, str]
+            Path to the checkpoint where the modules are stored.
+
+        """
+        dat = torch.load(checkpoint_path)
+
+        modules_keys = list(self.modules.keys())
+        for k in self.modules:
+            self.modules[k].load_state_dict(dat[k])
+
+            modules_keys.remove(k)
+
+        if len(modules_keys) != 0:
+            logger.info(f"Couldn't find a state_dict for modules {modules_keys}.")
+
+    def export(
+        self, save_dir: Union[Path, str], out_format: str = "onnx", input_shape=None
+    ) -> None:
+        """
+        Exports the model to a specified format for deployment.
+        TFLite and OpenVINO exports require a Linux machine.
+
+        Arguments
+        ---------
+        save_dir : Union[Path, str]
+            The directory where the exported model will be saved.
+        out_format : Optional[str]
+            The format for exporting the model. Default is 'onnx'.
+        input_shape : Optional[Tuple]
+            The input shape of the model. If not provided, the input shape
+            specified during model creation is used.
+
+        """
+        from micromind import convert
+
+        if not isinstance(save_dir, Path):
+            save_dir = Path(save_dir)
+        save_dir = save_dir.joinpath(self.hparams.experiment_name)
+
+        self.set_input_shape(input_shape)
+        assert (
+            self.input_shape is not None
+        ), "You should pass the input_shape of the model."
+
+        if out_format == "onnx":
+            convert.convert_to_onnx(
+                self, save_dir.joinpath("model.onnx"), replace_forward=True
+            )
+        elif out_format == "openvino":
+            convert.convert_to_openvino(self, save_dir, replace_forward=True)
+        elif out_format == "tflite":
+            convert.convert_to_tflite(self, save_dir, replace_forward=True)
+
+    def configure_optimizers(self):
+        """Configures and defines the optimizer for the task. Defaults to Adam
+        with lr=0.001; it can be overridden either by passing arguments from the
+        command line or by overriding this entire method.
+
+        Returns
+        -------
+        Optimizer and learning rate scheduler
+        (not implemented yet). : Tuple[torch.optim.Adam, None]
+
+        """
+        assert self.hparams.opt in [
+            "adam",
+            "sgd",
+        ], f"Optimizer {self.hparams.opt} not supported."
+        if self.hparams.opt == "adam":
+            opt = torch.optim.Adam(self.modules.parameters(), self.hparams.lr)
+        elif self.hparams.opt == "sgd":
+            opt = torch.optim.SGD(self.modules.parameters(), self.hparams.lr)
+        return opt, None  # None is for the learning rate scheduler
+
+    def __call__(self, *x, **xv):
+        """Just forwards everything to the forward method."""
+        return self.forward(*x, **xv)
+
+    def on_train_start(self):
+        """Initializes the optimizer and the modules, and puts the networks on
+        the right devices. Optionally loads a checkpoint if one is already
+        present.
+
+        This function gets executed at the beginning of every training.
+ """ + self.experiment_folder = os.path.join( + self.hparams.output_folder, self.hparams.experiment_name + ) + if self.hparams.debug: + self.experiment_folder = "tmp_" + get_random_string() + logger.info(f"Created temporary folder for debug {self.experiment_folder}.") + + save_dir = os.path.join(self.experiment_folder, "save") + if os.path.exists(save_dir): + if len(os.listdir(save_dir)) != 0: + # select which checkpoint and load it. + checkpoint, path = select_and_load_checkpoint(save_dir) + self.opt = checkpoint["optimizer"] + self.lr_sched = checkpoint["lr_scheduler"] + self.start_epoch = checkpoint["epoch"] + 1 + + self.load_modules(path) + + if self.accelerator.is_local_main_process: + self.checkpointer = Checkpointer( + checkpoint["key"], + mode=checkpoint["mode"], + checkpoint_path=self.experiment_folder, + ) + + logger.info(f"Loaded existing checkpoint from {path}.") + else: + self.opt, self.lr_sched = self.configure_optimizers() + self.start_epoch = 0 + + self.checkpointer = Checkpointer( + "val_loss", checkpoint_path=self.experiment_folder + ) + else: + os.makedirs(self.experiment_folder, exist_ok=True) + + self.opt, self.lr_sched = self.configure_optimizers() + self.start_epoch = 0 + + self.checkpointer = Checkpointer( + "val_loss", checkpoint_path=self.experiment_folder + ) + + self.accelerator = Accelerator() + self.device = self.accelerator.device + self.modules.to(self.device) + print("Set device to ", self.device) + + convert = [self.modules, self.opt, self.lr_sched] + list(self.datasets.values()) + accelerated = self.accelerator.prepare(convert) + self.modules, self.opt, self.lr_sched = accelerated[:3] + for i, key in enumerate(self.datasets): + self.datasets[key] = accelerated[-(i + 1)] + + def on_train_end(self): + """Runs at the end of each training. Cleans up before exiting.""" + if self.hparams.debug: + logger.info(f"Removed temporary folder {self.experiment_folder}.") + shutil.rmtree(self.experiment_folder) + + if self.accelerator.is_local_main_process: + self.checkpointer.close() + + def train( + self, + epochs: int = 1, + datasets: Dict = {}, + metrics: List[Metric] = [], + debug: bool = False, + ) -> None: + """ + This method trains the model on the provided training dataset for the + specified number of epochs. It tracks training metrics and can + optionally perform validation during training, if the validation set is + provided. + + Arguments + --------- + epochs : int + The number of training epochs. + datasets : Dict + A dictionary of dataset loaders. Dataloader should be mapped to keys + "train", "val", and "test". + metrics : Optional[List[Metric]] + A list of metrics to track during training. Default is an empty list. + debug : bool + Whether to run in debug mode. Default is False. If in debug mode, + only runs for few epochs + and with few batches. + + """ + self.datasets = datasets + self.metrics = metrics + assert "train" in self.datasets, "Training dataloader was not specified." + assert epochs > 0, "You must specify at least one epoch." + + self.debug = debug + + self.on_train_start() + + if self.accelerator.is_local_main_process: + logger.info( + f"Starting from epoch {self.start_epoch}." + + f" Training is scheduled for {epochs} epochs." 
+ ) + with self.accelerator.autocast(): + for e in range(self.start_epoch, epochs): + pbar = tqdm( + self.datasets["train"], + unit="batches", + ascii=True, + dynamic_ncols=True, + disable=not self.accelerator.is_local_main_process, + ) + loss_epoch = 0 + pbar.set_description(f"Running epoch {e + 1}/{epochs}") + self.modules.train() + for idx, batch in enumerate(pbar): + if isinstance(batch, list): + batch = [b.to(self.device) for b in batch] + + self.opt.zero_grad() + + model_out = self(batch) + loss = self.compute_loss(model_out, batch) + + self.accelerator.backward(loss) + self.opt.step() + + for m in self.metrics: + m(model_out, batch, Stage.train, self.device) + + running_train = { + "train_" + m.name: m.reduce(Stage.train) for m in self.metrics + } + + running_train.update({"train_loss": loss_epoch / (idx + 1)}) + + loss_epoch += loss.item() + pbar.set_postfix(**running_train) + + if self.debug and idx > 10: + break + + pbar.close() + + train_metrics = { + "train_" + m.name: m.reduce(Stage.train, True) for m in self.metrics + } + train_metrics.update({"train_loss": loss_epoch / (idx + 1)}) + + if "val" in datasets: + val_metrics = self.validate() + if self.accelerator.is_local_main_process: + self.checkpointer( + self, + e, + train_metrics, + val_metrics, + lambda x: self.accelerator.unwrap_model(x), + ) + else: + val_metrics = train_metrics.update( + {"val_loss": loss_epoch / (idx + 1)} + ) + + if e >= 1 and self.debug: + break + + self.on_train_end() + return None + + @torch.no_grad() + def validate(self) -> Dict: + """Runs the validation step.""" + assert "val" in self.datasets, "Validation dataloader was not specified." + self.modules.eval() + + pbar = tqdm( + self.datasets["val"], + unit="batches", + ascii=True, + dynamic_ncols=True, + disable=not self.accelerator.is_local_main_process, + ) + loss_epoch = 0 + pbar.set_description("Validation...") + with self.accelerator.autocast(): + for idx, batch in enumerate(pbar): + if isinstance(batch, list): + batch = [b.to(self.device) for b in batch] + + self.opt.zero_grad() + + model_out = self(batch) + loss = self.compute_loss(model_out, batch) + for m in self.metrics: + m(model_out, batch, Stage.val, self.device) + + loss_epoch += loss.item() + pbar.set_postfix(loss=loss_epoch / (idx + 1)) + + if self.debug and idx > 10: + break + + val_metrics = {"val_" + m.name: m.reduce(Stage.val, True) for m in self.metrics} + val_metrics.update({"val_loss": loss_epoch / (idx + 1)}) + + pbar.close() + + return val_metrics + + @torch.no_grad() + def test(self, datasets: Dict = {}) -> None: + """Runs the test steps.""" + assert "test" in self.datasets, "Test dataloader was not specified." 
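+        # NOTE: test() currently reads from self.datasets, which train()
+        # populates, and ignores its own `datasets` argument even though
+        # examples/mind.py passes datasets={"test": testloader}. One possible
+        # fix is `if datasets: self.datasets = datasets` before the assert
+        # above.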
+ self.modules.eval() + + pbar = tqdm( + self.datasets["test"], + unit="batches", + ascii=True, + dynamic_ncols=True, + disable=not self.accelerator.is_local_main_process, + ) + loss_epoch = 0 + pbar.set_description("Testing...") + with self.accelerator.autocast(): + for idx, batch in enumerate(pbar): + if isinstance(batch, list): + batch = [b.to(self.device) for b in batch] + self.opt.zero_grad() + + model_out = self(batch) + loss = self.compute_loss(model_out, batch) + for m in self.metrics: + m(model_out, batch, Stage.test, self.device) + + loss_epoch += loss.item() + pbar.set_postfix(loss=loss_epoch / (idx + 1)) + + pbar.close() + + test_metrics = { + "test_" + m.name: m.reduce(Stage.test, True) for m in self.metrics + } + test_metrics.update({"test_loss": loss_epoch / (idx + 1)}) + s_out = ( + "Testing " + + " - ".join([f"{k}: {v:.2f}" for k, v in test_metrics.items()]) + + "; " + ) + + logger.info(s_out) + + return None diff --git a/micromind/networks/phinet.py b/micromind/networks/phinet.py index 96dd119..4d84262 100644 --- a/micromind/networks/phinet.py +++ b/micromind/networks/phinet.py @@ -5,33 +5,36 @@ - Francesco Paissan, 2023 - Alberto Ancilotto, 2023 - Matteo Beltrami, 2023 + - Matteo Tremonti, 2023 """ -import logging -from pathlib import Path -from types import SimpleNamespace from typing import List import torch import torch.nn as nn import torch.nn.functional as F -from huggingface_hub import hf_hub_download -from huggingface_hub.utils import EntryNotFoundError from torchinfo import summary -import os +import torch.ao.nn.quantized as nnq -import micromind +def _make_divisible(v, divisor=8, min_value=None): + """ + This function is taken from the original tf repo. It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py -def correct_pad(input_shape, kernel_size): - """Returns a tuple for zero-padding for 2D convolution with downsampling - - Args: - input_shape ([tuple/int]): [Input size] - kernel_size ([tuple/int]): [Kernel size] + It ensures that all layers have a channel number that is divisible by divisor. - Returns: - [tuple]: [Padding coeffs] """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def correct_pad(input_shape, kernel_size): + """Returns a tuple for zero-padding for 2D convolution with downsampling""" if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) @@ -51,30 +54,13 @@ def correct_pad(input_shape, kernel_size): def preprocess_input(x, **kwargs): - """Normalise channels between [-1, 1] - - Args: - x ([Tensor]): [Contains the image, number of channels is arbitrary] - - Returns: - [Tensor]: [Channel-wise normalised tensor] - """ + """Normalise channels between [-1, 1]""" return (x / 128.0) - 1 def get_xpansion_factor(t_zero, beta, block_id, num_blocks): - """Compute expansion factor based on the formula from the paper - - Args: - t_zero ([int]): [initial expansion factor] - beta ([int]): [shape factor] - block_id ([int]): [id of the block] - num_blocks ([int]): [number of blocks in the network] - - Returns: - [float]: [computed expansion factor] - """ + """Compute expansion factor based on the formula from the paper""" return (t_zero * beta) * block_id / num_blocks + t_zero * ( num_blocks - block_id ) / num_blocks @@ -89,14 +75,6 @@ def forward(self, x): return torch.clamp(x, min=0, max=self.max) -class HSwish(torch.nn.Module): - def __init__(self): - super(HSwish, self).__init__() - - def forward(self, x): - return x * nn.ReLU6(inplace=True)(x + 3) / 6 - - class SEBlock(torch.nn.Module): """Implements squeeze-and-excitation block""" @@ -109,6 +87,7 @@ def __init__(self, in_channels, out_channels, h_swish=True): h_swish (bool, optional): [Whether to use the h_swish]. Defaults to True. """ super(SEBlock, self).__init__() + self.se_conv = nn.Conv2d( in_channels, out_channels, @@ -122,10 +101,14 @@ def __init__(self, in_channels, out_channels, h_swish=True): ) if h_swish: - self.activation = HSwish() + self.activation = nn.Hardswish(inplace=True) else: self.activation = ReLUMax(6) + # It serves for the quantization. + # The behavior remains equivalent for the unquantized models. + self.mult = nnq.FloatFunctional() + def forward(self, x): """Executes SE Block @@ -135,6 +118,7 @@ def forward(self, x): Returns: [Tensor]: [output of squeeze-and-excitation block] """ + inp = x x = F.adaptive_avg_pool2d(x, (1, 1)) x = self.se_conv(x) @@ -142,16 +126,10 @@ def forward(self, x): x = self.se_conv2(x) x = torch.sigmoid(x) - return x * inp + return self.mult.mul(inp, x) # Equivalent to ``torch.mul(a, b)`` class DepthwiseConv2d(torch.nn.Conv2d): - """Depthwise 2D conv - - Args: - torch ([Tensor]): [Input tensor for convolution] - """ - def __init__( self, in_channels, @@ -271,6 +249,7 @@ def __init__( h_swish=True, k_size=3, dp_rate=0.05, + divisor=1, ): """Defines the structure of a PhiNet convolutional block. 
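The `_make_divisible` helper above, together with the `divisor` argument threaded through the blocks below, rounds every channel count to a hardware-friendly multiple without shrinking any layer by more than 10%. A small sketch of the rounding behaviour, assuming the helper stays importable from `micromind.networks.phinet`:

```python
from micromind.networks.phinet import _make_divisible

print(_make_divisible(27, divisor=8))  # 24 would shrink >10%, so it rounds up: 32
print(_make_divisible(30, divisor=8))  # nearest multiple of 8: 32
print(_make_divisible(8, divisor=8))   # already divisible: 8
print(_make_divisible(12, divisor=1))  # divisor=1 (the default) keeps widths as-is
```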
@@ -304,9 +283,10 @@ def __init__( self._layers = torch.nn.ModuleList() in_channels = in_shape[0] + # Define activation function if h_swish: - activation = HSwish() + activation = nn.Hardswish(inplace=True) else: activation = ReLUMax(6) @@ -314,14 +294,14 @@ def __init__( # Expand conv1 = nn.Conv2d( in_channels, - int(expansion * in_channels), + _make_divisible(int(expansion * in_channels), divisor=divisor), kernel_size=1, padding=0, bias=False, ) bn1 = nn.BatchNorm2d( - int(expansion * in_channels), + _make_divisible(int(expansion * in_channels), divisor=divisor), eps=1e-3, momentum=0.999, ) @@ -331,16 +311,16 @@ def __init__( self._layers.append(activation) if stride == 2: - pad = nn.ZeroPad2d( - padding=correct_pad([res, res], 3), - ) - - self._layers.append(pad) + padding = correct_pad([res, res], 3) self._layers.append(nn.Dropout2d(dp_rate)) d_mul = 1 - in_channels_dw = int(expansion * in_channels) if block_id else in_channels + in_channels_dw = ( + _make_divisible(int(expansion * in_channels), divisor=divisor) + if block_id + else in_channels + ) out_channels_dw = in_channels_dw * d_mul dw1 = DepthwiseConv2d( in_channels=in_channels_dw, @@ -348,7 +328,7 @@ def __init__( kernel_size=k_size, stride=stride, bias=False, - padding=k_size // 2 if stride == 1 else 0, + padding=k_size // 2 if stride == 1 else (padding[1], padding[3]), ) bn_dw1 = nn.BatchNorm2d( @@ -357,19 +337,27 @@ def __init__( momentum=0.999, ) + # It is necessary to reinitialize the activation + # for functions using Module.children() to work properly. + # Module.children() does not return repeated layers. + if h_swish: + activation = nn.Hardswish(inplace=True) + else: + activation = ReLUMax(6) + self._layers.append(dw1) self._layers.append(bn_dw1) self._layers.append(activation) if has_se: - num_reduced_filters = max(1, int(expansion * in_channels / 6)) - se_block = SEBlock( - int(expansion * in_channels), num_reduced_filters, h_swish=h_swish + num_reduced_filters = _make_divisible( + max(1, int(out_channels_dw / 6)), divisor=divisor ) + se_block = SEBlock(out_channels_dw, num_reduced_filters, h_swish=h_swish) self._layers.append(se_block) conv2 = nn.Conv2d( - in_channels=int(expansion * in_channels), + in_channels=out_channels_dw, out_channels=filters, kernel_size=1, padding=0, @@ -387,6 +375,9 @@ def __init__( if res and in_channels == filters and stride == 1: self.skip_conn = True + # It serves for the quantization. + # The behavior remains equivalent for the unquantized models. + self.op = nnq.FloatFunctional() def forward(self, x): """Executes PhiNet convolutional block @@ -398,6 +389,7 @@ def forward(self, x): Returns: Ouput of the convolutional block : torch.Tensor """ + if self.skip_conn: inp = x @@ -405,192 +397,12 @@ def forward(self, x): x = layer(x) if self.skip_conn: - return x + inp + return self.op.add(x, inp) # Equivalent to ``torch.add(a, b)`` return x class PhiNet(nn.Module): - @classmethod - def from_pretrained( - cls, - dataset, - alpha, - beta, - t_zero, - num_layers, - resolution, - path=None, - num_classes=None, - classifier=True, - device=None, - ): - """Loads parameters from checkpoint through Hugging Face Hub or through local - file system. - This function constructs two strings, `repo_dir` to find the model on Hugging - Face Hub and `file_to_choose` to select the correct file inside the repo, and - use them to download the pretrained model and initialize the PhiNet. - - Arguments - --------- - dataset : string - The dataset on which the model has been trained with. 
- alpha : float - The alpha hyperparameter. - beta : float - The beta hyperparameter. - t_zero : float - The t_zero hyperparameter. - num_layers : int - The number of layers. - resolution : int - The resolution of the images used during training. - path : string - The directory path or file path pointing to the checkpoint. - If None, the checkpoint is searched on HuggingFace. - num_classes : int - The number of classes that the model has been trained for. - If None, it gets the specific value determined by the dataset used. - classifier : bool - If True, the model returend includes the classifier. - device : string - The device that loads all the tensors. - If None, it's set to "cuda" if it's available, it's set to "cpu" otherwise. - - Returns - ------- - PhiNet: nn.Module - - Example - ------- - .. doctest:: - - >>> from micromind import PhiNet - >>> model = PhiNet.from_pretrained("CIFAR-10", 3.0, 0.75, 6.0, 7, 160) - Checkpoint taken from HuggingHace hub. - Checkpoint loaded successfully. - """ - if num_classes is None: - num_classes = micromind.datasets_info[dataset]["Nclasses"] - - repo_dir = f"micromind/{dataset}" - file_to_choose = f"\ - phinet_a{float(alpha)}_b{float(beta)}_tzero{float(t_zero)}_Nlayers{num_layers}\ - _res{resolution}{micromind.datasets_info[dataset]['ext']}\ - ".replace( - " ", "" - ) - - assert ( - num_classes == micromind.datasets_info[dataset]["Nclasses"] - ), "Can't load model because num_classes does not match with dataset." - - if device is None: - if torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - if path is not None: - path_to_search = os.path.join(path, file_to_choose) - if os.path.isfile(path): - path_to_search = path - if os.path.isfile(path_to_search): - state_dict = torch.load(str(path_to_search), map_location=device) - model_found = True - print("Checkpoint taken from local file system.") - else: - model_found = False - print( - "Checkpoint not taken from local file system." - + f"{path_to_search} is not a valid checkpoint." - ) - if (path is None) or not model_found: - try: - downloaded_file_path = hf_hub_download( - repo_id=repo_dir, filename=file_to_choose - ) - state_dict = torch.load(str(downloaded_file_path), map_location=device) - print("Checkpoint taken from HuggingHace hub.") - model_found = True - - except EntryNotFoundError: - state_dict = { - "args": SimpleNamespace( - alpha=alpha, - beta=beta, - t_zero=t_zero, - num_layers=num_layers, - num_classes=num_classes, - ) - } - model_found = False - logging.warning("Model initialized without loading checkpoint.") - - # model initialized - model = cls( - (micromind.datasets_info[dataset]["NChannels"], resolution, resolution), - alpha=state_dict["args"].alpha, - beta=state_dict["args"].beta, - t_zero=state_dict["args"].t_zero, - num_layers=state_dict["args"].num_layers, - num_classes=state_dict["args"].num_classes, - include_top=classifier, - compatibility=False, - ) - - # model initialized with downloaded parameters - if model_found: - model.load_state_dict(state_dict["state_dict"], strict=False) - print("Checkpoint loaded successfully.") - - return model - - def save_params(self, save_path: Path): - """Saves state_dict of model into a given path. - - Arguments - --------- - save_path : string or Path - Path where you want to store the state dict. - - Returns - ------- - None - - Example - ------- - .. 
doctest:: - - >>> from micromind import PhiNet - >>> model = PhiNet((3, 224, 224)) - >>> model.save_params("checkpoint.pt") - """ - torch.save(self.state_dict(), save_path) - - def from_checkpoint(self, load_path: Path): - """Loads state_dict of model into current instance of the PhiNet class. - - Arguments - --------- - load_path : string or Path - Path where you want to store the state dict. - - Returns - ------- - None - - Example - ------- - .. doctest:: - - >>> from micromind import PhiNet - >>> model = PhiNet((3, 224, 224)) - >>> model.save_params("checkpoint.pt") - >>> model.from_checkpoint("checkpoint.pt") - """ - self.load_state_dict(torch.load(load_path)) - def get_complexity(self): """Returns MAC and number of parameters of initialized architecture. @@ -602,7 +414,7 @@ def get_complexity(self): ------- .. doctest:: - >>> from micromind import PhiNet + >>> from micromind.networks import PhiNet >>> model = PhiNet((3, 224, 224)) >>> model.get_complexity() {'MAC': 9817670, 'params': 30917} @@ -624,7 +436,7 @@ def get_MAC(self): ------- .. doctest:: - >>> from micromind import PhiNet + >>> from micromind.networks import PhiNet >>> model = PhiNet((3, 224, 224)) >>> model.get_MAC() 9817670 @@ -642,7 +454,7 @@ def get_params(self): ------- .. doctest:: - >>> from micromind import PhiNet + >>> from micromind.networks import PhiNet >>> model = PhiNet((3, 224, 224)) >>> model.get_params() 30917 @@ -667,6 +479,7 @@ def __init__( pool: bool = False, # S2 h_swish: bool = True, # S1 squeeze_excite: bool = True, # S1 + divisor: int = 1, ) -> None: """This class implements the PhiNet architecture. @@ -720,7 +533,7 @@ def __init__( # Define self.activation function if h_swish: - activation = HSwish() + activation = nn.Hardswish(inplace=True) else: activation = ReLUMax(6) @@ -735,7 +548,7 @@ def __init__( sep1 = SeparableConv2d( in_channels, - int(first_conv_filters * alpha), + _make_divisible(int(first_conv_filters * alpha), divisor=divisor), kernel_size=3, stride=(first_conv_stride, first_conv_stride), padding=0, @@ -748,16 +561,17 @@ def __init__( block1 = PhiNetConvBlock( in_shape=( - int(first_conv_filters * alpha), + _make_divisible(int(first_conv_filters * alpha), divisor=divisor), res / first_conv_stride, res / first_conv_stride, ), - filters=int(b1_filters * alpha), + filters=_make_divisible(int(b1_filters * alpha), divisor=divisor), stride=1, expansion=1, has_se=False, res=residuals, h_swish=h_swish, + divisor=divisor, ) self._layers.append(block1) @@ -773,44 +587,51 @@ def __init__( self._layers.append(bn_c1) block2 = PhiNetConvBlock( - (int(b1_filters * alpha), res / first_conv_stride, res / first_conv_stride), - filters=int(b1_filters * alpha), + ( + _make_divisible(int(b1_filters * alpha), divisor=divisor), + res / first_conv_stride, + res / first_conv_stride, + ), + filters=_make_divisible(int(b1_filters * alpha), divisor=divisor), stride=2 if (not pool) else 1, expansion=get_xpansion_factor(t_zero, beta, 1, num_layers), block_id=1, has_se=squeeze_excite, res=residuals, h_swish=h_swish, + divisor=divisor, ) block3 = PhiNetConvBlock( ( - int(b1_filters * alpha), + _make_divisible(int(b1_filters * alpha), divisor=divisor), res / first_conv_stride / 2, res / first_conv_stride / 2, ), - filters=int(b1_filters * alpha), + filters=_make_divisible(int(b1_filters * alpha), divisor=divisor), stride=1, expansion=get_xpansion_factor(t_zero, beta, 2, num_layers), block_id=2, has_se=squeeze_excite, res=residuals, h_swish=h_swish, + divisor=divisor, ) block4 = PhiNetConvBlock( ( - int(b1_filters 
* alpha), + _make_divisible(int(b1_filters * alpha), divisor=divisor), res / first_conv_stride / 2, res / first_conv_stride / 2, ), - filters=int(b2_filters * alpha), + filters=_make_divisible(int(b2_filters * alpha), divisor=divisor), stride=2 if (not pool) else 1, expansion=get_xpansion_factor(t_zero, beta, 3, num_layers), block_id=3, has_se=squeeze_excite, res=residuals, h_swish=h_swish, + divisor=divisor, ) self._layers.append(block2) @@ -824,7 +645,7 @@ def __init__( block_id = 4 block_filters = b2_filters spatial_res = res / first_conv_stride / 4 - in_channels_next = int(b2_filters * alpha) + in_channels_next = _make_divisible(int(b2_filters * alpha), divisor=divisor) while num_layers >= block_id: if block_id in downsampling_layers: block_filters *= 2 @@ -833,7 +654,7 @@ def __init__( pn_block = PhiNetConvBlock( (in_channels_next, spatial_res, spatial_res), - filters=int(block_filters * alpha), + filters=_make_divisible(int(block_filters * alpha), divisor=divisor), stride=(2 if (block_id in downsampling_layers) and (not pool) else 1), expansion=get_xpansion_factor(t_zero, beta, block_id, num_layers), block_id=block_id, @@ -841,10 +662,13 @@ def __init__( res=residuals, h_swish=h_swish, k_size=(5 if (block_id / num_layers) > (1 - conv5_percent) else 3), + divisor=divisor, ) self._layers.append(pn_block) - in_channels_next = int(block_filters * alpha) + in_channels_next = _make_divisible( + int(block_filters * alpha), divisor=divisor + ) spatial_res = ( spatial_res / 2 if block_id in downsampling_layers else spatial_res ) @@ -855,7 +679,11 @@ def __init__( self.classifier = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), - nn.Linear(int(block_filters * alpha), num_classes, bias=True), + nn.Linear( + _make_divisible(int(block_filters * alpha), divisor=divisor), + num_classes, + bias=True, + ), ) def forward(self, x): diff --git a/micromind/utils/checkpointer.py b/micromind/utils/checkpointer.py new file mode 100644 index 0000000..09a867f --- /dev/null +++ b/micromind/utils/checkpointer.py @@ -0,0 +1,99 @@ +""" +micromind checkpointer. Unwraps models and saves the to disk with optimizer's +state etc. + +Authors: + - Francesco Paissan, 2023 +""" +from typing import Union, Dict, Callable +from loguru import logger +from pathlib import Path +import os + +import torch + + +class Checkpointer: + def __init__( + self, + key: str, + mode: str = "min", + top_k: int = 5, + checkpoint_path: Union[str, Path] = ".", + ) -> None: + assert mode in ["max", "min"], "Checkpointer mode can be only max or min." 
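+        # Checkpoints are written under <checkpoint_path>/save/, while a
+        # plain-text training log is appended to
+        # <checkpoint_path>/train_log.txt (both set up below).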
+        self.key = key
+        self.mode = mode
+        self.top_k = top_k
+
+        # Best observed values of the monitored key: +inf for "min" mode and
+        # -inf for "max" mode, so the first top_k checkpoints are always saved.
+        init_val = torch.inf if mode == "min" else -torch.inf
+        self.bests = [init_val] * self.top_k
+        self.check_paths = [""] * self.top_k
+        self.root_dir = checkpoint_path
+        self.save_dir = os.path.join(self.root_dir, "save")
+        os.makedirs(self.save_dir, exist_ok=True)
+        self.fstream = open(os.path.join(self.root_dir, "train_log.txt"), "a")
+
+    def __call__(
+        self,
+        mind,
+        epoch: int,
+        train_metrics: Dict,
+        metrics: Dict,
+        unwrap: Callable = lambda x: x,
+    ) -> Union[Path, str]:
+        s_out = (
+            f"Epoch {epoch}: "
+            + " - ".join([f"{k}: {v:.2f}" for k, v in train_metrics.items()])
+            + "; "
+        )
+        s_out += " - ".join([f"{k2}: {v2:.4f}" for k2, v2 in metrics.items()]) + ".\n"
+        self.fstream.write(s_out)
+        logger.info(s_out)
+        base_save = {
+            "key": self.key,
+            "mode": self.mode,
+            "epoch": epoch,
+            "optimizer": mind.opt,
+            "lr_scheduler": mind.lr_sched,
+        }
+        to_remove = None
+        if self.mode == "min":
+            # Save when the monitored key improves on the worst of the
+            # current top-k, and replace that entry.
+            if metrics[self.key] <= max(self.bests):
+                id_replace = self.bests.index(max(self.bests))
+                to_remove = self.check_paths[id_replace]
+                self.bests[id_replace] = metrics[self.key]
+
+                self.check_paths[id_replace] = os.path.join(
+                    self.save_dir,
+                    f"epoch_{epoch}_{self.key}_{metrics[self.key]:.4f}.ckpt",
+                )
+
+                base_save.update(
+                    {k: unwrap(v).state_dict() for k, v in mind.modules.items()}
+                )
+                torch.save(base_save, self.check_paths[id_replace])
+        elif self.mode == "max":
+            if metrics[self.key] >= min(self.bests):
+                id_replace = self.bests.index(min(self.bests))
+                to_remove = self.check_paths[id_replace]
+                self.bests[id_replace] = metrics[self.key]
+
+                self.check_paths[id_replace] = os.path.join(
+                    self.save_dir,
+                    f"epoch_{epoch}_{self.key}_{metrics[self.key]:.4f}.ckpt",
+                )
+
+                base_save.update(
+                    {k: unwrap(v).state_dict() for k, v in mind.modules.items()}
+                )
+                torch.save(base_save, self.check_paths[id_replace])
+
+        if to_remove is not None and to_remove != "":
+            logger.info(f"Generated better checkpoint. Deleting {to_remove}.")
+            os.remove(to_remove)
+
+        if self.mode == "max":
+            return self.check_paths[self.bests.index(max(self.bests))]
+        elif self.mode == "min":
+            return self.check_paths[self.bests.index(min(self.bests))]
+
+    def close(self):
+        self.fstream.close()
diff --git a/micromind/utils/configlib.py b/micromind/utils/configlib.py
deleted file mode 100644
index 3550cea..0000000
--- a/micromind/utils/configlib.py
+++ /dev/null
@@ -1,46 +0,0 @@
-""" Configuration library for experiments.
- -Authors: - Francesco Paissan, 2023 - -""" -import argparse -import logging -import pprint -import sys -import types -from typing import Any, Dict - - -class SimpleNamespace(types.SimpleNamespace): - def update(self, dictionary): - self.__dict__.update(dictionary) - - -logger = logging.getLogger(__name__) - -parser = argparse.ArgumentParser(description=__doc__, fromfile_prefix_chars="@") - -config: SimpleNamespace = SimpleNamespace() - - -def add_parser(title: str, description: str = ""): - """Create a new context for arguments and return a handle.""" - return parser.add_argument_group(title, description) - - -def parse(save_fname: str = "") -> Dict[str, Any]: - """Parse given arguments.""" - config.update(vars(parser.parse_args())) - logging.info("Parsed %i arguments.", len(config.__dict__)) - # Save passed arguments - if save_fname: - with open(save_fname, "w") as fout: - fout.write("\n".join(sys.argv[1:])) - logging.info("Saving experiment arguments to %s.", save_fname) - return config - - -def print_config(): - """Print the current config to stdout.""" - pprint.pprint(config.__dict__) diff --git a/micromind/utils/helpers.py b/micromind/utils/helpers.py new file mode 100644 index 0000000..f0c1c16 --- /dev/null +++ b/micromind/utils/helpers.py @@ -0,0 +1,43 @@ +""" +micromind helper functions. + +Authors: + - Francesco Paissan, 2023 +""" +from typing import Union, Dict, Tuple +from pathlib import Path +import random +import string +import torch +import os + + +def get_value_from_key(s: str, key: str, cast=float) -> float: + dat = s.split(f"{key}_")[-1] + + if "ckpt" in dat: + dat = dat.split(".ckpt")[0] + + return cast(dat) + + +def select_and_load_checkpoint(path: Union[Path, str]) -> Tuple[Dict, str]: + checkpoints = os.listdir(path) + checkpoints = [os.path.join(path, c) for c in checkpoints] + + dat = torch.load(checkpoints[0]) + selected_key, selected_mode = dat["key"], dat["mode"] + + values = [get_value_from_key(str(c), selected_key) for c in checkpoints] + + best_key = min(values) if selected_mode == "min" else max(values) + best_checkpoint = checkpoints[values.index(best_key)] + + return torch.load(best_checkpoint), best_checkpoint + + +def get_random_string(length=10): + letters = string.ascii_lowercase + result_str = "".join(random.choice(letters) for i in range(length)) + + return result_str diff --git a/micromind/utils/parse.py b/micromind/utils/parse.py new file mode 100644 index 0000000..debbbed --- /dev/null +++ b/micromind/utils/parse.py @@ -0,0 +1,28 @@ +import argparse + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="General configuration for micromind.") + + parser.add_argument("--lr", type=float, default=0.001, help="Learning rate.") + parser.add_argument( + "--optimizer", + dest="opt", + default="adam", + choices=["adam", "sgd"], + help="Optimizer name.", + ) + parser.add_argument( + "--experiment_name", default="exp", help="Name of the experiment." + ) + parser.add_argument( + "--output_folder", default="results", help="Output folder path." 
+ ) + parser.add_argument( + "--debug", + action="store_true", + help="Run in debug mode to check train and validation steps.", + ) + + args = parser.parse_args() + return args diff --git a/pyproject.toml b/pyproject.toml index e23ef59..b72ae4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ dependencies = [ "torch", "torchinfo", "huggingface_hub", + "accelerate==0.23.0", + "onnx", + "loguru" ] requires-python = ">=3.8" @@ -53,7 +56,7 @@ profile = "black" py-modules = [] [tool.bumpver] -current_version = "0.0.5" +current_version = "0.1.0" version_pattern = "MAJOR.MINOR.PATCH" commit_message = "bump version {old_version} -> {new_version}" commit = true diff --git a/recipes/image_classification/README.md b/recipes/image_classification/README.md deleted file mode 100644 index 50dd16c..0000000 --- a/recipes/image_classification/README.md +++ /dev/null @@ -1,61 +0,0 @@ -## Image classification - -This image classification recipe is heavily based and depends on pytorch-image-models (timm), the awesome tool developed by [Ross Wightman](https://github.com/rwightman). -It supports all data augmentation, datasets and architectures of the original implementation, and was adapted to support the training of PhiNets. - -To reproduce our results, you can follow these steps: - -1. install PhiNets with `pip install git+https://github.com/fpaissan/micromind` -2. install the additional dependencies for this recipe with `pip install -r extra_requirements.txt` -2. launch the training script on the dataset you want - -### MNIST -``` -python classification.py ~/data/mnist -b 128 --dataset torch/mnist --num-classes 10 \ - --model phinet --input-size 1 28 28 --epochs 20 --amp \ - --opt adam --lr 0.01 --weight-decay 0.01 --no-aug \ - --pin-mem --apex-amp --use-multi-epochs-loader --mean 0.1307 --std 0.3081 --dataset-download --log-interval 100 \ - --alpha 0.5 --num_layers 4 --beta 1 --t_zero 6 --experiment mnist -``` - -### CIFAR-10 -``` -python classification.py ~/data/cifar10 -b 64 --dataset torch/cifar10 --num-classes 10 \ - --model phinet --input-size 3 160 160 --epochs 100 --amp \ - --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ - --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ - --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar10 \ - --alpha 3 --beta 0.75 --t_zero 6 --num_layers 7 -``` - -### CIFAR-100 -``` -python classification.py ~/data/cifar100 -b 64 --dataset torch/cifar100 --num-classes 100 \ - --model phinet --input-size 3 160 160 --epochs 100 --amp \ - --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ - --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ - --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar100 \ - --alpha 3 --beta 0.75 --t_zero 6 --num_layers 7 -``` - -In the table is a list of PhiNet's performance on some common image classification benchmarks. 
- -| Dataset | Model name | Top 1 Accuracy | Top 5 Accuracy | -| -------- | ------------------ |---------------- | -------------- | -| MNIST | `PhiNet(alpha=0.5, beta=1, t_zero=6, num_layers=4)` | 98.96% | 100.00% | -| CIFAR-10 | `PhiNet(alpha=3, beta=0.75, t_zero=6, num_layers=7)` | 93.61% | 99.77% | -| CIFAR-100 | `PhiNet(alpha=3, beta=0.75, t_zero=6, num_layers=7)` | 75.56% | 93.5% | - -### Cite PhiNets -``` -@article{10.1145/3510832, - author = {Paissan, Francesco and Ancilotto, Alberto and Farella, Elisabetta}, - title = {PhiNets: A Scalable Backbone for Low-Power AI at the Edge}, - year = {2022}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - url = {https://doi.org/10.1145/3510832}, - doi = {10.1145/3510832}, - journal = {ACM Trans. Embed. Comput. Syst.}, -} -``` diff --git a/recipes/image_classification/classification.py b/recipes/image_classification/classification.py deleted file mode 100644 index c85d216..0000000 --- a/recipes/image_classification/classification.py +++ /dev/null @@ -1,1551 +0,0 @@ -""" -This code is an adaptation of the imagenet training script from -Ross Wightman (https://github.com/rwightman) modified to train -networks supported inside phinet. - -Adapted by: - - Mariam Jamal, 2023 - - Francesco Paissan, 2023 - -""" -import logging -import os -import time -from collections import OrderedDict -from contextlib import suppress -from datetime import datetime - -import torch - -# should speed up backward-pass with depth-wise separable convolutions -import torch.backends.cudnn as cudnn -import torch.nn as nn -import torchvision.utils -import yaml -from timm import utils -from timm.data import ( - AugMixDataset, - FastCollateMixup, - Mixup, - create_dataset, - create_loader, - resolve_data_config, -) -from timm.loss import ( - BinaryCrossEntropy, - JsdCrossEntropy, - LabelSmoothingCrossEntropy, - SoftTargetCrossEntropy, -) -from timm.models import ( - convert_splitbn_model, - convert_sync_batchnorm, - create_model, - load_checkpoint, - model_parameters, - resume_checkpoint, - safe_model_name, - set_fast_norm, -) -from timm.optim import create_optimizer_v2, optimizer_kwargs -from timm.scheduler import create_scheduler -from timm.utils import ApexScaler, NativeScaler -from torch.nn.parallel import DistributedDataParallel as NativeDDP - -# Model interface -from micromind import PhiNet - -# For argparse from multiple files -from micromind.utils import configlib -from micromind.utils.configlib import config as args - -cudnn.benchmark = True - -try: - from apex import amp - from apex.parallel import DistributedDataParallel as ApexDDP - from apex.parallel import convert_syncbn_model - - has_apex = True -except ImportError: - has_apex = False - -has_native_amp = False -try: - if getattr(torch.cuda.amp, "autocast") is not None: - has_native_amp = True -except AttributeError: - pass - -try: - import wandb - - has_wandb = True -except ImportError: - has_wandb = False - -try: - from functorch.compile import memory_efficient_fusion - - has_functorch = True -except ImportError: - has_functorch = False - - -torch.backends.cudnn.benchmark = True -_logger = logging.getLogger("train") - -# The first arg parser parses out only the --config argument, -# this argument is used to load a yaml file containing key-values -# that override the defaults for the main parser below -# config_parser = configlib.add_parser("Classification training config") -# -# parser.add_argument( -# "-c", -# "--config", -# default="", -# type=str, -# metavar="FILE", -# 
help="YAML config file specifying default arguments", -# ) - - -# Dataset parameters -group = configlib.add_parser("Dataset parameters") -# Keep this argument outside of the dataset group because it is positional. -group.add_argument("data_dir", metavar="DIR", help="path to dataset") -group.add_argument( - "--dataset", - "-d", - metavar="NAME", - default="", - help="dataset type (default: ImageFolder/ImageTar if empty)", -) -group.add_argument( - "--train-split", - metavar="NAME", - default="train", - help="dataset train split (default: train)", -) -group.add_argument( - "--val-split", - metavar="NAME", - default="validation", - help="dataset validation split (default: validation)", -) -group.add_argument( - "--dataset-download", - action="store_true", - default=False, - help="Allow download of dataset for torch/ and tfds/ datasets that support it.", -) -group.add_argument( - "--class-map", - default="", - type=str, - metavar="FILENAME", - help='path to class to idx mapping file (default: "")', -) - -# Model parameters -group = configlib.add_parser("Model parameters") -group.add_argument( - "--model", - default="resnet50", - type=str, - metavar="MODEL", - help='Name of model to train (default: "resnet50"', -) -group.add_argument( - "--pretrained", - action="store_true", - default=False, - help="Start with pretrained version of specified network (if avail)", -) -group.add_argument( - "--initial-checkpoint", - default="", - type=str, - metavar="PATH", - help="Initialize model from this checkpoint (default: none)", -) -group.add_argument( - "--resume", - default="", - type=str, - metavar="PATH", - help="Resume full model and optimizer state from checkpoint (default: none)", -) -group.add_argument( - "--no-resume-opt", - action="store_true", - default=False, - help="prevent resume of optimizer state when resuming model", -) -group.add_argument( - "--num-classes", - type=int, - default=None, - metavar="N", - help="number of label classes (Model default if None)", -) -group.add_argument( - "--gp", - default=None, - type=str, - metavar="POOL", - help="Global pool type, one of (fast, avg, max, avgmax, avgmaxc). \ - Model default if None.", -) -group.add_argument( - "--img-size", - type=int, - default=None, - metavar="N", - help="Image patch size (default: None => model default)", -) -group.add_argument( - "--input-size", - default=None, - nargs=3, - type=int, - metavar="N N N", - help="Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), \ - uses model default if empty", -) -group.add_argument( - "--crop-pct", - default=None, - type=float, - metavar="N", - help="Input image center crop percent (for validation only)", -) -group.add_argument( - "--mean", - type=float, - nargs="+", - default=None, - metavar="MEAN", - help="Override mean pixel value of dataset", -) -group.add_argument( - "--std", - type=float, - nargs="+", - default=None, - metavar="STD", - help="Override std deviation of dataset", -) -group.add_argument( - "--interpolation", - default="", - type=str, - metavar="NAME", - help="Image resize interpolation type (overrides model)", -) -group.add_argument( - "-b", - "--batch-size", - type=int, - default=128, - metavar="N", - help="Input batch size for training (default: 128)", -) -group.add_argument( - "-vb", - "--validation-batch-size", - type=int, - default=None, - metavar="N", - help="Validation batch size override (default: None)", -) -group.add_argument( - "--channels-last", - action="store_true", - default=False, - help="Use channels_last memory layout", -) -scripting_group = group.add_mutually_exclusive_group() -scripting_group.add_argument( - "--torchscript", - dest="torchscript", - action="store_true", - help="torch.jit.script the full model", -) -scripting_group.add_argument( - "--aot-autograd", - default=False, - action="store_true", - help="Enable AOT Autograd support. (It's recommended to use \ - this option with `--fuser nvfuser` together)", -) -group.add_argument( - "--fuser", - default="", - type=str, - help="Select jit fuser. One of ('', 'te', 'old', 'nvfuser')", -) -group.add_argument( - "--fast-norm", - default=False, - action="store_true", - help="enable experimental fast-norm", -) -group.add_argument( - "--grad-checkpointing", - action="store_true", - default=False, - help="Enable gradient checkpointing through model blocks/stages", -) -group.add_argument( - "--alpha", - default=0.5, - type=float, - help="alpha parameter for phinet. Defaults to 0.5", -) -group.add_argument( - "--beta", - default=1.0, - type=float, - help="beta parameter for phinet. Defaults to 1.", -) -group.add_argument( - "--t_zero", - default=4, - type=float, - help="t_zero parameter for phinet. Defaults to 4.", -) -group.add_argument( - "--num_layers", - default=4, - type=int, - help="Number of layers for phinet. Defaults to 4.", -) - -# Optimizer parameters -group = configlib.add_parser("Optimizer parameters") -group.add_argument( - "--opt", - default="sgd", - type=str, - metavar="OPTIMIZER", - help='Optimizer (default: "sgd"', -) -group.add_argument( - "--opt-eps", - default=None, - type=float, - metavar="EPSILON", - help="Optimizer Epsilon (default: None, use opt default)", -) -group.add_argument( - "--opt-betas", - default=None, - type=float, - nargs="+", - metavar="BETA", - help="Optimizer Betas (default: None, use opt default)", -) -group.add_argument( - "--momentum", - type=float, - default=0.9, - metavar="M", - help="Optimizer momentum (default: 0.9)", -) -group.add_argument( - "--weight-decay", type=float, default=2e-5, help="weight decay (default: 2e-5)" -) -group.add_argument( - "--clip-grad", - type=float, - default=None, - metavar="NORM", - help="Clip gradient norm (default: None, no clipping)", -) -group.add_argument( - "--clip-mode", - type=str, - default="norm", - help='Gradient clipping mode. 
One of ("norm", "value", "agc")', -) -group.add_argument( - "--layer-decay", - type=float, - default=None, - help="layer-wise learning rate decay (default: None)", -) - -# Learning rate schedule parameters -group = configlib.add_parser("Learning rate schedule parameters") -group.add_argument( - "--sched", - default="cosine", - type=str, - metavar="SCHEDULER", - help='LR scheduler (default: "step"', -) -group.add_argument( - "--lr", type=float, default=0.05, metavar="LR", help="learning rate (default: 0.05)" -) -group.add_argument( - "--lr-noise", - type=float, - nargs="+", - default=None, - metavar="pct, pct", - help="learning rate noise on/off epoch percentages", -) -group.add_argument( - "--lr-noise-pct", - type=float, - default=0.67, - metavar="PERCENT", - help="learning rate noise limit percent (default: 0.67)", -) -group.add_argument( - "--lr-noise-std", - type=float, - default=1.0, - metavar="STDDEV", - help="learning rate noise std-dev (default: 1.0)", -) -group.add_argument( - "--lr-cycle-mul", - type=float, - default=1.0, - metavar="MULT", - help="learning rate cycle len multiplier (default: 1.0)", -) -group.add_argument( - "--lr-cycle-decay", - type=float, - default=0.5, - metavar="MULT", - help="amount to decay each learning rate cycle (default: 0.5)", -) -group.add_argument( - "--lr-cycle-limit", - type=int, - default=1, - metavar="N", - help="learning rate cycle limit, cycles enabled if > 1", -) -group.add_argument( - "--lr-k-decay", - type=float, - default=1.0, - help="learning rate k-decay for cosine/poly (default: 1.0)", -) -group.add_argument( - "--warmup-lr", - type=float, - default=0.0001, - metavar="LR", - help="warmup learning rate (default: 0.0001)", -) -group.add_argument( - "--min-lr", - type=float, - default=1e-6, - metavar="LR", - help="lower lr bound for cyclic schedulers that hit 0 (1e-5)", -) -group.add_argument( - "--epochs", - type=int, - default=300, - metavar="N", - help="number of epochs to train (default: 300)", -) -group.add_argument( - "--epoch-repeats", - type=float, - default=0.0, - metavar="N", - help="epoch repeat multiplier \ - (number of times to repeat dataset epoch per train epoch).", -) -group.add_argument( - "--start-epoch", - default=None, - type=int, - metavar="N", - help="manual epoch number (useful on restarts)", -) -group.add_argument( - "--decay-milestones", - default=[30, 60], - type=int, - nargs="+", - metavar="MILESTONES", - help="list of decay epoch indices for multistep lr. 
must be increasing", -) -group.add_argument( - "--decay-epochs", - type=float, - default=100, - metavar="N", - help="epoch interval to decay LR", -) -group.add_argument( - "--warmup-epochs", - type=int, - default=3, - metavar="N", - help="epochs to warmup LR, if scheduler supports", -) -group.add_argument( - "--cooldown-epochs", - type=int, - default=10, - metavar="N", - help="epochs to cooldown LR at min_lr, after cyclic schedule ends", -) -group.add_argument( - "--patience-epochs", - type=int, - default=10, - metavar="N", - help="patience epochs for Plateau LR scheduler (default: 10", -) -group.add_argument( - "--decay-rate", - "--dr", - type=float, - default=0.1, - metavar="RATE", - help="LR decay rate (default: 0.1)", -) - -# Augmentation & regularization parameters -group = configlib.add_parser("Augmentation and regularization parameters") -group.add_argument( - "--no-aug", - action="store_true", - default=False, - help="Disable all training augmentation, override other train aug args", -) -group.add_argument( - "--scale", - type=float, - nargs="+", - default=[0.08, 1.0], - metavar="PCT", - help="Random resize scale (default: 0.08 1.0)", -) -group.add_argument( - "--ratio", - type=float, - nargs="+", - default=[3.0 / 4.0, 4.0 / 3.0], - metavar="RATIO", - help="Random resize aspect ratio (default: 0.75 1.33)", -) -group.add_argument( - "--hflip", type=float, default=0.5, help="Horizontal flip training aug probability" -) -group.add_argument( - "--vflip", type=float, default=0.0, help="Vertical flip training aug probability" -) -group.add_argument( - "--color-jitter", - type=float, - default=0.4, - metavar="PCT", - help="Color jitter factor (default: 0.4)", -) -group.add_argument( - "--aa", - type=str, - default=None, - metavar="NAME", - help='Use AutoAugment policy. "v0" or "original". (default: None)', -), -group.add_argument( - "--aug-repeats", - type=float, - default=0, - help="Number of augmentation repetitions (distributed training only) (default: 0)", -) -group.add_argument( - "--aug-splits", - type=int, - default=0, - help="Number of augmentation splits (default: 0, valid: 0 or >=2)", -) -group.add_argument( - "--jsd-loss", - action="store_true", - default=False, - help="Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.", -) -group.add_argument( - "--bce-loss", - action="store_true", - default=False, - help="Enable BCE loss w/ Mixup/CutMix use.", -) -group.add_argument( - "--bce-target-thresh", - type=float, - default=None, - help="Threshold for binarizing softened BCE targets (default: None, disabled)", -) -group.add_argument( - "--reprob", - type=float, - default=0.0, - metavar="PCT", - help="Random erase prob (default: 0.)", -) -group.add_argument( - "--remode", type=str, default="pixel", help='Random erase mode (default: "pixel")' -) -group.add_argument( - "--recount", type=int, default=1, help="Random erase count (default: 1)" -) -group.add_argument( - "--resplit", - action="store_true", - default=False, - help="Do not random erase first (clean) augmentation split", -) -group.add_argument( - "--mixup", - type=float, - default=0.0, - help="mixup alpha, mixup enabled if > 0. (default: 0.)", -) -group.add_argument( - "--cutmix", - type=float, - default=0.0, - help="cutmix alpha, cutmix enabled if > 0. 
(default: 0.)", -) -group.add_argument( - "--cutmix-minmax", - type=float, - nargs="+", - default=None, - help="cutmix min/max ratio, overrides alpha \ - and enables cutmix if set (default: None)", -) -group.add_argument( - "--mixup-prob", - type=float, - default=1.0, - help="Probability of performing mixup or cutmix when either/both is enabled", -) -group.add_argument( - "--mixup-switch-prob", - type=float, - default=0.5, - help="Probability of switching to cutmix when both mixup and cutmix enabled", -) -group.add_argument( - "--mixup-mode", - type=str, - default="batch", - help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"', -) -group.add_argument( - "--mixup-off-epoch", - default=0, - type=int, - metavar="N", - help="Turn off mixup after this epoch, disabled if 0 (default: 0)", -) -group.add_argument( - "--smoothing", type=float, default=0.1, help="Label smoothing (default: 0.1)" -) -group.add_argument( - "--train-interpolation", - type=str, - default="random", - help='Training interpolation (random, bilinear, bicubic default: "random")', -) -group.add_argument( - "--drop", type=float, default=0.0, metavar="PCT", help="Dropout rate (default: 0.)" -) -group.add_argument( - "--drop-connect", - type=float, - default=None, - metavar="PCT", - help="Drop connect rate, DEPRECATED, use drop-path (default: None)", -) -group.add_argument( - "--drop-path", - type=float, - default=None, - metavar="PCT", - help="Drop path rate (default: None)", -) -group.add_argument( - "--drop-block", - type=float, - default=None, - metavar="PCT", - help="Drop block rate (default: None)", -) - -# Batch norm parameters (only works with gen_efficientnet based models currently) -group = configlib.add_parser( - "Batch norm parameters", "Only works with gen_efficientnet based models currently." -) -group.add_argument( - "--bn-momentum", - type=float, - default=None, - help="BatchNorm momentum override (if not None)", -) -group.add_argument( - "--bn-eps", - type=float, - default=None, - help="BatchNorm epsilon override (if not None)", -) -group.add_argument( - "--sync-bn", - action="store_true", - help="Enable NVIDIA Apex or Torch synchronized BatchNorm.", -) -group.add_argument( - "--dist-bn", - type=str, - default="reduce", - help='Distribute BatchNorm stats between nodes after each epoch \ - ("broadcast", "reduce", or "")', -) -group.add_argument( - "--split-bn", - action="store_true", - help="Enable separate BN layers per augmentation split.", -) - -# Model Exponential Moving Average -group = configlib.add_parser("Model exponential moving average parameters") -group.add_argument( - "--model-ema", - action="store_true", - default=False, - help="Enable tracking moving average of model weights", -) -group.add_argument( - "--model-ema-force-cpu", - action="store_true", - default=False, - help="Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.", -) -group.add_argument( - "--model-ema-decay", - type=float, - default=0.9998, - help="decay factor for model weights moving average (default: 0.9998)", -) - -# Misc -group = configlib.add_parser("Miscellaneous parameters") -group.add_argument( - "--seed", type=int, default=42, metavar="S", help="random seed (default: 42)" -) -group.add_argument( - "--worker-seeding", type=str, default="all", help="worker seed mode (default: all)" -) -group.add_argument( - "--log-interval", - type=int, - default=5, - metavar="N", - help="how many batches to wait before logging training status", -) -group.add_argument( - "--recovery-interval", - type=int, - default=0, - metavar="N", - help="how many batches to wait before writing recovery checkpoint", -) -group.add_argument( - "--checkpoint-hist", - type=int, - default=10, - metavar="N", - help="number of checkpoints to keep (default: 10)", -) -group.add_argument( - "-j", - "--workers", - type=int, - default=4, - metavar="N", - help="how many training processes to use (default: 4)", -) -group.add_argument( - "--save-images", - action="store_true", - default=False, - help="save images of input bathes every log interval for debugging", -) -group.add_argument( - "--amp", - action="store_true", - default=False, - help="use NVIDIA Apex AMP or Native AMP for mixed precision training", -) -group.add_argument( - "--apex-amp", - action="store_true", - default=False, - help="Use NVIDIA Apex AMP mixed precision", -) -group.add_argument( - "--native-amp", - action="store_true", - default=False, - help="Use Native Torch AMP mixed precision", -) -group.add_argument( - "--no-ddp-bb", - action="store_true", - default=False, - help="Force broadcast buffers for native DDP to off.", -) -group.add_argument( - "--pin-mem", - action="store_true", - default=False, - help="Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.", -) -group.add_argument( - "--no-prefetcher", - action="store_true", - default=False, - help="disable fast prefetcher", -) -group.add_argument( - "--output", - default="", - type=str, - metavar="PATH", - help="path to output folder (default: none, current dir)", -) -group.add_argument( - "--experiment", - default="", - type=str, - metavar="NAME", - help="name of train experiment, name of sub-folder for output", -) -group.add_argument( - "--eval-metric", - default="top1", - type=str, - metavar="EVAL_METRIC", - help='Best metric (default: "top1"', -) -group.add_argument( - "--tta", - type=int, - default=0, - metavar="N", - help="Test/inference time augmentation (oversampling) factor. 0=None (default: 0)", -) -group.add_argument("--local_rank", default=0, type=int) -group.add_argument( - "--use-multi-epochs-loader", - action="store_true", - default=False, - help="use the multi-epochs-loader to save time at the beginning of every epoch", -) -group.add_argument( - "--log-wandb", - action="store_true", - default=False, - help="log training and validation metrics to wandb", -) - - -def _parse_args(): - # Do we have a config file to parse? - # args_config, remaining = config_parser.parse_known_args() - # if args_config.config: - # with open(args_config.config, "r") as f: - # cfg = yaml.safe_load(f) - # parser.set_defaults(**cfg) - - # The main arg parser parses the rest of the args, the usual - # defaults will have been overridden if config file specified. 
- # args = parser.parse_args(remaining) - configlib.parse() - - # Cache the args as a text string to save them in the output dir later - args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) - return args, args_text - - -def main(): - utils.setup_default_logging() - args, args_text = _parse_args() - - args.prefetcher = not args.no_prefetcher - args.distributed = False - if "WORLD_SIZE" in os.environ: - args.distributed = int(os.environ["WORLD_SIZE"]) > 1 - args.device = "cuda:0" - args.world_size = 1 - args.rank = 0 # global rank - if args.distributed: - if "LOCAL_RANK" in os.environ: - args.local_rank = int(os.getenv("LOCAL_RANK")) - args.device = "cuda:%d" % args.local_rank - torch.cuda.set_device(args.local_rank) - torch.distributed.init_process_group(backend="nccl", init_method="env://") - args.world_size = torch.distributed.get_world_size() - args.rank = torch.distributed.get_rank() - _logger.info( - "Training in distributed mode with multiple processes, \ - 1 GPU per process. Process %d, total %d." - % (args.rank, args.world_size) - ) - else: - _logger.info("Training with a single process on 1 GPUs.") - assert args.rank >= 0 - - if args.rank == 0 and args.log_wandb: - if has_wandb: - wandb.init(project=args.experiment, config=args) - else: - _logger.warning( - "You've requested to log metrics to wandb but package not found. " - "Metrics not being logged to wandb, try `pip install wandb`" - ) - - # resolve AMP arguments based on PyTorch / Apex availability - use_amp = None - if args.amp: - # `--amp` chooses native amp before apex (APEX ver not actively maintained) - if has_native_amp: - args.native_amp = True - elif has_apex: - args.apex_amp = True - if args.apex_amp and has_apex: - use_amp = "apex" - elif args.native_amp and has_native_amp: - use_amp = "native" - elif args.apex_amp or args.native_amp: - _logger.warning( - "Neither APEX or native Torch AMP is available, using float32. " - "Install NVIDA apex or upgrade to PyTorch 1.6" - ) - - utils.random_seed(args.seed, args.rank) - - if args.fuser: - utils.set_jit_fuser(args.fuser) - if args.fast_norm: - set_fast_norm() - - if args.model == "phinet": - model = PhiNet( - input_shape=vars(args)["input_size"], - alpha=vars(args)["alpha"], - num_layers=vars(args)["num_layers"], - beta=vars(args)["beta"], - t_zero=vars(args)["t_zero"], - include_top=True, - num_classes=vars(args)["num_classes"], - compatibility=False, - ) - else: - model = create_model( - args.model, - pretrained=args.pretrained, - num_classes=args.num_classes, - drop_rate=args.drop, - drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path - drop_path_rate=args.drop_path, - drop_block_rate=args.drop_block, - global_pool=args.gp, - bn_momentum=args.bn_momentum, - bn_eps=args.bn_eps, - scriptable=args.torchscript, - checkpoint_path=args.initial_checkpoint, - ) - - if args.num_classes is None: - assert hasattr( - model, "num_classes" - ), "Model must have `num_classes` attr if not set on cmd line/config." 
- args.num_classes = ( - model.num_classes - ) # FIXME handle model default vs config num_classes more elegantly - - if args.grad_checkpointing: - model.set_grad_checkpointing(enable=True) - - if args.local_rank == 0: - _logger.info( - f"Model {safe_model_name(args.model)} created, \ - param count:{sum([m.numel() for m in model.parameters()])}" - ) - - data_config = resolve_data_config( - vars(args), model=model, verbose=args.local_rank == 0 - ) - - # setup augmentation batch splits for contrastive loss or split bn - num_aug_splits = 0 - if args.aug_splits > 0: - assert args.aug_splits > 1, "A split of 1 makes no sense" - num_aug_splits = args.aug_splits - - # enable split bn (separate bn stats per batch-portion) - if args.split_bn: - assert num_aug_splits > 1 or args.resplit - model = convert_splitbn_model(model, max(num_aug_splits, 2)) - - # move model to GPU, enable channels last layout if set - model.cuda() - if args.channels_last: - model = model.to(memory_format=torch.channels_last) - - # setup synchronized BatchNorm for distributed training - if args.distributed and args.sync_bn: - args.dist_bn = "" # disable dist_bn when sync BN active - assert not args.split_bn - if has_apex and use_amp == "apex": - # Apex SyncBN used with Apex AMP - # WARNING this won't currently work with models using BatchNormAct2d - model = convert_syncbn_model(model) - else: - model = convert_sync_batchnorm(model) - if args.local_rank == 0: - _logger.info( - "Converted model to use Synchronized BatchNorm. \ - WARNING: You may have issues if using " - "zero initialized BN layers (enabled by default for ResNets) \ - while sync-bn enabled." - ) - - if args.torchscript: - assert not use_amp == "apex", "Cannot use APEX AMP with torchscripted model" - assert not args.sync_bn, "Cannot use SyncBatchNorm with torchscripted model" - model = torch.jit.script(model) - if args.aot_autograd: - assert has_functorch, "functorch is needed for --aot-autograd" - model = memory_efficient_fusion(model) - - optimizer = create_optimizer_v2(model, **optimizer_kwargs(cfg=args)) - - # setup automatic mixed-precision (AMP) loss scaling and op casting - amp_autocast = suppress # do nothing - loss_scaler = None - if use_amp == "apex": - model, optimizer = amp.initialize(model, optimizer, opt_level="O1") - loss_scaler = ApexScaler() - if args.local_rank == 0: - _logger.info("Using NVIDIA APEX AMP. Training in mixed precision.") - elif use_amp == "native": - amp_autocast = torch.cuda.amp.autocast - loss_scaler = NativeScaler() - if args.local_rank == 0: - _logger.info("Using native Torch AMP. Training in mixed precision.") - else: - if args.local_rank == 0: - _logger.info("AMP not enabled. 
Training in float32.") - - # optionally resume from a checkpoint - resume_epoch = None - if args.resume: - resume_epoch = resume_checkpoint( - model, - args.resume, - optimizer=None if args.no_resume_opt else optimizer, - loss_scaler=None if args.no_resume_opt else loss_scaler, - log_info=args.local_rank == 0, - ) - - # setup exponential moving average of model weights, SWA could be used here too - model_ema = None - if args.model_ema: - # Important to create EMA model after cuda() - # DP wrapper, and AMP but before DDP wrapper - model_ema = utils.ModelEmaV2( - model, - decay=args.model_ema_decay, - device="cpu" if args.model_ema_force_cpu else None, - ) - if args.resume: - load_checkpoint(model_ema.module, args.resume, use_ema=True) - - # setup distributed training - if args.distributed: - if has_apex and use_amp == "apex": - # Apex DDP preferred unless native amp is activated - if args.local_rank == 0: - _logger.info("Using NVIDIA APEX DistributedDataParallel.") - model = ApexDDP(model, delay_allreduce=True) - else: - if args.local_rank == 0: - _logger.info("Using native Torch DistributedDataParallel.") - model = NativeDDP( - model, - device_ids=[args.local_rank], - broadcast_buffers=not args.no_ddp_bb, - ) - # NOTE: EMA model does not need to be wrapped by DDP - - # setup learning rate schedule and starting epoch - lr_scheduler, num_epochs = create_scheduler(args, optimizer) - start_epoch = 0 - if args.start_epoch is not None: - # a specified start_epoch will always override the resume epoch - start_epoch = args.start_epoch - elif resume_epoch is not None: - start_epoch = resume_epoch - if lr_scheduler is not None and start_epoch > 0: - lr_scheduler.step(start_epoch) - - if args.local_rank == 0: - _logger.info("Scheduled epochs: {}".format(num_epochs)) - - # create the train and eval datasets - dataset_train = create_dataset( - args.dataset, - root=args.data_dir, - split=args.train_split, - is_training=True, - class_map=args.class_map, - download=args.dataset_download, - batch_size=args.batch_size, - repeats=args.epoch_repeats, - ) - dataset_eval = create_dataset( - args.dataset, - root=args.data_dir, - split=args.val_split, - is_training=False, - class_map=args.class_map, - download=args.dataset_download, - batch_size=args.batch_size, - ) - - # setup mixup / cutmix - collate_fn = None - mixup_fn = None - mixup_active = args.mixup > 0 or args.cutmix > 0.0 or args.cutmix_minmax is not None - if mixup_active: - mixup_args = dict( - mixup_alpha=args.mixup, - cutmix_alpha=args.cutmix, - cutmix_minmax=args.cutmix_minmax, - prob=args.mixup_prob, - switch_prob=args.mixup_switch_prob, - mode=args.mixup_mode, - label_smoothing=args.smoothing, - num_classes=args.num_classes, - ) - if args.prefetcher: - assert ( - not num_aug_splits - ) # collate conflict (need to support deinterleaving in collate mixup) - collate_fn = FastCollateMixup(**mixup_args) - else: - mixup_fn = Mixup(**mixup_args) - - # wrap dataset in AugMix helper - if num_aug_splits > 1: - dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) - - # create data loaders w/ augmentation pipeiine - train_interpolation = args.train_interpolation - if args.no_aug or not train_interpolation: - train_interpolation = data_config["interpolation"] - loader_train = create_loader( - dataset_train, - input_size=data_config["input_size"], - batch_size=args.batch_size, - is_training=True, - use_prefetcher=args.prefetcher, - no_aug=args.no_aug, - re_prob=args.reprob, - re_mode=args.remode, - re_count=args.recount, - 
re_split=args.resplit, - scale=args.scale, - ratio=args.ratio, - hflip=args.hflip, - vflip=args.vflip, - color_jitter=args.color_jitter, - auto_augment=args.aa, - num_aug_repeats=args.aug_repeats, - num_aug_splits=num_aug_splits, - interpolation=train_interpolation, - mean=data_config["mean"], - std=data_config["std"], - num_workers=args.workers, - distributed=args.distributed, - collate_fn=collate_fn, - pin_memory=args.pin_mem, - use_multi_epochs_loader=args.use_multi_epochs_loader, - worker_seeding=args.worker_seeding, - ) - - loader_eval = create_loader( - dataset_eval, - input_size=data_config["input_size"], - batch_size=args.validation_batch_size or args.batch_size, - is_training=False, - use_prefetcher=args.prefetcher, - interpolation=data_config["interpolation"], - mean=data_config["mean"], - std=data_config["std"], - num_workers=args.workers, - distributed=args.distributed, - crop_pct=data_config["crop_pct"], - pin_memory=args.pin_mem, - ) - - # setup loss function - if args.jsd_loss: - assert num_aug_splits > 1 # JSD only valid with aug splits set - train_loss_fn = JsdCrossEntropy( - num_splits=num_aug_splits, smoothing=args.smoothing - ) - elif mixup_active: - # smoothing is handled with mixup target transform which outputs sparse, - # soft targets - if args.bce_loss: - train_loss_fn = BinaryCrossEntropy(target_threshold=args.bce_target_thresh) - else: - train_loss_fn = SoftTargetCrossEntropy() - elif args.smoothing: - if args.bce_loss: - train_loss_fn = BinaryCrossEntropy( - smoothing=args.smoothing, target_threshold=args.bce_target_thresh - ) - else: - train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing) - else: - train_loss_fn = nn.CrossEntropyLoss() - train_loss_fn = train_loss_fn.cuda() - validate_loss_fn = nn.CrossEntropyLoss().cuda() - - # setup checkpoint saver and eval metric tracking - eval_metric = args.eval_metric - best_metric = None - best_epoch = None - saver = None - output_dir = None - if args.rank == 0: - if args.experiment: - exp_name = args.experiment - else: - exp_name = "-".join( - [ - datetime.now().strftime("%Y%m%d-%H%M%S"), - safe_model_name(args.model), - str(data_config["input_size"][-1]), - ] - ) - output_dir = utils.get_outdir( - args.output if args.output else "./output/train", exp_name - ) - decreasing = True if eval_metric == "loss" else False - saver = utils.CheckpointSaver( - model=model, - optimizer=optimizer, - args=args, - model_ema=model_ema, - amp_scaler=loss_scaler, - checkpoint_dir=output_dir, - recovery_dir=output_dir, - decreasing=decreasing, - max_history=args.checkpoint_hist, - ) - with open(os.path.join(output_dir, "args.yaml"), "w") as f: - f.write(args_text) - - try: - for epoch in range(start_epoch, num_epochs): - if args.distributed and hasattr(loader_train.sampler, "set_epoch"): - loader_train.sampler.set_epoch(epoch) - - train_metrics = train_one_epoch( - epoch, - model, - loader_train, - optimizer, - train_loss_fn, - args, - lr_scheduler=lr_scheduler, - saver=saver, - output_dir=output_dir, - amp_autocast=amp_autocast, - loss_scaler=loss_scaler, - model_ema=model_ema, - mixup_fn=mixup_fn, - ) - - if args.distributed and args.dist_bn in ("broadcast", "reduce"): - if args.local_rank == 0: - _logger.info("Distributing BatchNorm running means and vars") - utils.distribute_bn(model, args.world_size, args.dist_bn == "reduce") - - eval_metrics = validate( - model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast - ) - - if model_ema is not None and not args.model_ema_force_cpu: - if args.distributed and 
args.dist_bn in ("broadcast", "reduce"): - utils.distribute_bn( - model_ema, args.world_size, args.dist_bn == "reduce" - ) - ema_eval_metrics = validate( - model_ema.module, - loader_eval, - validate_loss_fn, - args, - amp_autocast=amp_autocast, - log_suffix=" (EMA)", - ) - eval_metrics = ema_eval_metrics - - if lr_scheduler is not None: - # step LR for next epoch - lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) - - if output_dir is not None: - utils.update_summary( - epoch, - train_metrics, - eval_metrics, - os.path.join(output_dir, "summary.csv"), - write_header=best_metric is None, - log_wandb=args.log_wandb and has_wandb, - ) - - if saver is not None: - # save proper checkpoint with eval metric - save_metric = eval_metrics[eval_metric] - best_metric, best_epoch = saver.save_checkpoint( - epoch, metric=save_metric - ) - - except KeyboardInterrupt: - pass - if best_metric is not None: - _logger.info("*** Best metric: {0} (epoch {1})".format(best_metric, best_epoch)) - - -def train_one_epoch( - epoch, - model, - loader, - optimizer, - loss_fn, - args, - lr_scheduler=None, - saver=None, - output_dir=None, - amp_autocast=suppress, - loss_scaler=None, - model_ema=None, - mixup_fn=None, -): - if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: - if args.prefetcher and loader.mixup_enabled: - loader.mixup_enabled = False - elif mixup_fn is not None: - mixup_fn.mixup_enabled = False - - second_order = hasattr(optimizer, "is_second_order") and optimizer.is_second_order - batch_time_m = utils.AverageMeter() - data_time_m = utils.AverageMeter() - losses_m = utils.AverageMeter() - - model.train() - - end = time.time() - last_idx = len(loader) - 1 - num_updates = epoch * len(loader) - for batch_idx, (input, target) in enumerate(loader): - last_batch = batch_idx == last_idx - data_time_m.update(time.time() - end) - if not args.prefetcher: - input, target = input.cuda(), target.cuda() - if mixup_fn is not None: - input, target = mixup_fn(input, target) - if args.channels_last: - input = input.contiguous(memory_format=torch.channels_last) - - with amp_autocast(): - output = model(input) - loss = loss_fn(output, target) - - if not args.distributed: - losses_m.update(loss.item(), input.size(0)) - - optimizer.zero_grad() - if loss_scaler is not None: - loss_scaler( - loss, - optimizer, - clip_grad=args.clip_grad, - clip_mode=args.clip_mode, - parameters=model_parameters( - model, exclude_head="agc" in args.clip_mode - ), - create_graph=second_order, - ) - else: - loss.backward(create_graph=second_order) - if args.clip_grad is not None: - utils.dispatch_clip_grad( - model_parameters(model, exclude_head="agc" in args.clip_mode), - value=args.clip_grad, - mode=args.clip_mode, - ) - optimizer.step() - - if model_ema is not None: - model_ema.update(model) - - torch.cuda.synchronize() - num_updates += 1 - batch_time_m.update(time.time() - end) - if last_batch or batch_idx % args.log_interval == 0: - lrl = [param_group["lr"] for param_group in optimizer.param_groups] - lr = sum(lrl) / len(lrl) - - if args.distributed: - reduced_loss = utils.reduce_tensor(loss.data, args.world_size) - losses_m.update(reduced_loss.item(), input.size(0)) - - if args.local_rank == 0: - _logger.info( - "Train: {} [{:>4d}/{} ({:>3.0f}%)] " - "Loss: {loss.val:#.4g} ({loss.avg:#.3g}) " - "Time: {batch_time.val:.3f}s, {rate:>7.2f}/s " - "({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) " - "LR: {lr:.3e} " - "Data: {data_time.val:.3f} ({data_time.avg:.3f})".format( - epoch, - batch_idx, - len(loader), - 100.0 * batch_idx / 
last_idx, - loss=losses_m, - batch_time=batch_time_m, - rate=input.size(0) * args.world_size / batch_time_m.val, - rate_avg=input.size(0) * args.world_size / batch_time_m.avg, - lr=lr, - data_time=data_time_m, - ) - ) - - if args.save_images and output_dir: - torchvision.utils.save_image( - input, - os.path.join(output_dir, "train-batch-%d.jpg" % batch_idx), - padding=0, - normalize=True, - ) - - if ( - saver is not None - and args.recovery_interval - and (last_batch or (batch_idx + 1) % args.recovery_interval == 0) - ): - saver.save_recovery(epoch, batch_idx=batch_idx) - - if lr_scheduler is not None: - lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) - - end = time.time() - # end for - - if hasattr(optimizer, "sync_lookahead"): - optimizer.sync_lookahead() - - return OrderedDict([("loss", losses_m.avg)]) - - -def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=""): - batch_time_m = utils.AverageMeter() - losses_m = utils.AverageMeter() - top1_m = utils.AverageMeter() - top5_m = utils.AverageMeter() - - model.eval() - - end = time.time() - last_idx = len(loader) - 1 - with torch.no_grad(): - for batch_idx, (input, target) in enumerate(loader): - last_batch = batch_idx == last_idx - if not args.prefetcher: - input = input.cuda() - target = target.cuda() - if args.channels_last: - input = input.contiguous(memory_format=torch.channels_last) - - with amp_autocast(): - output = model(input) - if isinstance(output, (tuple, list)): - output = output[0] - - # augmentation reduction - reduce_factor = args.tta - if reduce_factor > 1: - output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) - target = target[0 : target.size(0) : reduce_factor] - - loss = loss_fn(output, target) - acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) - - if args.distributed: - reduced_loss = utils.reduce_tensor(loss.data, args.world_size) - acc1 = utils.reduce_tensor(acc1, args.world_size) - acc5 = utils.reduce_tensor(acc5, args.world_size) - else: - reduced_loss = loss.data - - torch.cuda.synchronize() - - losses_m.update(reduced_loss.item(), input.size(0)) - top1_m.update(acc1.item(), output.size(0)) - top5_m.update(acc5.item(), output.size(0)) - - batch_time_m.update(time.time() - end) - end = time.time() - if args.local_rank == 0 and ( - last_batch or batch_idx % args.log_interval == 0 - ): - log_name = "Test" + log_suffix - _logger.info( - "{0}: [{1:>4d}/{2}] " - "Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) " - "Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) " - "Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) " - "Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})".format( - log_name, - batch_idx, - last_idx, - batch_time=batch_time_m, - loss=losses_m, - top1=top1_m, - top5=top5_m, - ) - ) - - metrics = OrderedDict( - [("loss", losses_m.avg), ("top1", top1_m.avg), ("top5", top5_m.avg)] - ) - - return metrics - - -if __name__ == "__main__": - main() diff --git a/recipes/image_classification/distributed_train.sh b/recipes/image_classification/distributed_train.sh deleted file mode 100755 index df2fbe1..0000000 --- a/recipes/image_classification/distributed_train.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -NUM_PROC=$1 -shift -python3 -m torch.distributed.launch --nproc_per_node=$NUM_PROC classification.py "$@" diff --git a/recipes/image_classification/extra_requirements.txt b/recipes/image_classification/extra_requirements.txt deleted file mode 100644 index bdbf074..0000000 --- a/recipes/image_classification/extra_requirements.txt +++ /dev/null @@ -1 +0,0 @@ 
-timm==0.6.13 diff --git a/recipes/image_classification/launch_training.sh b/recipes/image_classification/launch_training.sh deleted file mode 100755 index b6056ac..0000000 --- a/recipes/image_classification/launch_training.sh +++ /dev/null @@ -1,22 +0,0 @@ -# MNIST training -CUDA_VISIBLE_DEVICES=0 python classification.py ~/data/mnist -b 128 --dataset torch/mnist --num-classes 10 \ - --model phinet --input-size 1 28 28 --epochs 20 --amp \ - --opt adam --lr 0.01 --weight-decay 0.01 --no-aug \ - --pin-mem --apex-amp --use-multi-epochs-loader --mean 0.1307 --std 0.3081 --dataset-download --log-interval 100 \ - --alpha 0.25 --num_layers 7 --beta 1 --t_zero 6 --experiment mnist_025_1_6_7_test - -# CIFAR-10 training -# CUDA_VISIBLE_DEVICES=2 python classification.py ~/data/cifar10 -b 64 --dataset torch/cifar10 --num-classes 10 \ -# --model phinet --input-size 3 160 160 --epochs 100 --amp \ -# --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ -# --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ -# --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar10_025_1_6_7 \ -# --alpha 0.25 --beta 1 --t_zero 6 --num_layers 7 - -# CIFAR-100 training -# CUDA_VISIBLE_DEVICES=1 python classification.py ~/data/cifar100 -b 64 --dataset torch/cifar100 --num-classes 100 \ -# --model phinet --input-size 3 160 160 --epochs 100 --amp \ -# --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ -# --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ -# --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar100_025_1_6_7 \ -# --alpha 0.25 --beta 1 --t_zero 6 --num_layers 7 diff --git a/tests/test_networks.py b/tests/test_networks.py index 09c87f5..ed489bd 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -8,8 +8,8 @@ def test_onnx(): - from micromind import PhiNet - from micromind.conversion import convert_to_onnx + from micromind.networks import PhiNet + from micromind.convert import convert_to_onnx save_path = "temp.onnx" @@ -29,8 +29,8 @@ def test_onnx(): def test_openvino(): - from micromind import PhiNet - from micromind.conversion import convert_to_openvino + from micromind.networks import PhiNet + from micromind.convert import convert_to_openvino save_dir = "vino" @@ -45,8 +45,8 @@ def test_openvino(): def test_tflite(): - from micromind import PhiNet - from micromind.conversion import convert_to_tflite + from micromind.networks import PhiNet + from micromind.convert import convert_to_tflite save_path = "tflite"
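For reference, after this refactor the conversion entry points live in `micromind.convert` and the network in `micromind.networks`, as the updated tests show. A minimal sketch of the new import layout follows; the PhiNet hyperparameters are illustrative (they mirror the deleted MNIST recipe), and the `convert_to_onnx(model, path)` argument order is an assumption based on the tests rather than something shown in this diff.

```python
from micromind.networks import PhiNet
from micromind.convert import convert_to_onnx

# Instantiate a small PhiNet (hyperparameters mirror the MNIST recipe).
model = PhiNet(
    input_shape=(1, 28, 28),
    alpha=0.5,
    beta=1.0,
    t_zero=6,
    num_layers=4,
    include_top=True,
    num_classes=10,
)

# Export to ONNX; signature assumed to be (model, save_path).
convert_to_onnx(model, "temp.onnx")
```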