From d318e0b81255a319f1302073c1ebeb5488074fda Mon Sep 17 00:00:00 2001 From: Francesco Paissan <46992226+fpaissan@users.noreply.github.com> Date: Thu, 12 Oct 2023 12:04:17 +0200 Subject: [PATCH] Version 0.1.0 (#41) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Extended unit tests to classifier and fixed pooling (#17) * Extended unit tests to classifier and fixed pooling * Changed trigger of doctest workflow * Fixing issue #18 * fixed linters * Add pre-commit hooks * Doctest only on PRs * Fixed network conversion from GPU Also tested on Windows machine. * Create python_versions.yml * Update and rename python_versions.yml to tests.yml * Update export.yaml * CI fix (#21) * Create pre-commit.yaml * remove code.yaml * fixing pre-commit * Doctest with pytest * change trigger * change trigger * Delete LICENSE * checkpoint from filesystem (#20) * checkpoint from filesystem * fixed deps * Update README.md * Update LICENSE * Updating LICENSE --------- Co-authored-by: fpaissan Co-authored-by: Francesco Paissan <46992226+fpaissan@users.noreply.github.com> * Create LICENSE (#22) * Update README.md (#23) * new min python version to 3.8 * 🐛 extra_requirements now have a version - fixed CI (#24) * 🐛 extra_requirements now have a version * fixed linter errors * testing actions * fixed linter * removing tf_probability * fixed tf prob version --------- Co-authored-by: fpaissan * Documentation upgrade - guide for contribution (#25) * add contribution guide to docs * documentation with contribution guide * cosmetic * bump version 0.0.4 -> 0.0.5 * Bump requests from 2.28.2 to 2.31.0 (#27) Bumps [requests](https://github.com/psf/requests) from 2.28.2 to 2.31.0. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.28.2...v2.31.0) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * fix pypi release * Update README.md (#29) * Patch for faster GPU inference (#35) * Patch for faster GPU inference * remove unused zeropad def --------- Co-authored-by: fpaissan * initial commit * add eval loop * add acceleration * modules as dict * add checkpointer * minor * load best checkpoint * restore epoch, optimizer, lr sched * fix logging on multi-gpu * minor fixes * working on single gpu * fix checkpointer + multi-gpu * fp16 might not be ok yet * load_modules and unwrap_model * fixed convert and export * cosmetic on export * add argparse * add metrics -- check something is off with acc * its print strange * fixed checkpointer viz * fix checkpointers and metrics * cosmetic * linters * add credits * fix requirements * fix unittest * remove recipes * remove unused files * remove unused fuctions from networks * fix tests * hot fix * onnx conversion without convert * fix requirements * add default class config and temp folder for debug mode * add doc for class Metric * finish doc MicroMind * update docs * linters fix * new initial page * bump version 0.0.5 -> 0.1.0 * final touches and bumpver --------- Signed-off-by: dependabot[bot] Co-authored-by: Matteo Beltrami <71525176+matteobeltrami@users.noreply.github.com> Co-authored-by: SebastianCavada Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matteo Tremonti <102596472+Tremo8@users.noreply.github.com> Co-authored-by: Matteo Beltrami --- README.md | 8 + docs/source/index.rst | 68 + docs/source/micromind.conversion.rst | 21 - docs/source/micromind.networks.rst | 8 - docs/source/micromind.rst | 27 +- docs/source/micromind.utils.rst | 20 +- examples/mind.py | 67 + micromind/__init__.py | 6 +- micromind/conversion/__init__.py | 1 - micromind/{conversion => }/convert.py | 124 +- micromind/core.py | 561 ++++++ micromind/networks/phinet.py | 352 +--- micromind/utils/checkpointer.py | 99 ++ micromind/utils/configlib.py | 46 - micromind/utils/helpers.py | 43 + micromind/utils/parse.py | 28 + pyproject.toml | 5 +- recipes/image_classification/README.md | 61 - .../image_classification/classification.py | 1551 ----------------- .../image_classification/distributed_train.sh | 4 - .../extra_requirements.txt | 1 - .../image_classification/launch_training.sh | 22 - tests/test_networks.py | 12 +- 23 files changed, 1097 insertions(+), 2038 deletions(-) delete mode 100644 docs/source/micromind.conversion.rst create mode 100644 examples/mind.py delete mode 100644 micromind/conversion/__init__.py rename micromind/{conversion => }/convert.py (50%) create mode 100644 micromind/core.py create mode 100644 micromind/utils/checkpointer.py delete mode 100644 micromind/utils/configlib.py create mode 100644 micromind/utils/helpers.py create mode 100644 micromind/utils/parse.py delete mode 100644 recipes/image_classification/README.md delete mode 100644 recipes/image_classification/classification.py delete mode 100755 recipes/image_classification/distributed_train.sh delete mode 100644 recipes/image_classification/extra_requirements.txt delete mode 100755 recipes/image_classification/launch_training.sh diff --git a/README.md b/README.md index b3e0fd2..fd39001 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,14 @@ for the basic install. 
To install `micromind` with the full exportability features, run
```
pip install -e .[conversion]
```
+### Training networks
+
+After the installation, get started by looking at the examples and the docs!
+
+### Export your model and run it on your MCU
+Check out [this](https://docs.google.com/document/d/1zt5urvNtI9VSJcoJdIeo10YrdH-tZNcS4JHbT1z5udI/edit?usp=sharing)
+tutorial and have fun deploying your network on an MCU!
+
 ---------------------------------------------------------------------------------------------------------

 ## 📧 Contact
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9f114f7..742fcc8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -6,6 +6,74 @@
 Welcome to micromind's documentation!
 =====================================

+.. image:: https://img.shields.io/badge/python-3.9%20|%203.10-blue
+   :target: https://www.python.org/downloads/
+
+.. image:: https://img.shields.io/badge/License-Apache_2.0-blue.svg
+   :target: https://github.com/fpaissan/micromind/blob/main/LICENSE
+
+.. image:: https://img.shields.io/pypi/v/micromind
+
+This is the official repository of `micromind`, a toolkit that aims to bridge two communities: artificial intelligence and embedded systems. `micromind` is based on `PyTorch <https://pytorch.org/>`_ and provides exportability for the supported models in ONNX, Intel OpenVINO, and TFLite.
+
+Key Features
+------------
+
+- Smooth flow from research to deployment;
+- Support for multimedia analytics recipes (image classification, sound event detection, etc.);
+- Detailed API documentation;
+- Tutorials for embedded deployment.
+
+Installation
+------------
+
+Using Pip
+~~~~~~~~~
+
+First of all, install `Python 3.8 or later <https://www.python.org/downloads/>`_. Open a terminal and run:
+
+.. code:: shell
+
+    pip install micromind
+
+for the basic install. To install `micromind` with the full exportability features, run
+
+.. code:: shell
+
+    pip install micromind[conversion]
+
+
+Basic how-to
+------------
+
+To launch a simple training on an image classification model, you only need to define a class that extends `MicroMind `_: declare the modules you want to use (such as a `PhiNet`), the forward method of the model, and the way your loss function is computed. micromind takes care of the rest for you.
+
+.. code-block:: python
+
+    class ImageClassification(MicroMind):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+
+            self.modules["classifier"] = PhiNet(
+                (3, 32, 32), include_top=True, num_classes=10
+            )
+
+        def forward(self, batch):
+            return self.modules["classifier"](batch[0])
+
+        def compute_loss(self, pred, batch):
+            return nn.CrossEntropyLoss()(pred, batch[1])
+
+Afterwards, you can export the model to the format you like best among **ONNX**, **TFLite**, and **OpenVINO**; just run this simple code:
+
+.. code-block:: python
+
+    m = ImageClassification()
+    m.export("output_onnx", "onnx", (3, 32, 32))
+
+
+Here is the link to the Python `file `_ inside our repository that illustrates how to use the MicroMind class.
+
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
diff --git a/docs/source/micromind.conversion.rst b/docs/source/micromind.conversion.rst
deleted file mode 100644
index 30195cc..0000000
--- a/docs/source/micromind.conversion.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-micromind.conversion package
-============================
-
-Submodules
-----------
-
-micromind.conversion.convert module
------------------------------------
-
-..
automodule:: micromind.conversion.convert - :members: - :undoc-members: - :show-inheritance: - -Module contents ---------------- - -.. automodule:: micromind.conversion - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/micromind.networks.rst b/docs/source/micromind.networks.rst index f9eb3b6..327a2b4 100644 --- a/docs/source/micromind.networks.rst +++ b/docs/source/micromind.networks.rst @@ -11,11 +11,3 @@ micromind.networks.phinet module :members: :undoc-members: :show-inheritance: - -Module contents ---------------- - -.. automodule:: micromind.networks - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/micromind.rst b/docs/source/micromind.rst index 9825874..d85c6b5 100644 --- a/docs/source/micromind.rst +++ b/docs/source/micromind.rst @@ -1,20 +1,27 @@ micromind package ================= +micromind.core module +--------------------- + +.. automodule:: micromind.core + :members: + :undoc-members: + :show-inheritance: + +micromind.convert module +------------------------ + +.. automodule:: micromind.convert + :members: + :undoc-members: + :show-inheritance: + Subpackages ----------- .. toctree:: - :maxdepth: 4 + :maxdepth: 2 - micromind.conversion micromind.networks micromind.utils - -Module contents ---------------- - -.. automodule:: micromind - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/micromind.utils.rst b/docs/source/micromind.utils.rst index 0692030..fd2a7b7 100644 --- a/docs/source/micromind.utils.rst +++ b/docs/source/micromind.utils.rst @@ -4,18 +4,26 @@ micromind.utils package Submodules ---------- -micromind.utils.configlib module --------------------------------- +micromind.utils.checkpointer module +----------------------------------- -.. automodule:: micromind.utils.configlib +.. automodule:: micromind.utils.checkpointer :members: :undoc-members: :show-inheritance: -Module contents ---------------- +micromind.utils.helpers module +------------------------------ -.. automodule:: micromind.utils +.. automodule:: micromind.utils.helpers + :members: + :undoc-members: + :show-inheritance: + +micromind.utils.parse module +---------------------------- + +.. 
automodule:: micromind.utils.parse :members: :undoc-members: :show-inheritance: diff --git a/examples/mind.py b/examples/mind.py new file mode 100644 index 0000000..958d612 --- /dev/null +++ b/examples/mind.py @@ -0,0 +1,67 @@ +from micromind import MicroMind, Metric +from micromind.networks import PhiNet +from micromind.utils.parse import parse_arguments + +import torch +import torch.nn as nn +import torchvision +import torchvision.transforms as transforms + +batch_size = 128 + + +class ImageClassification(MicroMind): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.modules["classifier"] = PhiNet( + (3, 32, 32), include_top=True, num_classes=10 + ) + + def forward(self, batch): + return self.modules["classifier"](batch[0]) + + def compute_loss(self, pred, batch): + return nn.CrossEntropyLoss()(pred, batch[1]) + + +if __name__ == "__main__": + hparams = parse_arguments() + m = ImageClassification(hparams) + + def compute_accuracy(pred, batch): + tmp = (pred.argmax(1) == batch[1]).float() + return tmp + + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + ) + + trainset = torchvision.datasets.CIFAR10( + root="data/cifar-10", train=True, download=True, transform=transform + ) + trainloader = torch.utils.data.DataLoader( + trainset, batch_size=batch_size, shuffle=True, num_workers=1 + ) + + testset = torchvision.datasets.CIFAR10( + root="data/cifar-10", train=False, download=True, transform=transform + ) + testloader = torch.utils.data.DataLoader( + testset, batch_size=batch_size, shuffle=False, num_workers=1 + ) + + acc = Metric(name="accuracy", fn=compute_accuracy) + + m.train( + epochs=10, + datasets={"train": trainloader, "val": testloader, "test": testloader}, + metrics=[acc], + debug=hparams.debug, + ) + + m.test( + datasets={"test": testloader}, + ) + + m.export("output_onnx", "onnx", (3, 32, 32)) diff --git a/micromind/__init__.py b/micromind/__init__.py index 3711e9f..7ec91f3 100644 --- a/micromind/__init__.py +++ b/micromind/__init__.py @@ -1,9 +1,7 @@ -from .networks.phinet import PhiNet -from .utils import configlib - +from .core import MicroMind, Metric, Stage # Package version -__version__ = "0.0.5" +__version__ = "0.1.0" """datasets_info is a dictionary that contains information about the attributes of the datasets. 
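For a quick smoke test of the example above: `examples/mind.py` is driven by the CLI defined in `micromind/utils/parse.py` (added later in this patch). A minimal launch sketch, assuming a local install (`pip install -e .`) and the repository root as the working directory; the experiment name is illustrative:

```shell
# Full training run; checkpoints land in results/cifar10_phinet/save/
# and the log in results/cifar10_phinet/train_log.txt.
python examples/mind.py --lr 0.001 --optimizer adam \
    --experiment_name cifar10_phinet --output_folder results

# Debug mode: only a handful of batches per epoch, and the temporary
# experiment folder is removed at the end of training.
python examples/mind.py --debug
```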
diff --git a/micromind/conversion/__init__.py b/micromind/conversion/__init__.py deleted file mode 100644 index ca0a917..0000000 --- a/micromind/conversion/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .convert import convert_to_onnx, convert_to_openvino, convert_to_tflite diff --git a/micromind/conversion/convert.py b/micromind/convert.py similarity index 50% rename from micromind/conversion/convert.py rename to micromind/convert.py index f8be0aa..25878fc 100644 --- a/micromind/conversion/convert.py +++ b/micromind/convert.py @@ -6,60 +6,89 @@ - Francesco Paissan, 2023 - Alberto Ancilotto, 2023 """ -try: - import os - import shutil - import sys - from pathlib import Path - - import numpy as np - import onnx - import onnxsim - import tensorflow as tf - import torch - import torch.nn as nn - from onnx_tf.backend import prepare - from openvino.tools.mo import main as mo_main -except Exception as e: - print(str(e)) - print("Did you install micromind with conversion capabilities?") - print("Please try again after pip install micromind[conversion].") - quit() +from pathlib import Path +from loguru import logger +from typing import Union +import torch.nn as nn +import torch +import os @torch.no_grad() -def convert_to_onnx(net: nn.Module, save_path: Path, simplify: bool = True): +def convert_to_onnx( + net: nn.Module, + save_path: Union[Path, str] = "model.onnx", + simplify: bool = False, + replace_forward: bool = False, +): """Converts nn.Module to onnx and saves it to save_path. Optionally simplifies it.""" + save_path = Path(save_path) + os.makedirs(save_path.parent, exist_ok=True) x = torch.zeros([1] + list(net.input_shape)) + if replace_forward: + # add forward to ModuleDict + bound_method = net.forward.__get__(net.modules, net.modules.__class__) + setattr(net.modules, "forward", bound_method) + + net.modules.input_shape = net.input_shape + net = net.modules + x = [torch.zeros([1] + list(net.input_shape)), None] + torch.onnx.export( net.cpu(), x, save_path, verbose=False, - input_names=["input"], + input_names=["input", "labels"], output_names=["output"], opset_version=11, ) if simplify: + import onnx + import onnxsim + onnx_model = onnx.load(save_path) onnx_model, check = onnxsim.simplify(onnx_model) onnx.save(onnx_model, save_path) - return onnx.load(save_path) + logger.info(f"Saved converted ONNX model to {save_path}.") + + return save_path @torch.no_grad() -def convert_to_openvino(net: nn.Module, save_dir: Path) -> str: +def convert_to_openvino( + net: nn.Module, save_path: Path, replace_forward: bool = False +) -> str: """Converts nn.Module to OpenVINO.""" - os.makedirs(save_dir, exist_ok=True) - if not isinstance(save_dir, Path): - save_dir = Path(save_dir) + try: + import os + + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + import sys + from pathlib import Path + from loguru import logger + + import onnx + from onnx_tf.backend import prepare + from openvino.tools.mo import main as mo_main + + except Exception as e: + print(str(e)) + print("Did you install micromind with conversion capabilities?") + print("Please try again after pip install micromind[conversion].") + exit(0) + os.makedirs(save_path, exist_ok=True) + if not isinstance(save_path, Path): + save_path = Path(save_path) - onnx_path = save_dir.joinpath("model.onnx") - onnx_model = convert_to_onnx(net, onnx_path, simplify=True) + onnx_path = save_path.joinpath("model.onnx") + onnx_model = onnx.load( + convert_to_onnx(net, onnx_path, simplify=True, replace_forward=replace_forward) + ) tf_rep = prepare(onnx_model) @@ -76,21 +105,45 
@@ def convert_to_openvino(net: nn.Module, save_dir: Path) -> str: "--input_shape", input_shape_str, "--output_dir", - str(save_dir), + str(save_path), "--data_type", "FP32", + "--silent", + "True", ] - os.system(" ".join(cmd)) + os.popen(" ".join(cmd)).read() - return str(save_dir.joinpath("model.xml")) + logger.info(f"Saved converted OpenVINO model to {save_path}.") + + return str(save_path.joinpath("model.xml")) @torch.no_grad() def convert_to_tflite( - net: nn.Module, save_path: Path, batch_quant: torch.Tensor = None + net: nn.Module, + save_path: Path, + batch_quant: torch.Tensor = None, + replace_forward: bool = False, ) -> None: """Converts nn.Module to tf_lite, optionally quantizes it.""" + try: + import os + + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + import shutil + import sys + from pathlib import Path + from loguru import logger + + import numpy as np + import tensorflow as tf + + except Exception as e: + print(str(e)) + print("Did you install micromind with conversion capabilities?") + print("Please try again after pip install micromind[conversion].") + exit(0) if not isinstance(save_path, Path): save_path = Path(save_path) @@ -99,7 +152,7 @@ def convert_to_tflite( vino_sub = save_path.joinpath("vino") os.makedirs(vino_sub, exist_ok=True) - vino_path = convert_to_openvino(net, vino_sub) + vino_path = convert_to_openvino(net, vino_sub, replace_forward=replace_forward) if os.name == "nt": openvino2tensorflow_exe_cmd = [ sys.executable, @@ -117,9 +170,10 @@ def convert_to_tflite( str(save_path), "--output_saved_model", "--output_no_quant_float32_tflite", + "--non_verbose", ] - os.system(" ".join(cmd)) + os.popen(" ".join(cmd)).read() shutil.rmtree(vino_sub) @@ -140,3 +194,5 @@ def representative_dataset(): with open(save_path.joinpath("model.int8.tflite"), "wb") as f: f.write(tflite_quant_model) + + logger.info(f"Saved converted TFLite model to {save_path}.") diff --git a/micromind/core.py b/micromind/core.py new file mode 100644 index 0000000..9c50a62 --- /dev/null +++ b/micromind/core.py @@ -0,0 +1,561 @@ +""" +Core class for micromind. Supports helper function for exports. Out-of-the-box +multi-gpu and FP16 training with HF Accelerate and much more. + +Authors: + - Francesco Paissan, 2023 +""" +from typing import Dict, Union, Tuple, Callable, List +from abc import ABC, abstractmethod +from dataclasses import dataclass +from argparse import Namespace +from pathlib import Path +from loguru import logger +from tqdm import tqdm +import shutil + +from accelerate import Accelerator +import torch +import os + +from .utils.helpers import select_and_load_checkpoint, get_random_string +from .utils.checkpointer import Checkpointer + +# This is used ONLY if you are not using argparse to get the hparams +default_cfg = { + "output_folder": "results", + "experiment_name": "micromind_exp", + "opt": "adam", # this is ignored if you are overriding the configure_optimizers + "lr": 0.001, # this is ignored if you are overriding the configure_optimizers + "debug": False, +} + + +@dataclass +class Stage: + """enum to track training stage""" + + train: int = 0 + val: int = 1 + test: int = 2 + + +class Metric: + """ + Class for tracking evaluation metrics during training. + + This class allows you to create custom evaluation metrics by providing a + function to compute the metric and specifying a reduction method. + + Arguments + --------- + name : str + The name of the metric. + fn : Callable + A function that computes the metric given predictions and batch data. 
+    reduction : Optional[str]
+        The reduction method for the metric ('sum' or 'mean'). Default is 'mean'.
+
+    Returns
+    -------
+    Reduced metric. Optionally, you can access the metric history
+    before calling reduce(clear=True) : torch.Tensor
+
+    Example
+    -------
+    .. doctest::
+
+        >>> from micromind import Metric, Stage
+        >>> import torch
+
+        >>> def custom_metric(pred, batch):
+        ...     # Replace this with your custom metric calculation
+        ...     return pred - batch
+
+        >>> metric = Metric("Custom Metric", custom_metric, reduction="mean")
+        >>> pred = torch.tensor([1.0, 2.0, 3.0])
+        >>> batch = torch.tensor([0.5, 1.5, 2.5])
+        >>> metric(pred, batch, stage=Stage.train)
+        >>> metric.history
+        {0: [tensor([0.5000, 0.5000, 0.5000])], 1: [], 2: []}
+        >>> metric.reduce(Stage.train)
+        0.5
+    """
+
+    def __init__(self, name: str, fn: Callable, reduction="mean"):
+        self.name = name
+        self.fn = fn
+        self.reduction = reduction
+        self.history = {s: [] for s in [Stage.train, Stage.val, Stage.test]}
+
+    def __call__(self, pred, batch, stage, device="cpu"):
+        if pred.device != device:
+            pred = pred.to(device)
+        dat = self.fn(pred, batch)
+        if dat.ndim == 0:
+            dat = dat.unsqueeze(0)
+
+        self.history[stage].append(dat)
+
+    def reduce(self, stage, clear=False):
+        """
+        Reduces the stored metric history for a given stage and returns
+        the result.
+
+        Arguments
+        ---------
+        stage : Stage
+            The stage (e.g., Stage.train) whose history should be reduced.
+        clear : Optional[bool]
+            Whether to clear the history for the stage after reducing.
+            Default is False.
+        """
+
+        if self.reduction == "mean":
+            if clear or (
+                self.history[stage][-1].shape[0] != self.history[stage][0].shape[0]
+            ):
+                tmp = torch.stack(self.history[stage][:-1]).mean()
+            else:
+                tmp = torch.stack(self.history[stage]).mean()
+        elif self.reduction == "sum":
+            if (
+                clear
+                or self.history[stage][-1].shape[0] != self.history[stage][0].shape[0]
+            ):
+                tmp = torch.stack(self.history[stage][:-1]).sum()
+            else:
+                tmp = torch.stack(self.history[stage]).sum()
+
+        if clear:
+            self.history[stage] = []
+        return tmp.item()
+
+
+class MicroMind(ABC):
+    """
+    MicroMind is an abstract base class for creating and training deep learning
+    models. It handles multi-GPU training via Accelerate (using DDP and other
+    distributed training strategies), automatically manages devices during
+    training, and provides export capabilities to ONNX, OpenVINO, and TFLite.
+
+    Arguments
+    ---------
+    hparams : Optional[Namespace]
+        Hyperparameters for the model. Default is None.
+
+    """
+
+    def __init__(self, hparams=None):
+        if hparams is None:
+            hparams = Namespace(**default_cfg)
+
+        # here we should handle devices etc.
+        self.modules = torch.nn.ModuleDict({})  # init empty modules dict
+        self.hparams = hparams
+        self.input_shape = None
+
+        self.device = "cpu"  # used just to init the models
+        self.accelerator = Accelerator()
+
+    @abstractmethod
+    def forward(self, batch):
+        """
+        Forward step of the class. It gets called during inference and optimization.
+        This method should be overwritten for specific applications.
+
+        Arguments
+        ---------
+        batch : torch.Tensor
+            Batch as output from the defined DataLoader.
+
+        Returns
+        -------
+        pred : Union[torch.Tensor, Tuple]
+            Predictions - this depends on the task.
+        """
+        pass
+
+    @abstractmethod
+    def compute_loss(self, pred, batch):
+        """
+        Computes the cost function for the optimization process.
+        It returns a tensor on which backward() is called. This method should
+        be overwritten for the specific application.
+
+        Arguments
+        ---------
+        pred : Union[torch.Tensor, Tuple]
+            Output of the forward() function.
+        batch : torch.Tensor
+            Batch as defined from the DataLoader.
+
+        Returns
+        -------
+        loss : torch.Tensor
+            The computed cost function.
+        """
+        pass
+
+    def set_input_shape(self, input_shape: Tuple = (3, 224, 224)):
+        """Setter function for input_shape.
+
+        Arguments
+        ---------
+        input_shape : Tuple
+            Input shape of the forward step.
+
+        """
+        self.input_shape = input_shape
+
+    def load_modules(self, checkpoint_path: Union[Path, str]):
+        """Loads modules from a checkpoint path.
+
+        Arguments
+        ---------
+        checkpoint_path : Union[Path, str]
+            Path to the checkpoint where the modules are stored.
+
+        """
+        dat = torch.load(checkpoint_path)
+
+        modules_keys = list(self.modules.keys())
+        for k in self.modules:
+            self.modules[k].load_state_dict(dat[k])
+
+            modules_keys.remove(k)
+
+        if len(modules_keys) != 0:
+            logger.info(f"Couldn't find a state_dict for modules {modules_keys}.")
+
+    def export(
+        self, save_dir: Union[Path, str], out_format: str = "onnx", input_shape=None
+    ) -> None:
+        """
+        Exports the model to a specified format for deployment.
+        TFLite and OpenVINO exports require a Linux machine.
+
+        Arguments
+        ---------
+        save_dir : Union[Path, str]
+            The directory where the exported model will be saved.
+        out_format : Optional[str]
+            The format for exporting the model. Default is 'onnx'.
+        input_shape : Optional[Tuple]
+            The input shape of the model. If not provided, the input shape
+            specified during model creation is used.
+
+        """
+        from micromind import convert
+
+        if not isinstance(save_dir, Path):
+            save_dir = Path(save_dir)
+        save_dir = save_dir.joinpath(self.hparams.experiment_name)
+
+        self.set_input_shape(input_shape)
+        assert (
+            self.input_shape is not None
+        ), "You should pass the input_shape of the model."
+
+        if out_format == "onnx":
+            convert.convert_to_onnx(
+                self, save_dir.joinpath("model.onnx"), replace_forward=True
+            )
+        elif out_format == "openvino":
+            convert.convert_to_openvino(self, save_dir, replace_forward=True)
+        elif out_format == "tflite":
+            convert.convert_to_tflite(self, save_dir, replace_forward=True)
+
+    def configure_optimizers(self):
+        """Configures and defines the optimizer for the task. Defaults to Adam
+        with lr=0.001; it can be overridden either by passing arguments from the
+        command line or by overriding this entire method.
+
+        Returns
+        -------
+        Optimizer and learning rate scheduler
+        (not implemented yet). : Tuple[torch.optim.Adam, None]
+
+        """
+        assert self.hparams.opt in [
+            "adam",
+            "sgd",
+        ], f"Optimizer {self.hparams.opt} not supported."
+        if self.hparams.opt == "adam":
+            opt = torch.optim.Adam(self.modules.parameters(), self.hparams.lr)
+        elif self.hparams.opt == "sgd":
+            opt = torch.optim.SGD(self.modules.parameters(), self.hparams.lr)
+        return opt, None  # None is for the learning rate scheduler
+
+    def __call__(self, *x, **xv):
+        """Just forwards everything to the forward method."""
+        return self.forward(*x, **xv)
+
+    def on_train_start(self):
+        """Initializes the optimizer and the modules, and puts the networks on
+        the right devices. Optionally loads a checkpoint if one is already
+        present.
+
+        This function gets executed at the beginning of every training.
+ """ + self.experiment_folder = os.path.join( + self.hparams.output_folder, self.hparams.experiment_name + ) + if self.hparams.debug: + self.experiment_folder = "tmp_" + get_random_string() + logger.info(f"Created temporary folder for debug {self.experiment_folder}.") + + save_dir = os.path.join(self.experiment_folder, "save") + if os.path.exists(save_dir): + if len(os.listdir(save_dir)) != 0: + # select which checkpoint and load it. + checkpoint, path = select_and_load_checkpoint(save_dir) + self.opt = checkpoint["optimizer"] + self.lr_sched = checkpoint["lr_scheduler"] + self.start_epoch = checkpoint["epoch"] + 1 + + self.load_modules(path) + + if self.accelerator.is_local_main_process: + self.checkpointer = Checkpointer( + checkpoint["key"], + mode=checkpoint["mode"], + checkpoint_path=self.experiment_folder, + ) + + logger.info(f"Loaded existing checkpoint from {path}.") + else: + self.opt, self.lr_sched = self.configure_optimizers() + self.start_epoch = 0 + + self.checkpointer = Checkpointer( + "val_loss", checkpoint_path=self.experiment_folder + ) + else: + os.makedirs(self.experiment_folder, exist_ok=True) + + self.opt, self.lr_sched = self.configure_optimizers() + self.start_epoch = 0 + + self.checkpointer = Checkpointer( + "val_loss", checkpoint_path=self.experiment_folder + ) + + self.accelerator = Accelerator() + self.device = self.accelerator.device + self.modules.to(self.device) + print("Set device to ", self.device) + + convert = [self.modules, self.opt, self.lr_sched] + list(self.datasets.values()) + accelerated = self.accelerator.prepare(convert) + self.modules, self.opt, self.lr_sched = accelerated[:3] + for i, key in enumerate(self.datasets): + self.datasets[key] = accelerated[-(i + 1)] + + def on_train_end(self): + """Runs at the end of each training. Cleans up before exiting.""" + if self.hparams.debug: + logger.info(f"Removed temporary folder {self.experiment_folder}.") + shutil.rmtree(self.experiment_folder) + + if self.accelerator.is_local_main_process: + self.checkpointer.close() + + def train( + self, + epochs: int = 1, + datasets: Dict = {}, + metrics: List[Metric] = [], + debug: bool = False, + ) -> None: + """ + This method trains the model on the provided training dataset for the + specified number of epochs. It tracks training metrics and can + optionally perform validation during training, if the validation set is + provided. + + Arguments + --------- + epochs : int + The number of training epochs. + datasets : Dict + A dictionary of dataset loaders. Dataloader should be mapped to keys + "train", "val", and "test". + metrics : Optional[List[Metric]] + A list of metrics to track during training. Default is an empty list. + debug : bool + Whether to run in debug mode. Default is False. If in debug mode, + only runs for few epochs + and with few batches. + + """ + self.datasets = datasets + self.metrics = metrics + assert "train" in self.datasets, "Training dataloader was not specified." + assert epochs > 0, "You must specify at least one epoch." + + self.debug = debug + + self.on_train_start() + + if self.accelerator.is_local_main_process: + logger.info( + f"Starting from epoch {self.start_epoch}." + + f" Training is scheduled for {epochs} epochs." 
+ ) + with self.accelerator.autocast(): + for e in range(self.start_epoch, epochs): + pbar = tqdm( + self.datasets["train"], + unit="batches", + ascii=True, + dynamic_ncols=True, + disable=not self.accelerator.is_local_main_process, + ) + loss_epoch = 0 + pbar.set_description(f"Running epoch {e + 1}/{epochs}") + self.modules.train() + for idx, batch in enumerate(pbar): + if isinstance(batch, list): + batch = [b.to(self.device) for b in batch] + + self.opt.zero_grad() + + model_out = self(batch) + loss = self.compute_loss(model_out, batch) + + self.accelerator.backward(loss) + self.opt.step() + + for m in self.metrics: + m(model_out, batch, Stage.train, self.device) + + running_train = { + "train_" + m.name: m.reduce(Stage.train) for m in self.metrics + } + + running_train.update({"train_loss": loss_epoch / (idx + 1)}) + + loss_epoch += loss.item() + pbar.set_postfix(**running_train) + + if self.debug and idx > 10: + break + + pbar.close() + + train_metrics = { + "train_" + m.name: m.reduce(Stage.train, True) for m in self.metrics + } + train_metrics.update({"train_loss": loss_epoch / (idx + 1)}) + + if "val" in datasets: + val_metrics = self.validate() + if self.accelerator.is_local_main_process: + self.checkpointer( + self, + e, + train_metrics, + val_metrics, + lambda x: self.accelerator.unwrap_model(x), + ) + else: + val_metrics = train_metrics.update( + {"val_loss": loss_epoch / (idx + 1)} + ) + + if e >= 1 and self.debug: + break + + self.on_train_end() + return None + + @torch.no_grad() + def validate(self) -> Dict: + """Runs the validation step.""" + assert "val" in self.datasets, "Validation dataloader was not specified." + self.modules.eval() + + pbar = tqdm( + self.datasets["val"], + unit="batches", + ascii=True, + dynamic_ncols=True, + disable=not self.accelerator.is_local_main_process, + ) + loss_epoch = 0 + pbar.set_description("Validation...") + with self.accelerator.autocast(): + for idx, batch in enumerate(pbar): + if isinstance(batch, list): + batch = [b.to(self.device) for b in batch] + + self.opt.zero_grad() + + model_out = self(batch) + loss = self.compute_loss(model_out, batch) + for m in self.metrics: + m(model_out, batch, Stage.val, self.device) + + loss_epoch += loss.item() + pbar.set_postfix(loss=loss_epoch / (idx + 1)) + + if self.debug and idx > 10: + break + + val_metrics = {"val_" + m.name: m.reduce(Stage.val, True) for m in self.metrics} + val_metrics.update({"val_loss": loss_epoch / (idx + 1)}) + + pbar.close() + + return val_metrics + + @torch.no_grad() + def test(self, datasets: Dict = {}) -> None: + """Runs the test steps.""" + assert "test" in self.datasets, "Test dataloader was not specified." 
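+        # NOTE: test() currently reads from self.datasets, which train()
+        # populates, and ignores its own `datasets` argument even though
+        # examples/mind.py passes datasets={"test": testloader}. One possible
+        # fix is `if datasets: self.datasets = datasets` before the assert
+        # above.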
+ self.modules.eval() + + pbar = tqdm( + self.datasets["test"], + unit="batches", + ascii=True, + dynamic_ncols=True, + disable=not self.accelerator.is_local_main_process, + ) + loss_epoch = 0 + pbar.set_description("Testing...") + with self.accelerator.autocast(): + for idx, batch in enumerate(pbar): + if isinstance(batch, list): + batch = [b.to(self.device) for b in batch] + self.opt.zero_grad() + + model_out = self(batch) + loss = self.compute_loss(model_out, batch) + for m in self.metrics: + m(model_out, batch, Stage.test, self.device) + + loss_epoch += loss.item() + pbar.set_postfix(loss=loss_epoch / (idx + 1)) + + pbar.close() + + test_metrics = { + "test_" + m.name: m.reduce(Stage.test, True) for m in self.metrics + } + test_metrics.update({"test_loss": loss_epoch / (idx + 1)}) + s_out = ( + "Testing " + + " - ".join([f"{k}: {v:.2f}" for k, v in test_metrics.items()]) + + "; " + ) + + logger.info(s_out) + + return None diff --git a/micromind/networks/phinet.py b/micromind/networks/phinet.py index 96dd119..4d84262 100644 --- a/micromind/networks/phinet.py +++ b/micromind/networks/phinet.py @@ -5,33 +5,36 @@ - Francesco Paissan, 2023 - Alberto Ancilotto, 2023 - Matteo Beltrami, 2023 + - Matteo Tremonti, 2023 """ -import logging -from pathlib import Path -from types import SimpleNamespace from typing import List import torch import torch.nn as nn import torch.nn.functional as F -from huggingface_hub import hf_hub_download -from huggingface_hub.utils import EntryNotFoundError from torchinfo import summary -import os +import torch.ao.nn.quantized as nnq -import micromind +def _make_divisible(v, divisor=8, min_value=None): + """ + This function is taken from the original tf repo. It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py -def correct_pad(input_shape, kernel_size): - """Returns a tuple for zero-padding for 2D convolution with downsampling - - Args: - input_shape ([tuple/int]): [Input size] - kernel_size ([tuple/int]): [Kernel size] + It ensures that all layers have a channel number that is divisible by divisor. - Returns: - [tuple]: [Padding coeffs] """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def correct_pad(input_shape, kernel_size): + """Returns a tuple for zero-padding for 2D convolution with downsampling""" if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) @@ -51,30 +54,13 @@ def correct_pad(input_shape, kernel_size): def preprocess_input(x, **kwargs): - """Normalise channels between [-1, 1] - - Args: - x ([Tensor]): [Contains the image, number of channels is arbitrary] - - Returns: - [Tensor]: [Channel-wise normalised tensor] - """ + """Normalise channels between [-1, 1]""" return (x / 128.0) - 1 def get_xpansion_factor(t_zero, beta, block_id, num_blocks): - """Compute expansion factor based on the formula from the paper - - Args: - t_zero ([int]): [initial expansion factor] - beta ([int]): [shape factor] - block_id ([int]): [id of the block] - num_blocks ([int]): [number of blocks in the network] - - Returns: - [float]: [computed expansion factor] - """ + """Compute expansion factor based on the formula from the paper""" return (t_zero * beta) * block_id / num_blocks + t_zero * ( num_blocks - block_id ) / num_blocks @@ -89,14 +75,6 @@ def forward(self, x): return torch.clamp(x, min=0, max=self.max) -class HSwish(torch.nn.Module): - def __init__(self): - super(HSwish, self).__init__() - - def forward(self, x): - return x * nn.ReLU6(inplace=True)(x + 3) / 6 - - class SEBlock(torch.nn.Module): """Implements squeeze-and-excitation block""" @@ -109,6 +87,7 @@ def __init__(self, in_channels, out_channels, h_swish=True): h_swish (bool, optional): [Whether to use the h_swish]. Defaults to True. """ super(SEBlock, self).__init__() + self.se_conv = nn.Conv2d( in_channels, out_channels, @@ -122,10 +101,14 @@ def __init__(self, in_channels, out_channels, h_swish=True): ) if h_swish: - self.activation = HSwish() + self.activation = nn.Hardswish(inplace=True) else: self.activation = ReLUMax(6) + # It serves for the quantization. + # The behavior remains equivalent for the unquantized models. + self.mult = nnq.FloatFunctional() + def forward(self, x): """Executes SE Block @@ -135,6 +118,7 @@ def forward(self, x): Returns: [Tensor]: [output of squeeze-and-excitation block] """ + inp = x x = F.adaptive_avg_pool2d(x, (1, 1)) x = self.se_conv(x) @@ -142,16 +126,10 @@ def forward(self, x): x = self.se_conv2(x) x = torch.sigmoid(x) - return x * inp + return self.mult.mul(inp, x) # Equivalent to ``torch.mul(a, b)`` class DepthwiseConv2d(torch.nn.Conv2d): - """Depthwise 2D conv - - Args: - torch ([Tensor]): [Input tensor for convolution] - """ - def __init__( self, in_channels, @@ -271,6 +249,7 @@ def __init__( h_swish=True, k_size=3, dp_rate=0.05, + divisor=1, ): """Defines the structure of a PhiNet convolutional block. 
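The `_make_divisible` helper above, together with the `divisor` argument threaded through the blocks below, rounds every channel count to a hardware-friendly multiple without shrinking any layer by more than 10%. A small sketch of the rounding behaviour, assuming the helper stays importable from `micromind.networks.phinet`:

```python
from micromind.networks.phinet import _make_divisible

print(_make_divisible(27, divisor=8))  # 24 would shrink >10%, so it rounds up: 32
print(_make_divisible(30, divisor=8))  # nearest multiple of 8: 32
print(_make_divisible(8, divisor=8))   # already divisible: 8
print(_make_divisible(12, divisor=1))  # divisor=1 (the default) keeps widths as-is
```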
@@ -304,9 +283,10 @@ def __init__( self._layers = torch.nn.ModuleList() in_channels = in_shape[0] + # Define activation function if h_swish: - activation = HSwish() + activation = nn.Hardswish(inplace=True) else: activation = ReLUMax(6) @@ -314,14 +294,14 @@ def __init__( # Expand conv1 = nn.Conv2d( in_channels, - int(expansion * in_channels), + _make_divisible(int(expansion * in_channels), divisor=divisor), kernel_size=1, padding=0, bias=False, ) bn1 = nn.BatchNorm2d( - int(expansion * in_channels), + _make_divisible(int(expansion * in_channels), divisor=divisor), eps=1e-3, momentum=0.999, ) @@ -331,16 +311,16 @@ def __init__( self._layers.append(activation) if stride == 2: - pad = nn.ZeroPad2d( - padding=correct_pad([res, res], 3), - ) - - self._layers.append(pad) + padding = correct_pad([res, res], 3) self._layers.append(nn.Dropout2d(dp_rate)) d_mul = 1 - in_channels_dw = int(expansion * in_channels) if block_id else in_channels + in_channels_dw = ( + _make_divisible(int(expansion * in_channels), divisor=divisor) + if block_id + else in_channels + ) out_channels_dw = in_channels_dw * d_mul dw1 = DepthwiseConv2d( in_channels=in_channels_dw, @@ -348,7 +328,7 @@ def __init__( kernel_size=k_size, stride=stride, bias=False, - padding=k_size // 2 if stride == 1 else 0, + padding=k_size // 2 if stride == 1 else (padding[1], padding[3]), ) bn_dw1 = nn.BatchNorm2d( @@ -357,19 +337,27 @@ def __init__( momentum=0.999, ) + # It is necessary to reinitialize the activation + # for functions using Module.children() to work properly. + # Module.children() does not return repeated layers. + if h_swish: + activation = nn.Hardswish(inplace=True) + else: + activation = ReLUMax(6) + self._layers.append(dw1) self._layers.append(bn_dw1) self._layers.append(activation) if has_se: - num_reduced_filters = max(1, int(expansion * in_channels / 6)) - se_block = SEBlock( - int(expansion * in_channels), num_reduced_filters, h_swish=h_swish + num_reduced_filters = _make_divisible( + max(1, int(out_channels_dw / 6)), divisor=divisor ) + se_block = SEBlock(out_channels_dw, num_reduced_filters, h_swish=h_swish) self._layers.append(se_block) conv2 = nn.Conv2d( - in_channels=int(expansion * in_channels), + in_channels=out_channels_dw, out_channels=filters, kernel_size=1, padding=0, @@ -387,6 +375,9 @@ def __init__( if res and in_channels == filters and stride == 1: self.skip_conn = True + # It serves for the quantization. + # The behavior remains equivalent for the unquantized models. + self.op = nnq.FloatFunctional() def forward(self, x): """Executes PhiNet convolutional block @@ -398,6 +389,7 @@ def forward(self, x): Returns: Ouput of the convolutional block : torch.Tensor """ + if self.skip_conn: inp = x @@ -405,192 +397,12 @@ def forward(self, x): x = layer(x) if self.skip_conn: - return x + inp + return self.op.add(x, inp) # Equivalent to ``torch.add(a, b)`` return x class PhiNet(nn.Module): - @classmethod - def from_pretrained( - cls, - dataset, - alpha, - beta, - t_zero, - num_layers, - resolution, - path=None, - num_classes=None, - classifier=True, - device=None, - ): - """Loads parameters from checkpoint through Hugging Face Hub or through local - file system. - This function constructs two strings, `repo_dir` to find the model on Hugging - Face Hub and `file_to_choose` to select the correct file inside the repo, and - use them to download the pretrained model and initialize the PhiNet. - - Arguments - --------- - dataset : string - The dataset on which the model has been trained with. 
- alpha : float - The alpha hyperparameter. - beta : float - The beta hyperparameter. - t_zero : float - The t_zero hyperparameter. - num_layers : int - The number of layers. - resolution : int - The resolution of the images used during training. - path : string - The directory path or file path pointing to the checkpoint. - If None, the checkpoint is searched on HuggingFace. - num_classes : int - The number of classes that the model has been trained for. - If None, it gets the specific value determined by the dataset used. - classifier : bool - If True, the model returend includes the classifier. - device : string - The device that loads all the tensors. - If None, it's set to "cuda" if it's available, it's set to "cpu" otherwise. - - Returns - ------- - PhiNet: nn.Module - - Example - ------- - .. doctest:: - - >>> from micromind import PhiNet - >>> model = PhiNet.from_pretrained("CIFAR-10", 3.0, 0.75, 6.0, 7, 160) - Checkpoint taken from HuggingHace hub. - Checkpoint loaded successfully. - """ - if num_classes is None: - num_classes = micromind.datasets_info[dataset]["Nclasses"] - - repo_dir = f"micromind/{dataset}" - file_to_choose = f"\ - phinet_a{float(alpha)}_b{float(beta)}_tzero{float(t_zero)}_Nlayers{num_layers}\ - _res{resolution}{micromind.datasets_info[dataset]['ext']}\ - ".replace( - " ", "" - ) - - assert ( - num_classes == micromind.datasets_info[dataset]["Nclasses"] - ), "Can't load model because num_classes does not match with dataset." - - if device is None: - if torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - if path is not None: - path_to_search = os.path.join(path, file_to_choose) - if os.path.isfile(path): - path_to_search = path - if os.path.isfile(path_to_search): - state_dict = torch.load(str(path_to_search), map_location=device) - model_found = True - print("Checkpoint taken from local file system.") - else: - model_found = False - print( - "Checkpoint not taken from local file system." - + f"{path_to_search} is not a valid checkpoint." - ) - if (path is None) or not model_found: - try: - downloaded_file_path = hf_hub_download( - repo_id=repo_dir, filename=file_to_choose - ) - state_dict = torch.load(str(downloaded_file_path), map_location=device) - print("Checkpoint taken from HuggingHace hub.") - model_found = True - - except EntryNotFoundError: - state_dict = { - "args": SimpleNamespace( - alpha=alpha, - beta=beta, - t_zero=t_zero, - num_layers=num_layers, - num_classes=num_classes, - ) - } - model_found = False - logging.warning("Model initialized without loading checkpoint.") - - # model initialized - model = cls( - (micromind.datasets_info[dataset]["NChannels"], resolution, resolution), - alpha=state_dict["args"].alpha, - beta=state_dict["args"].beta, - t_zero=state_dict["args"].t_zero, - num_layers=state_dict["args"].num_layers, - num_classes=state_dict["args"].num_classes, - include_top=classifier, - compatibility=False, - ) - - # model initialized with downloaded parameters - if model_found: - model.load_state_dict(state_dict["state_dict"], strict=False) - print("Checkpoint loaded successfully.") - - return model - - def save_params(self, save_path: Path): - """Saves state_dict of model into a given path. - - Arguments - --------- - save_path : string or Path - Path where you want to store the state dict. - - Returns - ------- - None - - Example - ------- - .. 
doctest:: - - >>> from micromind import PhiNet - >>> model = PhiNet((3, 224, 224)) - >>> model.save_params("checkpoint.pt") - """ - torch.save(self.state_dict(), save_path) - - def from_checkpoint(self, load_path: Path): - """Loads state_dict of model into current instance of the PhiNet class. - - Arguments - --------- - load_path : string or Path - Path where you want to store the state dict. - - Returns - ------- - None - - Example - ------- - .. doctest:: - - >>> from micromind import PhiNet - >>> model = PhiNet((3, 224, 224)) - >>> model.save_params("checkpoint.pt") - >>> model.from_checkpoint("checkpoint.pt") - """ - self.load_state_dict(torch.load(load_path)) - def get_complexity(self): """Returns MAC and number of parameters of initialized architecture. @@ -602,7 +414,7 @@ def get_complexity(self): ------- .. doctest:: - >>> from micromind import PhiNet + >>> from micromind.networks import PhiNet >>> model = PhiNet((3, 224, 224)) >>> model.get_complexity() {'MAC': 9817670, 'params': 30917} @@ -624,7 +436,7 @@ def get_MAC(self): ------- .. doctest:: - >>> from micromind import PhiNet + >>> from micromind.networks import PhiNet >>> model = PhiNet((3, 224, 224)) >>> model.get_MAC() 9817670 @@ -642,7 +454,7 @@ def get_params(self): ------- .. doctest:: - >>> from micromind import PhiNet + >>> from micromind.networks import PhiNet >>> model = PhiNet((3, 224, 224)) >>> model.get_params() 30917 @@ -667,6 +479,7 @@ def __init__( pool: bool = False, # S2 h_swish: bool = True, # S1 squeeze_excite: bool = True, # S1 + divisor: int = 1, ) -> None: """This class implements the PhiNet architecture. @@ -720,7 +533,7 @@ def __init__( # Define self.activation function if h_swish: - activation = HSwish() + activation = nn.Hardswish(inplace=True) else: activation = ReLUMax(6) @@ -735,7 +548,7 @@ def __init__( sep1 = SeparableConv2d( in_channels, - int(first_conv_filters * alpha), + _make_divisible(int(first_conv_filters * alpha), divisor=divisor), kernel_size=3, stride=(first_conv_stride, first_conv_stride), padding=0, @@ -748,16 +561,17 @@ def __init__( block1 = PhiNetConvBlock( in_shape=( - int(first_conv_filters * alpha), + _make_divisible(int(first_conv_filters * alpha), divisor=divisor), res / first_conv_stride, res / first_conv_stride, ), - filters=int(b1_filters * alpha), + filters=_make_divisible(int(b1_filters * alpha), divisor=divisor), stride=1, expansion=1, has_se=False, res=residuals, h_swish=h_swish, + divisor=divisor, ) self._layers.append(block1) @@ -773,44 +587,51 @@ def __init__( self._layers.append(bn_c1) block2 = PhiNetConvBlock( - (int(b1_filters * alpha), res / first_conv_stride, res / first_conv_stride), - filters=int(b1_filters * alpha), + ( + _make_divisible(int(b1_filters * alpha), divisor=divisor), + res / first_conv_stride, + res / first_conv_stride, + ), + filters=_make_divisible(int(b1_filters * alpha), divisor=divisor), stride=2 if (not pool) else 1, expansion=get_xpansion_factor(t_zero, beta, 1, num_layers), block_id=1, has_se=squeeze_excite, res=residuals, h_swish=h_swish, + divisor=divisor, ) block3 = PhiNetConvBlock( ( - int(b1_filters * alpha), + _make_divisible(int(b1_filters * alpha), divisor=divisor), res / first_conv_stride / 2, res / first_conv_stride / 2, ), - filters=int(b1_filters * alpha), + filters=_make_divisible(int(b1_filters * alpha), divisor=divisor), stride=1, expansion=get_xpansion_factor(t_zero, beta, 2, num_layers), block_id=2, has_se=squeeze_excite, res=residuals, h_swish=h_swish, + divisor=divisor, ) block4 = PhiNetConvBlock( ( - int(b1_filters 
* alpha), + _make_divisible(int(b1_filters * alpha), divisor=divisor), res / first_conv_stride / 2, res / first_conv_stride / 2, ), - filters=int(b2_filters * alpha), + filters=_make_divisible(int(b2_filters * alpha), divisor=divisor), stride=2 if (not pool) else 1, expansion=get_xpansion_factor(t_zero, beta, 3, num_layers), block_id=3, has_se=squeeze_excite, res=residuals, h_swish=h_swish, + divisor=divisor, ) self._layers.append(block2) @@ -824,7 +645,7 @@ def __init__( block_id = 4 block_filters = b2_filters spatial_res = res / first_conv_stride / 4 - in_channels_next = int(b2_filters * alpha) + in_channels_next = _make_divisible(int(b2_filters * alpha), divisor=divisor) while num_layers >= block_id: if block_id in downsampling_layers: block_filters *= 2 @@ -833,7 +654,7 @@ def __init__( pn_block = PhiNetConvBlock( (in_channels_next, spatial_res, spatial_res), - filters=int(block_filters * alpha), + filters=_make_divisible(int(block_filters * alpha), divisor=divisor), stride=(2 if (block_id in downsampling_layers) and (not pool) else 1), expansion=get_xpansion_factor(t_zero, beta, block_id, num_layers), block_id=block_id, @@ -841,10 +662,13 @@ def __init__( res=residuals, h_swish=h_swish, k_size=(5 if (block_id / num_layers) > (1 - conv5_percent) else 3), + divisor=divisor, ) self._layers.append(pn_block) - in_channels_next = int(block_filters * alpha) + in_channels_next = _make_divisible( + int(block_filters * alpha), divisor=divisor + ) spatial_res = ( spatial_res / 2 if block_id in downsampling_layers else spatial_res ) @@ -855,7 +679,11 @@ def __init__( self.classifier = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), - nn.Linear(int(block_filters * alpha), num_classes, bias=True), + nn.Linear( + _make_divisible(int(block_filters * alpha), divisor=divisor), + num_classes, + bias=True, + ), ) def forward(self, x): diff --git a/micromind/utils/checkpointer.py b/micromind/utils/checkpointer.py new file mode 100644 index 0000000..09a867f --- /dev/null +++ b/micromind/utils/checkpointer.py @@ -0,0 +1,99 @@ +""" +micromind checkpointer. Unwraps models and saves the to disk with optimizer's +state etc. + +Authors: + - Francesco Paissan, 2023 +""" +from typing import Union, Dict, Callable +from loguru import logger +from pathlib import Path +import os + +import torch + + +class Checkpointer: + def __init__( + self, + key: str, + mode: str = "min", + top_k: int = 5, + checkpoint_path: Union[str, Path] = ".", + ) -> None: + assert mode in ["max", "min"], "Checkpointer mode can be only max or min." 
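+        # Checkpoints are written under <checkpoint_path>/save/, while a
+        # plain-text training log is appended to
+        # <checkpoint_path>/train_log.txt (both set up below).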
+        self.key = key
+        self.mode = mode
+        self.top_k = top_k
+
+        # Best observed values of the monitored key: +inf for "min" mode and
+        # -inf for "max" mode, so the first top_k checkpoints are always saved.
+        init_val = torch.inf if mode == "min" else -torch.inf
+        self.bests = [init_val] * self.top_k
+        self.check_paths = [""] * self.top_k
+        self.root_dir = checkpoint_path
+        self.save_dir = os.path.join(self.root_dir, "save")
+        os.makedirs(self.save_dir, exist_ok=True)
+        self.fstream = open(os.path.join(self.root_dir, "train_log.txt"), "a")
+
+    def __call__(
+        self,
+        mind,
+        epoch: int,
+        train_metrics: Dict,
+        metrics: Dict,
+        unwrap: Callable = lambda x: x,
+    ) -> Union[Path, str]:
+        s_out = (
+            f"Epoch {epoch}: "
+            + " - ".join([f"{k}: {v:.2f}" for k, v in train_metrics.items()])
+            + "; "
+        )
+        s_out += " - ".join([f"{k2}: {v2:.4f}" for k2, v2 in metrics.items()]) + ".\n"
+        self.fstream.write(s_out)
+        logger.info(s_out)
+        base_save = {
+            "key": self.key,
+            "mode": self.mode,
+            "epoch": epoch,
+            "optimizer": mind.opt,
+            "lr_scheduler": mind.lr_sched,
+        }
+        to_remove = None
+        if self.mode == "min":
+            # Save when the monitored key improves on the worst of the
+            # current top-k, and replace that entry.
+            if metrics[self.key] <= max(self.bests):
+                id_replace = self.bests.index(max(self.bests))
+                to_remove = self.check_paths[id_replace]
+                self.bests[id_replace] = metrics[self.key]
+
+                self.check_paths[id_replace] = os.path.join(
+                    self.save_dir,
+                    f"epoch_{epoch}_{self.key}_{metrics[self.key]:.4f}.ckpt",
+                )
+
+                base_save.update(
+                    {k: unwrap(v).state_dict() for k, v in mind.modules.items()}
+                )
+                torch.save(base_save, self.check_paths[id_replace])
+        elif self.mode == "max":
+            if metrics[self.key] >= min(self.bests):
+                id_replace = self.bests.index(min(self.bests))
+                to_remove = self.check_paths[id_replace]
+                self.bests[id_replace] = metrics[self.key]
+
+                self.check_paths[id_replace] = os.path.join(
+                    self.save_dir,
+                    f"epoch_{epoch}_{self.key}_{metrics[self.key]:.4f}.ckpt",
+                )
+
+                base_save.update(
+                    {k: unwrap(v).state_dict() for k, v in mind.modules.items()}
+                )
+                torch.save(base_save, self.check_paths[id_replace])
+
+        if to_remove is not None and to_remove != "":
+            logger.info(f"Generated better checkpoint. Deleting {to_remove}.")
+            os.remove(to_remove)
+
+        if self.mode == "max":
+            return self.check_paths[self.bests.index(max(self.bests))]
+        elif self.mode == "min":
+            return self.check_paths[self.bests.index(min(self.bests))]
+
+    def close(self):
+        self.fstream.close()
diff --git a/micromind/utils/configlib.py b/micromind/utils/configlib.py
deleted file mode 100644
index 3550cea..0000000
--- a/micromind/utils/configlib.py
+++ /dev/null
@@ -1,46 +0,0 @@
-""" Configuration library for experiments.
- -Authors: - Francesco Paissan, 2023 - -""" -import argparse -import logging -import pprint -import sys -import types -from typing import Any, Dict - - -class SimpleNamespace(types.SimpleNamespace): - def update(self, dictionary): - self.__dict__.update(dictionary) - - -logger = logging.getLogger(__name__) - -parser = argparse.ArgumentParser(description=__doc__, fromfile_prefix_chars="@") - -config: SimpleNamespace = SimpleNamespace() - - -def add_parser(title: str, description: str = ""): - """Create a new context for arguments and return a handle.""" - return parser.add_argument_group(title, description) - - -def parse(save_fname: str = "") -> Dict[str, Any]: - """Parse given arguments.""" - config.update(vars(parser.parse_args())) - logging.info("Parsed %i arguments.", len(config.__dict__)) - # Save passed arguments - if save_fname: - with open(save_fname, "w") as fout: - fout.write("\n".join(sys.argv[1:])) - logging.info("Saving experiment arguments to %s.", save_fname) - return config - - -def print_config(): - """Print the current config to stdout.""" - pprint.pprint(config.__dict__) diff --git a/micromind/utils/helpers.py b/micromind/utils/helpers.py new file mode 100644 index 0000000..f0c1c16 --- /dev/null +++ b/micromind/utils/helpers.py @@ -0,0 +1,43 @@ +""" +micromind helper functions. + +Authors: + - Francesco Paissan, 2023 +""" +from typing import Union, Dict, Tuple +from pathlib import Path +import random +import string +import torch +import os + + +def get_value_from_key(s: str, key: str, cast=float) -> float: + dat = s.split(f"{key}_")[-1] + + if "ckpt" in dat: + dat = dat.split(".ckpt")[0] + + return cast(dat) + + +def select_and_load_checkpoint(path: Union[Path, str]) -> Tuple[Dict, str]: + checkpoints = os.listdir(path) + checkpoints = [os.path.join(path, c) for c in checkpoints] + + dat = torch.load(checkpoints[0]) + selected_key, selected_mode = dat["key"], dat["mode"] + + values = [get_value_from_key(str(c), selected_key) for c in checkpoints] + + best_key = min(values) if selected_mode == "min" else max(values) + best_checkpoint = checkpoints[values.index(best_key)] + + return torch.load(best_checkpoint), best_checkpoint + + +def get_random_string(length=10): + letters = string.ascii_lowercase + result_str = "".join(random.choice(letters) for i in range(length)) + + return result_str diff --git a/micromind/utils/parse.py b/micromind/utils/parse.py new file mode 100644 index 0000000..debbbed --- /dev/null +++ b/micromind/utils/parse.py @@ -0,0 +1,28 @@ +import argparse + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="General configuration for micromind.") + + parser.add_argument("--lr", type=float, default=0.001, help="Learning rate.") + parser.add_argument( + "--optimizer", + dest="opt", + default="adam", + choices=["adam", "sgd"], + help="Optimizer name.", + ) + parser.add_argument( + "--experiment_name", default="exp", help="Name of the experiment." + ) + parser.add_argument( + "--output_folder", default="results", help="Output folder path." 
+ ) + parser.add_argument( + "--debug", + action="store_true", + help="Run in debug mode to check train and validation steps.", + ) + + args = parser.parse_args() + return args diff --git a/pyproject.toml b/pyproject.toml index e23ef59..b72ae4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ dependencies = [ "torch", "torchinfo", "huggingface_hub", + "accelerate==0.23.0", + "onnx", + "loguru" ] requires-python = ">=3.8" @@ -53,7 +56,7 @@ profile = "black" py-modules = [] [tool.bumpver] -current_version = "0.0.5" +current_version = "0.1.0" version_pattern = "MAJOR.MINOR.PATCH" commit_message = "bump version {old_version} -> {new_version}" commit = true diff --git a/recipes/image_classification/README.md b/recipes/image_classification/README.md deleted file mode 100644 index 50dd16c..0000000 --- a/recipes/image_classification/README.md +++ /dev/null @@ -1,61 +0,0 @@ -## Image classification - -This image classification recipe is heavily based and depends on pytorch-image-models (timm), the awesome tool developed by [Ross Wightman](https://github.com/rwightman). -It supports all data augmentation, datasets and architectures of the original implementation, and was adapted to support the training of PhiNets. - -To reproduce our results, you can follow these steps: - -1. install PhiNets with `pip install git+https://github.com/fpaissan/micromind` -2. install the additional dependencies for this recipe with `pip install -r extra_requirements.txt` -2. launch the training script on the dataset you want - -### MNIST -``` -python classification.py ~/data/mnist -b 128 --dataset torch/mnist --num-classes 10 \ - --model phinet --input-size 1 28 28 --epochs 20 --amp \ - --opt adam --lr 0.01 --weight-decay 0.01 --no-aug \ - --pin-mem --apex-amp --use-multi-epochs-loader --mean 0.1307 --std 0.3081 --dataset-download --log-interval 100 \ - --alpha 0.5 --num_layers 4 --beta 1 --t_zero 6 --experiment mnist -``` - -### CIFAR-10 -``` -python classification.py ~/data/cifar10 -b 64 --dataset torch/cifar10 --num-classes 10 \ - --model phinet --input-size 3 160 160 --epochs 100 --amp \ - --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ - --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ - --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar10 \ - --alpha 3 --beta 0.75 --t_zero 6 --num_layers 7 -``` - -### CIFAR-100 -``` -python classification.py ~/data/cifar100 -b 64 --dataset torch/cifar100 --num-classes 100 \ - --model phinet --input-size 3 160 160 --epochs 100 --amp \ - --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ - --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ - --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar100 \ - --alpha 3 --beta 0.75 --t_zero 6 --num_layers 7 -``` - -In the table is a list of PhiNet's performance on some common image classification benchmarks. 
- -| Dataset | Model name | Top 1 Accuracy | Top 5 Accuracy | -| -------- | ------------------ |---------------- | -------------- | -| MNIST | `PhiNet(alpha=0.5, beta=1, t_zero=6, num_layers=4)` | 98.96% | 100.00% | -| CIFAR-10 | `PhiNet(alpha=3, beta=0.75, t_zero=6, num_layers=7)` | 93.61% | 99.77% | -| CIFAR-100 | `PhiNet(alpha=3, beta=0.75, t_zero=6, num_layers=7)` | 75.56% | 93.5% | - -### Cite PhiNets -``` -@article{10.1145/3510832, - author = {Paissan, Francesco and Ancilotto, Alberto and Farella, Elisabetta}, - title = {PhiNets: A Scalable Backbone for Low-Power AI at the Edge}, - year = {2022}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - url = {https://doi.org/10.1145/3510832}, - doi = {10.1145/3510832}, - journal = {ACM Trans. Embed. Comput. Syst.}, -} -``` diff --git a/recipes/image_classification/classification.py b/recipes/image_classification/classification.py deleted file mode 100644 index c85d216..0000000 --- a/recipes/image_classification/classification.py +++ /dev/null @@ -1,1551 +0,0 @@ -""" -This code is an adaptation of the imagenet training script from -Ross Wightman (https://github.com/rwightman) modified to train -networks supported inside phinet. - -Adapted by: - - Mariam Jamal, 2023 - - Francesco Paissan, 2023 - -""" -import logging -import os -import time -from collections import OrderedDict -from contextlib import suppress -from datetime import datetime - -import torch - -# should speed up backward-pass with depth-wise separable convolutions -import torch.backends.cudnn as cudnn -import torch.nn as nn -import torchvision.utils -import yaml -from timm import utils -from timm.data import ( - AugMixDataset, - FastCollateMixup, - Mixup, - create_dataset, - create_loader, - resolve_data_config, -) -from timm.loss import ( - BinaryCrossEntropy, - JsdCrossEntropy, - LabelSmoothingCrossEntropy, - SoftTargetCrossEntropy, -) -from timm.models import ( - convert_splitbn_model, - convert_sync_batchnorm, - create_model, - load_checkpoint, - model_parameters, - resume_checkpoint, - safe_model_name, - set_fast_norm, -) -from timm.optim import create_optimizer_v2, optimizer_kwargs -from timm.scheduler import create_scheduler -from timm.utils import ApexScaler, NativeScaler -from torch.nn.parallel import DistributedDataParallel as NativeDDP - -# Model interface -from micromind import PhiNet - -# For argparse from multiple files -from micromind.utils import configlib -from micromind.utils.configlib import config as args - -cudnn.benchmark = True - -try: - from apex import amp - from apex.parallel import DistributedDataParallel as ApexDDP - from apex.parallel import convert_syncbn_model - - has_apex = True -except ImportError: - has_apex = False - -has_native_amp = False -try: - if getattr(torch.cuda.amp, "autocast") is not None: - has_native_amp = True -except AttributeError: - pass - -try: - import wandb - - has_wandb = True -except ImportError: - has_wandb = False - -try: - from functorch.compile import memory_efficient_fusion - - has_functorch = True -except ImportError: - has_functorch = False - - -torch.backends.cudnn.benchmark = True -_logger = logging.getLogger("train") - -# The first arg parser parses out only the --config argument, -# this argument is used to load a yaml file containing key-values -# that override the defaults for the main parser below -# config_parser = configlib.add_parser("Classification training config") -# -# parser.add_argument( -# "-c", -# "--config", -# default="", -# type=str, -# metavar="FILE", -# 
help="YAML config file specifying default arguments", -# ) - - -# Dataset parameters -group = configlib.add_parser("Dataset parameters") -# Keep this argument outside of the dataset group because it is positional. -group.add_argument("data_dir", metavar="DIR", help="path to dataset") -group.add_argument( - "--dataset", - "-d", - metavar="NAME", - default="", - help="dataset type (default: ImageFolder/ImageTar if empty)", -) -group.add_argument( - "--train-split", - metavar="NAME", - default="train", - help="dataset train split (default: train)", -) -group.add_argument( - "--val-split", - metavar="NAME", - default="validation", - help="dataset validation split (default: validation)", -) -group.add_argument( - "--dataset-download", - action="store_true", - default=False, - help="Allow download of dataset for torch/ and tfds/ datasets that support it.", -) -group.add_argument( - "--class-map", - default="", - type=str, - metavar="FILENAME", - help='path to class to idx mapping file (default: "")', -) - -# Model parameters -group = configlib.add_parser("Model parameters") -group.add_argument( - "--model", - default="resnet50", - type=str, - metavar="MODEL", - help='Name of model to train (default: "resnet50"', -) -group.add_argument( - "--pretrained", - action="store_true", - default=False, - help="Start with pretrained version of specified network (if avail)", -) -group.add_argument( - "--initial-checkpoint", - default="", - type=str, - metavar="PATH", - help="Initialize model from this checkpoint (default: none)", -) -group.add_argument( - "--resume", - default="", - type=str, - metavar="PATH", - help="Resume full model and optimizer state from checkpoint (default: none)", -) -group.add_argument( - "--no-resume-opt", - action="store_true", - default=False, - help="prevent resume of optimizer state when resuming model", -) -group.add_argument( - "--num-classes", - type=int, - default=None, - metavar="N", - help="number of label classes (Model default if None)", -) -group.add_argument( - "--gp", - default=None, - type=str, - metavar="POOL", - help="Global pool type, one of (fast, avg, max, avgmax, avgmaxc). \ - Model default if None.", -) -group.add_argument( - "--img-size", - type=int, - default=None, - metavar="N", - help="Image patch size (default: None => model default)", -) -group.add_argument( - "--input-size", - default=None, - nargs=3, - type=int, - metavar="N N N", - help="Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), \ - uses model default if empty", -) -group.add_argument( - "--crop-pct", - default=None, - type=float, - metavar="N", - help="Input image center crop percent (for validation only)", -) -group.add_argument( - "--mean", - type=float, - nargs="+", - default=None, - metavar="MEAN", - help="Override mean pixel value of dataset", -) -group.add_argument( - "--std", - type=float, - nargs="+", - default=None, - metavar="STD", - help="Override std deviation of dataset", -) -group.add_argument( - "--interpolation", - default="", - type=str, - metavar="NAME", - help="Image resize interpolation type (overrides model)", -) -group.add_argument( - "-b", - "--batch-size", - type=int, - default=128, - metavar="N", - help="Input batch size for training (default: 128)", -) -group.add_argument( - "-vb", - "--validation-batch-size", - type=int, - default=None, - metavar="N", - help="Validation batch size override (default: None)", -) -group.add_argument( - "--channels-last", - action="store_true", - default=False, - help="Use channels_last memory layout", -) -scripting_group = group.add_mutually_exclusive_group() -scripting_group.add_argument( - "--torchscript", - dest="torchscript", - action="store_true", - help="torch.jit.script the full model", -) -scripting_group.add_argument( - "--aot-autograd", - default=False, - action="store_true", - help="Enable AOT Autograd support. (It's recommended to use \ - this option with `--fuser nvfuser` together)", -) -group.add_argument( - "--fuser", - default="", - type=str, - help="Select jit fuser. One of ('', 'te', 'old', 'nvfuser')", -) -group.add_argument( - "--fast-norm", - default=False, - action="store_true", - help="enable experimental fast-norm", -) -group.add_argument( - "--grad-checkpointing", - action="store_true", - default=False, - help="Enable gradient checkpointing through model blocks/stages", -) -group.add_argument( - "--alpha", - default=0.5, - type=float, - help="alpha parameter for phinet. Defaults to 0.5", -) -group.add_argument( - "--beta", - default=1.0, - type=float, - help="beta parameter for phinet. Defaults to 1.", -) -group.add_argument( - "--t_zero", - default=4, - type=float, - help="t_zero parameter for phinet. Defaults to 4.", -) -group.add_argument( - "--num_layers", - default=4, - type=int, - help="Number of layers for phinet. Defaults to 4.", -) - -# Optimizer parameters -group = configlib.add_parser("Optimizer parameters") -group.add_argument( - "--opt", - default="sgd", - type=str, - metavar="OPTIMIZER", - help='Optimizer (default: "sgd"', -) -group.add_argument( - "--opt-eps", - default=None, - type=float, - metavar="EPSILON", - help="Optimizer Epsilon (default: None, use opt default)", -) -group.add_argument( - "--opt-betas", - default=None, - type=float, - nargs="+", - metavar="BETA", - help="Optimizer Betas (default: None, use opt default)", -) -group.add_argument( - "--momentum", - type=float, - default=0.9, - metavar="M", - help="Optimizer momentum (default: 0.9)", -) -group.add_argument( - "--weight-decay", type=float, default=2e-5, help="weight decay (default: 2e-5)" -) -group.add_argument( - "--clip-grad", - type=float, - default=None, - metavar="NORM", - help="Clip gradient norm (default: None, no clipping)", -) -group.add_argument( - "--clip-mode", - type=str, - default="norm", - help='Gradient clipping mode. 
One of ("norm", "value", "agc")', -) -group.add_argument( - "--layer-decay", - type=float, - default=None, - help="layer-wise learning rate decay (default: None)", -) - -# Learning rate schedule parameters -group = configlib.add_parser("Learning rate schedule parameters") -group.add_argument( - "--sched", - default="cosine", - type=str, - metavar="SCHEDULER", - help='LR scheduler (default: "step"', -) -group.add_argument( - "--lr", type=float, default=0.05, metavar="LR", help="learning rate (default: 0.05)" -) -group.add_argument( - "--lr-noise", - type=float, - nargs="+", - default=None, - metavar="pct, pct", - help="learning rate noise on/off epoch percentages", -) -group.add_argument( - "--lr-noise-pct", - type=float, - default=0.67, - metavar="PERCENT", - help="learning rate noise limit percent (default: 0.67)", -) -group.add_argument( - "--lr-noise-std", - type=float, - default=1.0, - metavar="STDDEV", - help="learning rate noise std-dev (default: 1.0)", -) -group.add_argument( - "--lr-cycle-mul", - type=float, - default=1.0, - metavar="MULT", - help="learning rate cycle len multiplier (default: 1.0)", -) -group.add_argument( - "--lr-cycle-decay", - type=float, - default=0.5, - metavar="MULT", - help="amount to decay each learning rate cycle (default: 0.5)", -) -group.add_argument( - "--lr-cycle-limit", - type=int, - default=1, - metavar="N", - help="learning rate cycle limit, cycles enabled if > 1", -) -group.add_argument( - "--lr-k-decay", - type=float, - default=1.0, - help="learning rate k-decay for cosine/poly (default: 1.0)", -) -group.add_argument( - "--warmup-lr", - type=float, - default=0.0001, - metavar="LR", - help="warmup learning rate (default: 0.0001)", -) -group.add_argument( - "--min-lr", - type=float, - default=1e-6, - metavar="LR", - help="lower lr bound for cyclic schedulers that hit 0 (1e-5)", -) -group.add_argument( - "--epochs", - type=int, - default=300, - metavar="N", - help="number of epochs to train (default: 300)", -) -group.add_argument( - "--epoch-repeats", - type=float, - default=0.0, - metavar="N", - help="epoch repeat multiplier \ - (number of times to repeat dataset epoch per train epoch).", -) -group.add_argument( - "--start-epoch", - default=None, - type=int, - metavar="N", - help="manual epoch number (useful on restarts)", -) -group.add_argument( - "--decay-milestones", - default=[30, 60], - type=int, - nargs="+", - metavar="MILESTONES", - help="list of decay epoch indices for multistep lr. 
must be increasing", -) -group.add_argument( - "--decay-epochs", - type=float, - default=100, - metavar="N", - help="epoch interval to decay LR", -) -group.add_argument( - "--warmup-epochs", - type=int, - default=3, - metavar="N", - help="epochs to warmup LR, if scheduler supports", -) -group.add_argument( - "--cooldown-epochs", - type=int, - default=10, - metavar="N", - help="epochs to cooldown LR at min_lr, after cyclic schedule ends", -) -group.add_argument( - "--patience-epochs", - type=int, - default=10, - metavar="N", - help="patience epochs for Plateau LR scheduler (default: 10", -) -group.add_argument( - "--decay-rate", - "--dr", - type=float, - default=0.1, - metavar="RATE", - help="LR decay rate (default: 0.1)", -) - -# Augmentation & regularization parameters -group = configlib.add_parser("Augmentation and regularization parameters") -group.add_argument( - "--no-aug", - action="store_true", - default=False, - help="Disable all training augmentation, override other train aug args", -) -group.add_argument( - "--scale", - type=float, - nargs="+", - default=[0.08, 1.0], - metavar="PCT", - help="Random resize scale (default: 0.08 1.0)", -) -group.add_argument( - "--ratio", - type=float, - nargs="+", - default=[3.0 / 4.0, 4.0 / 3.0], - metavar="RATIO", - help="Random resize aspect ratio (default: 0.75 1.33)", -) -group.add_argument( - "--hflip", type=float, default=0.5, help="Horizontal flip training aug probability" -) -group.add_argument( - "--vflip", type=float, default=0.0, help="Vertical flip training aug probability" -) -group.add_argument( - "--color-jitter", - type=float, - default=0.4, - metavar="PCT", - help="Color jitter factor (default: 0.4)", -) -group.add_argument( - "--aa", - type=str, - default=None, - metavar="NAME", - help='Use AutoAugment policy. "v0" or "original". (default: None)', -), -group.add_argument( - "--aug-repeats", - type=float, - default=0, - help="Number of augmentation repetitions (distributed training only) (default: 0)", -) -group.add_argument( - "--aug-splits", - type=int, - default=0, - help="Number of augmentation splits (default: 0, valid: 0 or >=2)", -) -group.add_argument( - "--jsd-loss", - action="store_true", - default=False, - help="Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.", -) -group.add_argument( - "--bce-loss", - action="store_true", - default=False, - help="Enable BCE loss w/ Mixup/CutMix use.", -) -group.add_argument( - "--bce-target-thresh", - type=float, - default=None, - help="Threshold for binarizing softened BCE targets (default: None, disabled)", -) -group.add_argument( - "--reprob", - type=float, - default=0.0, - metavar="PCT", - help="Random erase prob (default: 0.)", -) -group.add_argument( - "--remode", type=str, default="pixel", help='Random erase mode (default: "pixel")' -) -group.add_argument( - "--recount", type=int, default=1, help="Random erase count (default: 1)" -) -group.add_argument( - "--resplit", - action="store_true", - default=False, - help="Do not random erase first (clean) augmentation split", -) -group.add_argument( - "--mixup", - type=float, - default=0.0, - help="mixup alpha, mixup enabled if > 0. (default: 0.)", -) -group.add_argument( - "--cutmix", - type=float, - default=0.0, - help="cutmix alpha, cutmix enabled if > 0. 
(default: 0.)", -) -group.add_argument( - "--cutmix-minmax", - type=float, - nargs="+", - default=None, - help="cutmix min/max ratio, overrides alpha \ - and enables cutmix if set (default: None)", -) -group.add_argument( - "--mixup-prob", - type=float, - default=1.0, - help="Probability of performing mixup or cutmix when either/both is enabled", -) -group.add_argument( - "--mixup-switch-prob", - type=float, - default=0.5, - help="Probability of switching to cutmix when both mixup and cutmix enabled", -) -group.add_argument( - "--mixup-mode", - type=str, - default="batch", - help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"', -) -group.add_argument( - "--mixup-off-epoch", - default=0, - type=int, - metavar="N", - help="Turn off mixup after this epoch, disabled if 0 (default: 0)", -) -group.add_argument( - "--smoothing", type=float, default=0.1, help="Label smoothing (default: 0.1)" -) -group.add_argument( - "--train-interpolation", - type=str, - default="random", - help='Training interpolation (random, bilinear, bicubic default: "random")', -) -group.add_argument( - "--drop", type=float, default=0.0, metavar="PCT", help="Dropout rate (default: 0.)" -) -group.add_argument( - "--drop-connect", - type=float, - default=None, - metavar="PCT", - help="Drop connect rate, DEPRECATED, use drop-path (default: None)", -) -group.add_argument( - "--drop-path", - type=float, - default=None, - metavar="PCT", - help="Drop path rate (default: None)", -) -group.add_argument( - "--drop-block", - type=float, - default=None, - metavar="PCT", - help="Drop block rate (default: None)", -) - -# Batch norm parameters (only works with gen_efficientnet based models currently) -group = configlib.add_parser( - "Batch norm parameters", "Only works with gen_efficientnet based models currently." -) -group.add_argument( - "--bn-momentum", - type=float, - default=None, - help="BatchNorm momentum override (if not None)", -) -group.add_argument( - "--bn-eps", - type=float, - default=None, - help="BatchNorm epsilon override (if not None)", -) -group.add_argument( - "--sync-bn", - action="store_true", - help="Enable NVIDIA Apex or Torch synchronized BatchNorm.", -) -group.add_argument( - "--dist-bn", - type=str, - default="reduce", - help='Distribute BatchNorm stats between nodes after each epoch \ - ("broadcast", "reduce", or "")', -) -group.add_argument( - "--split-bn", - action="store_true", - help="Enable separate BN layers per augmentation split.", -) - -# Model Exponential Moving Average -group = configlib.add_parser("Model exponential moving average parameters") -group.add_argument( - "--model-ema", - action="store_true", - default=False, - help="Enable tracking moving average of model weights", -) -group.add_argument( - "--model-ema-force-cpu", - action="store_true", - default=False, - help="Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.", -) -group.add_argument( - "--model-ema-decay", - type=float, - default=0.9998, - help="decay factor for model weights moving average (default: 0.9998)", -) - -# Misc -group = configlib.add_parser("Miscellaneous parameters") -group.add_argument( - "--seed", type=int, default=42, metavar="S", help="random seed (default: 42)" -) -group.add_argument( - "--worker-seeding", type=str, default="all", help="worker seed mode (default: all)" -) -group.add_argument( - "--log-interval", - type=int, - default=5, - metavar="N", - help="how many batches to wait before logging training status", -) -group.add_argument( - "--recovery-interval", - type=int, - default=0, - metavar="N", - help="how many batches to wait before writing recovery checkpoint", -) -group.add_argument( - "--checkpoint-hist", - type=int, - default=10, - metavar="N", - help="number of checkpoints to keep (default: 10)", -) -group.add_argument( - "-j", - "--workers", - type=int, - default=4, - metavar="N", - help="how many training processes to use (default: 4)", -) -group.add_argument( - "--save-images", - action="store_true", - default=False, - help="save images of input bathes every log interval for debugging", -) -group.add_argument( - "--amp", - action="store_true", - default=False, - help="use NVIDIA Apex AMP or Native AMP for mixed precision training", -) -group.add_argument( - "--apex-amp", - action="store_true", - default=False, - help="Use NVIDIA Apex AMP mixed precision", -) -group.add_argument( - "--native-amp", - action="store_true", - default=False, - help="Use Native Torch AMP mixed precision", -) -group.add_argument( - "--no-ddp-bb", - action="store_true", - default=False, - help="Force broadcast buffers for native DDP to off.", -) -group.add_argument( - "--pin-mem", - action="store_true", - default=False, - help="Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.", -) -group.add_argument( - "--no-prefetcher", - action="store_true", - default=False, - help="disable fast prefetcher", -) -group.add_argument( - "--output", - default="", - type=str, - metavar="PATH", - help="path to output folder (default: none, current dir)", -) -group.add_argument( - "--experiment", - default="", - type=str, - metavar="NAME", - help="name of train experiment, name of sub-folder for output", -) -group.add_argument( - "--eval-metric", - default="top1", - type=str, - metavar="EVAL_METRIC", - help='Best metric (default: "top1"', -) -group.add_argument( - "--tta", - type=int, - default=0, - metavar="N", - help="Test/inference time augmentation (oversampling) factor. 0=None (default: 0)", -) -group.add_argument("--local_rank", default=0, type=int) -group.add_argument( - "--use-multi-epochs-loader", - action="store_true", - default=False, - help="use the multi-epochs-loader to save time at the beginning of every epoch", -) -group.add_argument( - "--log-wandb", - action="store_true", - default=False, - help="log training and validation metrics to wandb", -) - - -def _parse_args(): - # Do we have a config file to parse? - # args_config, remaining = config_parser.parse_known_args() - # if args_config.config: - # with open(args_config.config, "r") as f: - # cfg = yaml.safe_load(f) - # parser.set_defaults(**cfg) - - # The main arg parser parses the rest of the args, the usual - # defaults will have been overridden if config file specified. 
- # args = parser.parse_args(remaining) - configlib.parse() - - # Cache the args as a text string to save them in the output dir later - args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) - return args, args_text - - -def main(): - utils.setup_default_logging() - args, args_text = _parse_args() - - args.prefetcher = not args.no_prefetcher - args.distributed = False - if "WORLD_SIZE" in os.environ: - args.distributed = int(os.environ["WORLD_SIZE"]) > 1 - args.device = "cuda:0" - args.world_size = 1 - args.rank = 0 # global rank - if args.distributed: - if "LOCAL_RANK" in os.environ: - args.local_rank = int(os.getenv("LOCAL_RANK")) - args.device = "cuda:%d" % args.local_rank - torch.cuda.set_device(args.local_rank) - torch.distributed.init_process_group(backend="nccl", init_method="env://") - args.world_size = torch.distributed.get_world_size() - args.rank = torch.distributed.get_rank() - _logger.info( - "Training in distributed mode with multiple processes, \ - 1 GPU per process. Process %d, total %d." - % (args.rank, args.world_size) - ) - else: - _logger.info("Training with a single process on 1 GPUs.") - assert args.rank >= 0 - - if args.rank == 0 and args.log_wandb: - if has_wandb: - wandb.init(project=args.experiment, config=args) - else: - _logger.warning( - "You've requested to log metrics to wandb but package not found. " - "Metrics not being logged to wandb, try `pip install wandb`" - ) - - # resolve AMP arguments based on PyTorch / Apex availability - use_amp = None - if args.amp: - # `--amp` chooses native amp before apex (APEX ver not actively maintained) - if has_native_amp: - args.native_amp = True - elif has_apex: - args.apex_amp = True - if args.apex_amp and has_apex: - use_amp = "apex" - elif args.native_amp and has_native_amp: - use_amp = "native" - elif args.apex_amp or args.native_amp: - _logger.warning( - "Neither APEX or native Torch AMP is available, using float32. " - "Install NVIDA apex or upgrade to PyTorch 1.6" - ) - - utils.random_seed(args.seed, args.rank) - - if args.fuser: - utils.set_jit_fuser(args.fuser) - if args.fast_norm: - set_fast_norm() - - if args.model == "phinet": - model = PhiNet( - input_shape=vars(args)["input_size"], - alpha=vars(args)["alpha"], - num_layers=vars(args)["num_layers"], - beta=vars(args)["beta"], - t_zero=vars(args)["t_zero"], - include_top=True, - num_classes=vars(args)["num_classes"], - compatibility=False, - ) - else: - model = create_model( - args.model, - pretrained=args.pretrained, - num_classes=args.num_classes, - drop_rate=args.drop, - drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path - drop_path_rate=args.drop_path, - drop_block_rate=args.drop_block, - global_pool=args.gp, - bn_momentum=args.bn_momentum, - bn_eps=args.bn_eps, - scriptable=args.torchscript, - checkpoint_path=args.initial_checkpoint, - ) - - if args.num_classes is None: - assert hasattr( - model, "num_classes" - ), "Model must have `num_classes` attr if not set on cmd line/config." 
- args.num_classes = ( - model.num_classes - ) # FIXME handle model default vs config num_classes more elegantly - - if args.grad_checkpointing: - model.set_grad_checkpointing(enable=True) - - if args.local_rank == 0: - _logger.info( - f"Model {safe_model_name(args.model)} created, \ - param count:{sum([m.numel() for m in model.parameters()])}" - ) - - data_config = resolve_data_config( - vars(args), model=model, verbose=args.local_rank == 0 - ) - - # setup augmentation batch splits for contrastive loss or split bn - num_aug_splits = 0 - if args.aug_splits > 0: - assert args.aug_splits > 1, "A split of 1 makes no sense" - num_aug_splits = args.aug_splits - - # enable split bn (separate bn stats per batch-portion) - if args.split_bn: - assert num_aug_splits > 1 or args.resplit - model = convert_splitbn_model(model, max(num_aug_splits, 2)) - - # move model to GPU, enable channels last layout if set - model.cuda() - if args.channels_last: - model = model.to(memory_format=torch.channels_last) - - # setup synchronized BatchNorm for distributed training - if args.distributed and args.sync_bn: - args.dist_bn = "" # disable dist_bn when sync BN active - assert not args.split_bn - if has_apex and use_amp == "apex": - # Apex SyncBN used with Apex AMP - # WARNING this won't currently work with models using BatchNormAct2d - model = convert_syncbn_model(model) - else: - model = convert_sync_batchnorm(model) - if args.local_rank == 0: - _logger.info( - "Converted model to use Synchronized BatchNorm. \ - WARNING: You may have issues if using " - "zero initialized BN layers (enabled by default for ResNets) \ - while sync-bn enabled." - ) - - if args.torchscript: - assert not use_amp == "apex", "Cannot use APEX AMP with torchscripted model" - assert not args.sync_bn, "Cannot use SyncBatchNorm with torchscripted model" - model = torch.jit.script(model) - if args.aot_autograd: - assert has_functorch, "functorch is needed for --aot-autograd" - model = memory_efficient_fusion(model) - - optimizer = create_optimizer_v2(model, **optimizer_kwargs(cfg=args)) - - # setup automatic mixed-precision (AMP) loss scaling and op casting - amp_autocast = suppress # do nothing - loss_scaler = None - if use_amp == "apex": - model, optimizer = amp.initialize(model, optimizer, opt_level="O1") - loss_scaler = ApexScaler() - if args.local_rank == 0: - _logger.info("Using NVIDIA APEX AMP. Training in mixed precision.") - elif use_amp == "native": - amp_autocast = torch.cuda.amp.autocast - loss_scaler = NativeScaler() - if args.local_rank == 0: - _logger.info("Using native Torch AMP. Training in mixed precision.") - else: - if args.local_rank == 0: - _logger.info("AMP not enabled. 
Training in float32.") - - # optionally resume from a checkpoint - resume_epoch = None - if args.resume: - resume_epoch = resume_checkpoint( - model, - args.resume, - optimizer=None if args.no_resume_opt else optimizer, - loss_scaler=None if args.no_resume_opt else loss_scaler, - log_info=args.local_rank == 0, - ) - - # setup exponential moving average of model weights, SWA could be used here too - model_ema = None - if args.model_ema: - # Important to create EMA model after cuda() - # DP wrapper, and AMP but before DDP wrapper - model_ema = utils.ModelEmaV2( - model, - decay=args.model_ema_decay, - device="cpu" if args.model_ema_force_cpu else None, - ) - if args.resume: - load_checkpoint(model_ema.module, args.resume, use_ema=True) - - # setup distributed training - if args.distributed: - if has_apex and use_amp == "apex": - # Apex DDP preferred unless native amp is activated - if args.local_rank == 0: - _logger.info("Using NVIDIA APEX DistributedDataParallel.") - model = ApexDDP(model, delay_allreduce=True) - else: - if args.local_rank == 0: - _logger.info("Using native Torch DistributedDataParallel.") - model = NativeDDP( - model, - device_ids=[args.local_rank], - broadcast_buffers=not args.no_ddp_bb, - ) - # NOTE: EMA model does not need to be wrapped by DDP - - # setup learning rate schedule and starting epoch - lr_scheduler, num_epochs = create_scheduler(args, optimizer) - start_epoch = 0 - if args.start_epoch is not None: - # a specified start_epoch will always override the resume epoch - start_epoch = args.start_epoch - elif resume_epoch is not None: - start_epoch = resume_epoch - if lr_scheduler is not None and start_epoch > 0: - lr_scheduler.step(start_epoch) - - if args.local_rank == 0: - _logger.info("Scheduled epochs: {}".format(num_epochs)) - - # create the train and eval datasets - dataset_train = create_dataset( - args.dataset, - root=args.data_dir, - split=args.train_split, - is_training=True, - class_map=args.class_map, - download=args.dataset_download, - batch_size=args.batch_size, - repeats=args.epoch_repeats, - ) - dataset_eval = create_dataset( - args.dataset, - root=args.data_dir, - split=args.val_split, - is_training=False, - class_map=args.class_map, - download=args.dataset_download, - batch_size=args.batch_size, - ) - - # setup mixup / cutmix - collate_fn = None - mixup_fn = None - mixup_active = args.mixup > 0 or args.cutmix > 0.0 or args.cutmix_minmax is not None - if mixup_active: - mixup_args = dict( - mixup_alpha=args.mixup, - cutmix_alpha=args.cutmix, - cutmix_minmax=args.cutmix_minmax, - prob=args.mixup_prob, - switch_prob=args.mixup_switch_prob, - mode=args.mixup_mode, - label_smoothing=args.smoothing, - num_classes=args.num_classes, - ) - if args.prefetcher: - assert ( - not num_aug_splits - ) # collate conflict (need to support deinterleaving in collate mixup) - collate_fn = FastCollateMixup(**mixup_args) - else: - mixup_fn = Mixup(**mixup_args) - - # wrap dataset in AugMix helper - if num_aug_splits > 1: - dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) - - # create data loaders w/ augmentation pipeiine - train_interpolation = args.train_interpolation - if args.no_aug or not train_interpolation: - train_interpolation = data_config["interpolation"] - loader_train = create_loader( - dataset_train, - input_size=data_config["input_size"], - batch_size=args.batch_size, - is_training=True, - use_prefetcher=args.prefetcher, - no_aug=args.no_aug, - re_prob=args.reprob, - re_mode=args.remode, - re_count=args.recount, - 
re_split=args.resplit, - scale=args.scale, - ratio=args.ratio, - hflip=args.hflip, - vflip=args.vflip, - color_jitter=args.color_jitter, - auto_augment=args.aa, - num_aug_repeats=args.aug_repeats, - num_aug_splits=num_aug_splits, - interpolation=train_interpolation, - mean=data_config["mean"], - std=data_config["std"], - num_workers=args.workers, - distributed=args.distributed, - collate_fn=collate_fn, - pin_memory=args.pin_mem, - use_multi_epochs_loader=args.use_multi_epochs_loader, - worker_seeding=args.worker_seeding, - ) - - loader_eval = create_loader( - dataset_eval, - input_size=data_config["input_size"], - batch_size=args.validation_batch_size or args.batch_size, - is_training=False, - use_prefetcher=args.prefetcher, - interpolation=data_config["interpolation"], - mean=data_config["mean"], - std=data_config["std"], - num_workers=args.workers, - distributed=args.distributed, - crop_pct=data_config["crop_pct"], - pin_memory=args.pin_mem, - ) - - # setup loss function - if args.jsd_loss: - assert num_aug_splits > 1 # JSD only valid with aug splits set - train_loss_fn = JsdCrossEntropy( - num_splits=num_aug_splits, smoothing=args.smoothing - ) - elif mixup_active: - # smoothing is handled with mixup target transform which outputs sparse, - # soft targets - if args.bce_loss: - train_loss_fn = BinaryCrossEntropy(target_threshold=args.bce_target_thresh) - else: - train_loss_fn = SoftTargetCrossEntropy() - elif args.smoothing: - if args.bce_loss: - train_loss_fn = BinaryCrossEntropy( - smoothing=args.smoothing, target_threshold=args.bce_target_thresh - ) - else: - train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing) - else: - train_loss_fn = nn.CrossEntropyLoss() - train_loss_fn = train_loss_fn.cuda() - validate_loss_fn = nn.CrossEntropyLoss().cuda() - - # setup checkpoint saver and eval metric tracking - eval_metric = args.eval_metric - best_metric = None - best_epoch = None - saver = None - output_dir = None - if args.rank == 0: - if args.experiment: - exp_name = args.experiment - else: - exp_name = "-".join( - [ - datetime.now().strftime("%Y%m%d-%H%M%S"), - safe_model_name(args.model), - str(data_config["input_size"][-1]), - ] - ) - output_dir = utils.get_outdir( - args.output if args.output else "./output/train", exp_name - ) - decreasing = True if eval_metric == "loss" else False - saver = utils.CheckpointSaver( - model=model, - optimizer=optimizer, - args=args, - model_ema=model_ema, - amp_scaler=loss_scaler, - checkpoint_dir=output_dir, - recovery_dir=output_dir, - decreasing=decreasing, - max_history=args.checkpoint_hist, - ) - with open(os.path.join(output_dir, "args.yaml"), "w") as f: - f.write(args_text) - - try: - for epoch in range(start_epoch, num_epochs): - if args.distributed and hasattr(loader_train.sampler, "set_epoch"): - loader_train.sampler.set_epoch(epoch) - - train_metrics = train_one_epoch( - epoch, - model, - loader_train, - optimizer, - train_loss_fn, - args, - lr_scheduler=lr_scheduler, - saver=saver, - output_dir=output_dir, - amp_autocast=amp_autocast, - loss_scaler=loss_scaler, - model_ema=model_ema, - mixup_fn=mixup_fn, - ) - - if args.distributed and args.dist_bn in ("broadcast", "reduce"): - if args.local_rank == 0: - _logger.info("Distributing BatchNorm running means and vars") - utils.distribute_bn(model, args.world_size, args.dist_bn == "reduce") - - eval_metrics = validate( - model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast - ) - - if model_ema is not None and not args.model_ema_force_cpu: - if args.distributed and 
args.dist_bn in ("broadcast", "reduce"): - utils.distribute_bn( - model_ema, args.world_size, args.dist_bn == "reduce" - ) - ema_eval_metrics = validate( - model_ema.module, - loader_eval, - validate_loss_fn, - args, - amp_autocast=amp_autocast, - log_suffix=" (EMA)", - ) - eval_metrics = ema_eval_metrics - - if lr_scheduler is not None: - # step LR for next epoch - lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) - - if output_dir is not None: - utils.update_summary( - epoch, - train_metrics, - eval_metrics, - os.path.join(output_dir, "summary.csv"), - write_header=best_metric is None, - log_wandb=args.log_wandb and has_wandb, - ) - - if saver is not None: - # save proper checkpoint with eval metric - save_metric = eval_metrics[eval_metric] - best_metric, best_epoch = saver.save_checkpoint( - epoch, metric=save_metric - ) - - except KeyboardInterrupt: - pass - if best_metric is not None: - _logger.info("*** Best metric: {0} (epoch {1})".format(best_metric, best_epoch)) - - -def train_one_epoch( - epoch, - model, - loader, - optimizer, - loss_fn, - args, - lr_scheduler=None, - saver=None, - output_dir=None, - amp_autocast=suppress, - loss_scaler=None, - model_ema=None, - mixup_fn=None, -): - if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: - if args.prefetcher and loader.mixup_enabled: - loader.mixup_enabled = False - elif mixup_fn is not None: - mixup_fn.mixup_enabled = False - - second_order = hasattr(optimizer, "is_second_order") and optimizer.is_second_order - batch_time_m = utils.AverageMeter() - data_time_m = utils.AverageMeter() - losses_m = utils.AverageMeter() - - model.train() - - end = time.time() - last_idx = len(loader) - 1 - num_updates = epoch * len(loader) - for batch_idx, (input, target) in enumerate(loader): - last_batch = batch_idx == last_idx - data_time_m.update(time.time() - end) - if not args.prefetcher: - input, target = input.cuda(), target.cuda() - if mixup_fn is not None: - input, target = mixup_fn(input, target) - if args.channels_last: - input = input.contiguous(memory_format=torch.channels_last) - - with amp_autocast(): - output = model(input) - loss = loss_fn(output, target) - - if not args.distributed: - losses_m.update(loss.item(), input.size(0)) - - optimizer.zero_grad() - if loss_scaler is not None: - loss_scaler( - loss, - optimizer, - clip_grad=args.clip_grad, - clip_mode=args.clip_mode, - parameters=model_parameters( - model, exclude_head="agc" in args.clip_mode - ), - create_graph=second_order, - ) - else: - loss.backward(create_graph=second_order) - if args.clip_grad is not None: - utils.dispatch_clip_grad( - model_parameters(model, exclude_head="agc" in args.clip_mode), - value=args.clip_grad, - mode=args.clip_mode, - ) - optimizer.step() - - if model_ema is not None: - model_ema.update(model) - - torch.cuda.synchronize() - num_updates += 1 - batch_time_m.update(time.time() - end) - if last_batch or batch_idx % args.log_interval == 0: - lrl = [param_group["lr"] for param_group in optimizer.param_groups] - lr = sum(lrl) / len(lrl) - - if args.distributed: - reduced_loss = utils.reduce_tensor(loss.data, args.world_size) - losses_m.update(reduced_loss.item(), input.size(0)) - - if args.local_rank == 0: - _logger.info( - "Train: {} [{:>4d}/{} ({:>3.0f}%)] " - "Loss: {loss.val:#.4g} ({loss.avg:#.3g}) " - "Time: {batch_time.val:.3f}s, {rate:>7.2f}/s " - "({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) " - "LR: {lr:.3e} " - "Data: {data_time.val:.3f} ({data_time.avg:.3f})".format( - epoch, - batch_idx, - len(loader), - 100.0 * batch_idx / 
last_idx, - loss=losses_m, - batch_time=batch_time_m, - rate=input.size(0) * args.world_size / batch_time_m.val, - rate_avg=input.size(0) * args.world_size / batch_time_m.avg, - lr=lr, - data_time=data_time_m, - ) - ) - - if args.save_images and output_dir: - torchvision.utils.save_image( - input, - os.path.join(output_dir, "train-batch-%d.jpg" % batch_idx), - padding=0, - normalize=True, - ) - - if ( - saver is not None - and args.recovery_interval - and (last_batch or (batch_idx + 1) % args.recovery_interval == 0) - ): - saver.save_recovery(epoch, batch_idx=batch_idx) - - if lr_scheduler is not None: - lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) - - end = time.time() - # end for - - if hasattr(optimizer, "sync_lookahead"): - optimizer.sync_lookahead() - - return OrderedDict([("loss", losses_m.avg)]) - - -def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=""): - batch_time_m = utils.AverageMeter() - losses_m = utils.AverageMeter() - top1_m = utils.AverageMeter() - top5_m = utils.AverageMeter() - - model.eval() - - end = time.time() - last_idx = len(loader) - 1 - with torch.no_grad(): - for batch_idx, (input, target) in enumerate(loader): - last_batch = batch_idx == last_idx - if not args.prefetcher: - input = input.cuda() - target = target.cuda() - if args.channels_last: - input = input.contiguous(memory_format=torch.channels_last) - - with amp_autocast(): - output = model(input) - if isinstance(output, (tuple, list)): - output = output[0] - - # augmentation reduction - reduce_factor = args.tta - if reduce_factor > 1: - output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) - target = target[0 : target.size(0) : reduce_factor] - - loss = loss_fn(output, target) - acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) - - if args.distributed: - reduced_loss = utils.reduce_tensor(loss.data, args.world_size) - acc1 = utils.reduce_tensor(acc1, args.world_size) - acc5 = utils.reduce_tensor(acc5, args.world_size) - else: - reduced_loss = loss.data - - torch.cuda.synchronize() - - losses_m.update(reduced_loss.item(), input.size(0)) - top1_m.update(acc1.item(), output.size(0)) - top5_m.update(acc5.item(), output.size(0)) - - batch_time_m.update(time.time() - end) - end = time.time() - if args.local_rank == 0 and ( - last_batch or batch_idx % args.log_interval == 0 - ): - log_name = "Test" + log_suffix - _logger.info( - "{0}: [{1:>4d}/{2}] " - "Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) " - "Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) " - "Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) " - "Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})".format( - log_name, - batch_idx, - last_idx, - batch_time=batch_time_m, - loss=losses_m, - top1=top1_m, - top5=top5_m, - ) - ) - - metrics = OrderedDict( - [("loss", losses_m.avg), ("top1", top1_m.avg), ("top5", top5_m.avg)] - ) - - return metrics - - -if __name__ == "__main__": - main() diff --git a/recipes/image_classification/distributed_train.sh b/recipes/image_classification/distributed_train.sh deleted file mode 100755 index df2fbe1..0000000 --- a/recipes/image_classification/distributed_train.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -NUM_PROC=$1 -shift -python3 -m torch.distributed.launch --nproc_per_node=$NUM_PROC classification.py "$@" diff --git a/recipes/image_classification/extra_requirements.txt b/recipes/image_classification/extra_requirements.txt deleted file mode 100644 index bdbf074..0000000 --- a/recipes/image_classification/extra_requirements.txt +++ /dev/null @@ -1 +0,0 @@ 
-timm==0.6.13 diff --git a/recipes/image_classification/launch_training.sh b/recipes/image_classification/launch_training.sh deleted file mode 100755 index b6056ac..0000000 --- a/recipes/image_classification/launch_training.sh +++ /dev/null @@ -1,22 +0,0 @@ -# MNIST training -CUDA_VISIBLE_DEVICES=0 python classification.py ~/data/mnist -b 128 --dataset torch/mnist --num-classes 10 \ - --model phinet --input-size 1 28 28 --epochs 20 --amp \ - --opt adam --lr 0.01 --weight-decay 0.01 --no-aug \ - --pin-mem --apex-amp --use-multi-epochs-loader --mean 0.1307 --std 0.3081 --dataset-download --log-interval 100 \ - --alpha 0.25 --num_layers 7 --beta 1 --t_zero 6 --experiment mnist_025_1_6_7_test - -# CIFAR-10 training -# CUDA_VISIBLE_DEVICES=2 python classification.py ~/data/cifar10 -b 64 --dataset torch/cifar10 --num-classes 10 \ -# --model phinet --input-size 3 160 160 --epochs 100 --amp \ -# --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ -# --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ -# --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar10_025_1_6_7 \ -# --alpha 0.25 --beta 1 --t_zero 6 --num_layers 7 - -# CIFAR-100 training -# CUDA_VISIBLE_DEVICES=1 python classification.py ~/data/cifar100 -b 64 --dataset torch/cifar100 --num-classes 100 \ -# --model phinet --input-size 3 160 160 --epochs 100 --amp \ -# --opt lamb --sched cosine --lr 0.005 --weight-decay 0.02 --warmup-epochs 10 --warmup-lr 0.008 \ -# --hflip 0.5 --aa rand-m3-mstd0.55 --mixup 0.1 --bce-loss \ -# --pin-mem --apex-amp --use-multi-epochs-loader --dataset-download --experiment cifar100_025_1_6_7 \ -# --alpha 0.25 --beta 1 --t_zero 6 --num_layers 7 diff --git a/tests/test_networks.py b/tests/test_networks.py index 09c87f5..ed489bd 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -8,8 +8,8 @@ def test_onnx(): - from micromind import PhiNet - from micromind.conversion import convert_to_onnx + from micromind.networks import PhiNet + from micromind.convert import convert_to_onnx save_path = "temp.onnx" @@ -29,8 +29,8 @@ def test_onnx(): def test_openvino(): - from micromind import PhiNet - from micromind.conversion import convert_to_openvino + from micromind.networks import PhiNet + from micromind.convert import convert_to_openvino save_dir = "vino" @@ -45,8 +45,8 @@ def test_openvino(): def test_tflite(): - from micromind import PhiNet - from micromind.conversion import convert_to_tflite + from micromind.networks import PhiNet + from micromind.convert import convert_to_tflite save_path = "tflite"
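For reference, after this refactor the conversion entry points live in `micromind.convert` and the network in `micromind.networks`, as the updated tests show. A minimal sketch of the new import layout follows; the PhiNet hyperparameters are illustrative (they mirror the deleted MNIST recipe), and the `convert_to_onnx(model, path)` argument order is an assumption based on the tests rather than something shown in this diff.

```python
from micromind.networks import PhiNet
from micromind.convert import convert_to_onnx

# Instantiate a small PhiNet (hyperparameters mirror the MNIST recipe).
model = PhiNet(
    input_shape=(1, 28, 28),
    alpha=0.5,
    beta=1.0,
    t_zero=6,
    num_layers=4,
    include_top=True,
    num_classes=10,
)

# Export to ONNX; signature assumed to be (model, save_path).
convert_to_onnx(model, "temp.onnx")
```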