vita-epfl · umerhasan17 · Aug 13, 2023 · Aug 28, 2023 · Sep 9, 2023 · Sep 9, 2023
diff --git a/docs/LICENSE.HOURGLASS b/docs/LICENSE.HOURGLASS
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2019, princeton-vl
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/openpifpaf/network/__init__.py b/src/openpifpaf/network/__init__.py
@@ -7,5 +7,6 @@
 from .running_cache import RunningCache
 from .tracking_base import TrackingBase
 from .tracking_heads import TBaseSingleImage, Tcaf
+from .hourglass import HourglassBlock
 from .trainer import Trainer
 from . import losses
diff --git a/src/openpifpaf/network/basenetworks.py b/src/openpifpaf/network/basenetworks.py
@@ -1,10 +1,13 @@
 import argparse
 import logging
 import math
+
 import torch
 import torchvision.models
-from . import effnetv2
+
 from . import bottleneck_transformer
+from . import effnetv2
+from .hourglass import HourglassBlock, convolution, residual
 
 LOG = logging.getLogger(__name__)
 
@@ -760,8 +763,82 @@ def cli(cls, parser: argparse.ArgumentParser):
         group.add_argument('--botnet-input-image-size',
                            default=cls.input_image_size, type=int,
                            help='Input image size. Needs to be the same for training and'
-                           ' prediction, as BotNet only accepts fixed input sizes')
+                                ' prediction, as BotNet only accepts fixed input sizes')
 
     @classmethod
     def configure(cls, args: argparse.Namespace):
         cls.input_image_size = args.botnet_input_image_size
+
+
+class Hourglass(BaseNetwork):
+    """
+    Hourglass base network: an input block, one or two stacked hourglass blocks and an output block.
+
+    Shapes below assume the default inp_dim=256 and a 513x513 input image (height and width can be different).
+    The height and width entering the hourglass blocks must be divisible by 32 (5 levels of down-sampling by 2).
+    Input Image Data Tensor Shape: (batch_size, 3, 513, 513)
+    Input Hourglass Block Tensor Shape: (batch_size, inp_dim, 256, 256)
+    Output Hourglass Block Tensor Shape: (batch_size, inp_dim, 256, 256)
+    Input of CifDet Head Module: (batch_size, inp_dim, 257, 257)
+    Output of CifDet Head Module: (batch_size, 91, 6, 513, 513)
+    Output Target Tensor Shape: (batch_size, 91, 7, 513, 513)
+                                where 91 = number of detection categories
+                                       7 = number of fields + 1 = 6 + 1
+                                     513 = prediction image height and width
+    """
+
+    def __init__(self, *args, inp_dim=256, bn=True, use_conv=True, layers=104, increases=None, modules=None, stride=2,
+                 **kwargs):
+        """
+        :param inp_dim: Number of channels entering and leaving the hourglass blocks (also used as out_features).
+        :param bn: Batch normalization flag that is forwarded to every HourglassBlock.
+        :param use_conv: Hourglass apply convolutions flag (strided convolution instead of pooling).
+        :param layers: Selects the type of hourglass network (52 or 104).
+        :param increases: Channels added at each of the 5 hourglass levels.
+        :param modules: The number of residual modules at each component of the hourglass.
+        :param stride: Stride of the BaseNetwork; the output convolution uses a stride of stride // 2.
+        :raises ValueError: If layers is neither 52 nor 104.
+        """
+        super().__init__(*args, stride=stride, out_features=inp_dim, **kwargs)
+        if increases is None:
+            increases = [0, 128, 0, 0, 128]
+        if modules is None:
+            modules = [2, 2, 2, 2, 2, 4]
+
+        # A single convolution block takes the data from the input image tensor shape to the hourglass block input
+        # shape. We opted for this to keep the hourglass architecture the same as the original architecture (it took
+        # in image data of size 256x256). The input of the hourglass would ideally be a power of 2 to account for
+        # the sizes of the intermediate down-sampled data. If the resolution of the input is too large, the same
+        # number of hourglass layers may not be able to capture distinct features at low resolutions. Adding more
+        # layers to fully capture down-sampling of high resolution images would significantly increase training time.
+        # Future baselines can be investigated by adjusting hyperparameters such as the number of hourglass layers,
+        # the input image resolution and the amount of down-sampling between each hourglass layer.
+        self.hg_input_block = torch.nn.Sequential(
+            convolution(inp_dim=3, out_dim=128, kernel_size=8, stride=2),
+            residual(inp_dim=128, out_dim=inp_dim)
+        )
+
+        # Number of components in each hourglass block is fixed at 5 (to replicate the original architecture).
+        # The hourglass-104 network is 2 sequential hourglass blocks, the hourglass-52 network is a single block.
+        num_blocks = {104: 2, 52: 1}.get(layers)
+        if num_blocks is None:
+            raise ValueError(f'Number of hourglass layers unsupported: {layers}')
+        self.hgs = torch.nn.Sequential(*(
+            HourglassBlock(5, inp_dim, bn, increases=increases, modules=modules, use_conv=use_conv)
+            for _ in range(num_blocks)
+        ))
+
+        # This output block transforms data from the output hourglass block tensor shape to the input shape of the
+        # CifDet head module, e.g. 256x256 (power of 2) -> 257x257 (power of 2 + 1) for stride=2.
+        # The channel count follows inp_dim so that it always matches the hourglass output and out_features;
+        # a fixed value of 256 would only work for the default inp_dim.
+        self.hg_output_block = torch.nn.Sequential(
+            torch.nn.Conv2d(inp_dim, inp_dim, kernel_size=(2, 2), stride=(self.stride // 2, self.stride // 2),
+                            padding=(1, 1), bias=False),
+            torch.nn.BatchNorm2d(inp_dim, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True),
+            torch.nn.ReLU(inplace=True),
+        )
+
+    def forward(self, x):
+        """Run x through the input block, the hourglass block(s) and the output block."""
+        return self.hg_output_block(self.hgs(self.hg_input_block(x)))
diff --git a/src/openpifpaf/network/factory.py b/src/openpifpaf/network/factory.py
@@ -187,6 +187,8 @@
                                                      ],
                                                      stride=16),
     'botnet': lambda: basenetworks.BotNet('botnet'),
+    'hourglass104': lambda: basenetworks.Hourglass('hourglass', layers=104, use_conv=True, stride=2),
+    'hourglass52': lambda: basenetworks.Hourglass('hourglass', layers=52, use_conv=True, stride=2),
 }
 # base factories that wrap other base factories:
 BASE_FACTORIES['tshufflenetv2k16'] = lambda: TrackingBase(BASE_FACTORIES['shufflenetv2k16']())

diff --git a/src/openpifpaf/network/hourglass.py b/src/openpifpaf/network/hourglass.py
@@ -0,0 +1,124 @@
+"""
+The hourglass module is able to process features across multiple scales and consolidate the results
+to best capture the spatial relationships associated with keypoints for the task of pose estimation.
+
+Code inspired by hourglass modules from other research repositories:
+
+    The CornerNet model for object detection utilises an hourglass backbone.
+    * https://github.com/princeton-vl/CornerNet/tree/master/models
+    BSD 3-Clause License: Copyright (c) 2018, University of Michigan
+
+    The original hourglass model written for the task of human pose estimation.
+    * https://github.com/princeton-vl/pytorch_stacked_hourglass
+    BSD 3-Clause License: Copyright (c) 2019, princeton-vl
+"""
+
+from torch import nn
+
+Pool = nn.MaxPool2d  # max pooling; HourglassBlock down-samples with Pool(2, 2) when use_conv is False
+
+
+class convolution(nn.Module):
+    """Conv2d -> optional BatchNorm2d -> ReLU building block used by the hourglass network."""
+
+    def __init__(self, inp_dim, out_dim, kernel_size=3, stride=1, with_bn=True):
+        super().__init__()
+
+        # Padding of (kernel_size - 1) // 2 on every side; the convolution bias is dropped when batch norm follows.
+        padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(inp_dim, out_dim, kernel_size=(kernel_size, kernel_size), stride=(stride, stride),
+                              padding=(padding, padding), bias=not with_bn)
+        if with_bn:
+            self.bn = nn.BatchNorm2d(out_dim, eps=1e-5, momentum=0.1)
+        else:
+            self.bn = nn.Sequential()  # empty Sequential acts as a pass-through
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Apply the convolution, the (optional) batch norm and the in-place ReLU to x."""
+        return self.relu(self.bn(self.conv(x)))
+
+
+class residual(nn.Module):
+    """
+    Residual module (two 3x3 convolutions plus a skip connection) that the hourglass is assembled from.
+
+    The skip path is a strided 1x1 convolution with batch norm when the stride or the channel count changes,
+    otherwise it is an empty Sequential that passes the input through unchanged.
+    """
+
+    def __init__(self, inp_dim, out_dim, stride=1):
+        super().__init__()
+        # Main path, first 3x3 convolution (carries the stride).
+        self.conv1 = nn.Conv2d(inp_dim, out_dim, (3, 3), stride=(stride, stride), padding=(1, 1), bias=False)
+        self.bn1 = nn.BatchNorm2d(out_dim, eps=1e-5, momentum=0.1)
+        self.relu1 = nn.ReLU(inplace=True)
+        # Main path, second 3x3 convolution (keeps the resolution).
+        self.conv2 = nn.Conv2d(out_dim, out_dim, (3, 3), padding=(1, 1), bias=False)
+        self.bn2 = nn.BatchNorm2d(out_dim, eps=1e-5, momentum=0.1)
+
+        if stride != 1 or inp_dim != out_dim:
+            self.skip = nn.Sequential(
+                nn.Conv2d(inp_dim, out_dim, (1, 1), stride=(stride, stride), bias=False),
+                nn.BatchNorm2d(out_dim, eps=1e-5, momentum=0.1),
+            )
+        else:
+            self.skip = nn.Sequential()
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Return relu(main_path(x) + skip(x))."""
+        main = self.bn2(self.conv2(self.relu1(self.bn1(self.conv1(x)))))
+        return self.relu(main + self.skip(x))
+
+
+class HourglassBlock(nn.Module):
+    """
+    Recursive hourglass block: the sum of a full-resolution branch (up1) and a lower branch that is down-sampled
+    by 2 (extra), processed (low1 -> low2 -> low3) and up-sampled by 2 (up2). low2 nests the remaining levels.
+    :param n_hourglass_layers: Number of nested down-/up-sampling levels.
+    :param inp_dim: Number of input and output channels of this block.
+    :param bn: Unused by this block itself; only passed on to the nested HourglassBlock.
+    :param modules: Residual modules per level: one entry per level plus one for the innermost part.
+    :param increases: Channels added on the lower branch of each level: one entry per level.
+    :param use_conv: Down-sample with a stride-2 convolution if True, otherwise with max pooling.
+    :raises ValueError: If modules or increases is missing or has the wrong number of entries.
+    """
+
+    def __init__(self, n_hourglass_layers, inp_dim, bn=None, modules=None, increases=None, use_conv=True):
+        super().__init__()
+        # Explicit checks instead of assert statements, which are skipped when Python runs with -O.
+        if increases is None or len(increases) != n_hourglass_layers:
+            raise ValueError(f'increases must have {n_hourglass_layers} entries, got {increases!r}')
+        if modules is None or len(modules) != n_hourglass_layers + 1:
+            raise ValueError(f'modules must have {n_hourglass_layers + 1} entries, got {modules!r}')
+        cur_mod = modules[0]
+        nf = inp_dim + increases[0]
+        self.up1 = self.make_residuals(inp_dim, inp_dim, cur_mod)
+        if use_conv:
+            self.extra = nn.Conv2d(inp_dim, inp_dim, (2, 2), padding=(0, 0), stride=(2, 2), bias=False)  # pool replacement
+        else:
+            self.extra = Pool(2, 2)
+        self.low1 = self.make_residuals(inp_dim, nf, cur_mod)
+        self.n_hourglass_layers = n_hourglass_layers
+        if self.n_hourglass_layers > 1:
+            self.low2 = HourglassBlock(n_hourglass_layers - 1, nf, bn=bn, increases=increases[1:], modules=modules[1:],
+                                       use_conv=use_conv)
+        else:
+            self.low2 = self.make_residuals(nf, nf, modules[1])
+        self.low3 = self.make_residuals_revr(nf, inp_dim, cur_mod)
+        self.up2 = nn.Upsample(scale_factor=2, mode='nearest')
+
+    @staticmethod
+    def make_residuals(inp_dim, out_dim, num_modules):
+        """Chain num_modules residual modules; the first one maps inp_dim to out_dim channels."""
+        return nn.Sequential(residual(inp_dim, out_dim), *(residual(out_dim, out_dim) for _ in range(num_modules - 1)))
+
+    @staticmethod
+    def make_residuals_revr(inp_dim, out_dim, num_modules):
+        """Chain num_modules residual modules; the last one maps inp_dim to out_dim channels."""
+        return nn.Sequential(*(residual(inp_dim, inp_dim) for _ in range(num_modules - 1)), residual(inp_dim, out_dim))
+
+    def forward(self, x):
+        """Return up1(x) + up2(low3(low2(low1(extra(x))))); x needs an even height and width at every level."""
+        return self.up1(x) + self.up2(self.low3(self.low2(self.low1(self.extra(x)))))