Merge pull request #256 from chainer/reverse-coords-order

Merge reverse-coords-order
chainer · Jun 9, 2017 · 72297d2 · 72297d2
2 parents ca74f08 + 64526c2
commit 72297d2
Show file tree

Hide file tree

Showing 62 changed files with 392 additions and 352 deletions.
diff --git a/chainercv/datasets/cub/cub_keypoint_dataset.py b/chainercv/datasets/cub/cub_keypoint_dataset.py
@@ -29,7 +29,7 @@ class CUBKeypointDataset(CUBDatasetBase):
     Note that :math:`K=15` in CUB dataset. Also note that not all fifteen
     keypoints are visible in an image. When a keypoint is not visible,
     the values stored for that keypoint are undefined. The second axis
-    corresponds to the :math:`x` and :math:`y` coordinates of the
+    corresponds to the :math:`y` and :math:`x` coordinates of the
     keypoints in the image.
 
     A keypoint mask array indicates whether a keypoint is visible in the
@@ -74,7 +74,8 @@ def __init__(self, data_dir='auto', crop_bbox=True,
             if id_ not in self.kp_mask_dict:
                 self.kp_mask_dict[id_] = []
 
-            keypoint = [float(v) for v in values[2:4]]
+            # (y, x) order
+            keypoint = [float(v) for v in values[2:4][::-1]]
             kp_mask = bool(int(values[4]))
 
             self.kp_dict[id_].append(keypoint)
@@ -92,9 +93,9 @@ def get_example(self, i):
         kp_mask = np.array(self.kp_mask_dict[i], dtype=np.bool)
 
         if self.crop_bbox:
-            bbox = self.bboxes[i]  # (x, y, width, height)
-            img =\
-                img[:, bbox[1]: bbox[1] + bbox[3], bbox[0]: bbox[0] + bbox[2]]
+            # (y_min, x_min, y_max, x_max)
+            bbox = self.bboxes[i].astype(np.int32)
+            img = img[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
             keypoint[:, :2] = keypoint[:, :2] - np.array([bbox[0], bbox[1]])
 
         if not self.return_mask:
@@ -105,8 +106,6 @@ def get_example(self, i):
             dtype=np.uint8,
             color=False)
         if self.crop_bbox:
-            mask = mask[:,
-                        bbox[1]: bbox[1] + bbox[3],
-                        bbox[0]: bbox[0] + bbox[2]]
+            mask = mask[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
 
         return img, keypoint, kp_mask, mask
diff --git a/chainercv/datasets/cub/cub_label_dataset.py b/chainercv/datasets/cub/cub_label_dataset.py
@@ -1,3 +1,4 @@
+import numpy as np
 import os
 
 from chainercv.datasets.cub.cub_utils import CUBDatasetBase
@@ -50,9 +51,8 @@ def get_example(self, i):
             os.path.join(self.data_dir, 'images', self.fns[i]), color=True)
 
         if self.crop_bbox:
-            bbox = self.bboxes[i]  # (x, y, width, height)
-            img = img[:,
-                      bbox[1]: bbox[1] + bbox[3],
-                      bbox[0]: bbox[0] + bbox[2]]
+            # (y_min, x_min, y_max, x_max)
+            bbox = self.bboxes[i].astype(np.int32)
+            img = img[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
         label = self._data_labels[i]
         return img, label
diff --git a/chainercv/datasets/cub/cub_utils.py b/chainercv/datasets/cub/cub_utils.py
@@ -1,3 +1,4 @@
+import numpy as np
 import os
 
 import chainer
@@ -58,8 +59,13 @@ def __init__(self, data_dir='auto', mask_dir='auto', crop_bbox=True):
         bboxes_file = os.path.join(data_dir, 'bounding_boxes.txt')
 
         self.fns = [fn.strip().split()[1] for fn in open(images_file)]
-        bboxes = [bbox.split()[1:] for bbox in open(bboxes_file)]
-        self.bboxes = [[int(float(elem)) for elem in bbox] for bbox in bboxes]
+        y_min = np.array([float(bb.split()[2]) for bb in open(bboxes_file)])
+        x_min = np.array([float(bb.split()[1]) for bb in open(bboxes_file)])
+        height = np.array([float(bb.split()[4]) for bb in open(bboxes_file)])
+        width = np.array([float(bb.split()[3]) for bb in open(bboxes_file)])
+        self.bboxes = np.stack(
+            (y_min, x_min, y_min + height, x_min + width),
+            axis=1).astype(np.float32)
 
         self.crop_bbox = crop_bbox
 

diff --git a/chainercv/datasets/voc/voc_detection_dataset.py b/chainercv/datasets/voc/voc_detection_dataset.py
@@ -28,7 +28,7 @@ class VOCDetectionDataset(chainer.dataset.DatasetMixin):
     The bounding boxes are packed into a two dimensional tensor of shape
     :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
     the image. The second axis represents attributes of the bounding box.
-    They are :obj:`(x_min, y_min, x_max, y_max)`, where the
+    They are :obj:`(y_min, x_min, y_max, x_max)`, where the
     four attributes are coordinates of the bottom left and the top right
     vertices.
 
@@ -119,7 +119,7 @@ def get_example(self, i):
             # subtract 1 to make pixel indexes 0-based
             bbox.append([
                 int(bndbox_anno.find(tag).text) - 1
-                for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
+                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
             name = obj.find('name').text.lower().strip()
             label.append(voc_utils.voc_detection_label_names.index(name))
         bbox = np.stack(bbox).astype(np.float32)

diff --git a/chainercv/evaluations/eval_detection_voc_ap.py b/chainercv/evaluations/eval_detection_voc_ap.py
@@ -26,7 +26,7 @@ def eval_detection_voc_ap(
             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
             where :math:`R` corresponds
             to the number of bounding boxes, which may vary among boxes.
-            The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+            The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
             of a bounding box.
         pred_labels (iterable of numpy.ndarray): An iterable of labels.
             Similar to :obj:`pred_bboxes`, its index corresponds to an
@@ -94,7 +94,7 @@ def calc_detection_voc_prec_rec(
             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
             where :math:`R` corresponds
             to the number of bounding boxes, which may vary among boxes.
-            The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+            The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
             of a bounding box.
         pred_labels (iterable of numpy.ndarray): An iterable of labels.
             Similar to :obj:`pred_bboxes`, its index corresponds to an

diff --git a/chainercv/evaluations/eval_pck.py b/chainercv/evaluations/eval_pck.py
@@ -12,13 +12,13 @@ def eval_pck(pred, expected, alpha, L):
     :math:`0 < \\alpha < 1` is a variable we control.
     :math:`L` is determined differently depending on the context. For example,
     in evaluation of keypoint matching for CUB dataset,
-    :math:`L=\\sqrt{w^2 + h^2}` is used.
+    :math:`L=\\sqrt{h^2 + w^2}` is used.
 
     Args:
         pred (~numpy.ndarray): An array of shape :math:`(K, 2)`
             :math:`K` is the number of keypoints to be evaluated. The
-            two elements of the second axis corresponds to :math:`x`
-            and :math:`y` coordinate of the keypoint.
+            two elements of the second axis corresponds to :math:`y`
+            and :math:`x` coordinate of the keypoint.
         expected (~numpy.ndarray): Same kind of array as :obj:`pred`.
             This contains ground truth location of the keypoints that
             the user tries to predict.

diff --git a/chainercv/extensions/detection/detection_vis_report.py b/chainercv/extensions/detection/detection_vis_report.py
@@ -52,7 +52,7 @@ class DetectionVisReport(chainer.training.extension.Extension):
         :obj:`gt_bbox` and :obj:`pred_bbox` are float arrays
         of shape :math:`(R, 4)`, where :math:`R` is the number of
         bounding boxes in the image. Each bounding box is organized
-        by :obj:`(x_min, y_min, x_max, y_max)` in the second axis.
+        by :obj:`(y_min, x_min, y_max, x_max)` in the second axis.
 
         :obj:`gt_label` and :obj:`pred_label` are integer arrays
         of shape :math:`(R,)`. Each label indicates the class of

diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn.py b/chainercv/links/model/faster_rcnn/faster_rcnn.py
@@ -149,7 +149,7 @@ def __call__(self, x, scale=1., test=True):
                 :math:`(R',)`.
 
         """
-        img_size = x.shape[2:][::-1]
+        img_size = x.shape[2:]
 
         h = self.extractor(x, test=test)
         rpn_locs, rpn_scores, rois, roi_indices, anchor =\
@@ -214,7 +214,7 @@ def prepare(self, img):
         if scale * max(H, W) > self.max_size:
             scale = self.max_size / max(H, W)
 
-        img = resize(img, (int(W * scale), int(H * scale)))
+        img = resize(img, (int(H * scale), int(W * scale)))
 
         img = (img - self.mean).astype(np.float32, copy=False)
         return img
@@ -259,7 +259,7 @@ def predict(self, imgs):
            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
                where :math:`R` is the number of bounding boxes in a image. \
                Each bounding box is organized by \
-               :obj:`(x_min, y_min, x_max, y_max)` \
+               :obj:`(y_min, x_min, y_max, x_max)` \
                in the second axis.
            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
                Each value indicates the class of the bounding box. \
@@ -305,9 +305,9 @@ def predict(self, imgs):
             cls_bbox = cls_bbox.reshape(-1, self.n_class * 4)
             # clip bounding box
             cls_bbox[:, slice(0, 4, 2)] = self.xp.clip(
-                cls_bbox[:, slice(0, 4, 2)], 0, W / scale)
+                cls_bbox[:, slice(0, 4, 2)], 0, H / scale)
             cls_bbox[:, slice(1, 4, 2)] = self.xp.clip(
-                cls_bbox[:, slice(1, 4, 2)], 0, H / scale)
+                cls_bbox[:, slice(1, 4, 2)], 0, W / scale)
 
             prob = F.softmax(roi_score).data
 

diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py b/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
@@ -101,7 +101,7 @@ def __call__(self, imgs, bboxes, labels, scale):
             raise ValueError('Currently only batch size 1 is supported.')
 
         _, _, H, W = imgs.shape
-        img_size = (W, H)
+        img_size = (H, W)
 
         features = self.faster_rcnn.extractor(imgs, test=not self.train)
         rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(

diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py b/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
@@ -82,7 +82,7 @@ class FasterRCNNVGG16(FasterRCNN):
         'voc07': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/faster_rcnn_vgg16_voc07_2017_05_24.npz'
+            'download/0.0.3/faster_rcnn_vgg16_voc07_2017_06_06.npz'
         }
     }
     feat_stride = 16
@@ -227,10 +227,11 @@ def __call__(self, x, rois, roi_indices, test=True):
 
         """
         roi_indices = roi_indices.astype(np.float32)
-        rois = self.xp.concatenate(
+        indices_and_rois = self.xp.concatenate(
             (roi_indices[:, None], rois), axis=1)
-        pool = F.roi_pooling_2d(
-            x, rois, self.roi_size, self.roi_size, self.spatial_scale)
+        pool = _roi_pooling_2d_yx(
+            x, indices_and_rois, self.roi_size, self.roi_size,
+            self.spatial_scale)
 
         fc6 = _relu(self.fc6(pool))
         fc7 = _relu(self.fc7(fc6))
@@ -291,5 +292,15 @@ def __call__(self, x, test=True):
         return h
 
 
+def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
+    # Column 0 of indices_and_rois is the batch index of each RoI and must be
+    # kept: F.roi_pooling_2d takes five columns per RoI. Only the two (y, x)
+    # coordinate pairs are swapped to the (x, y) order that function expects.
+    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
+    pool = F.roi_pooling_2d(
+        x, xy_indices_and_rois, outh, outw, spatial_scale)
+    return pool
+
+
 def _max_pooling_2d(x):
     return F.max_pooling_2d(x, ksize=2)
diff --git a/chainercv/links/model/faster_rcnn/region_proposal_network.py b/chainercv/links/model/faster_rcnn/region_proposal_network.py
@@ -80,7 +80,7 @@ def __call__(self, x, img_size, scale=1., test=True):
         Args:
             x (~chainer.Variable): The Features extracted from images.
                 Its shape is :math:`(N, C, H, W)`.
-            img_size (tuple of ints): A tuple :obj:`width, height`,
+            img_size (tuple of ints): A tuple :obj:`height, width`,
                 which contains image size after scaling.
             scale (float): The amount of scaling done to the input images after
                 reading them from files.
@@ -110,8 +110,8 @@ def __call__(self, x, img_size, scale=1., test=True):
         """
         n, _, hh, ww = x.shape
         anchor = _enumerate_shifted_anchor(
-            self.xp.array(self.anchor_base), self.feat_stride, ww, hh)
-        n_anchor = anchor.shape[0] // (ww * hh)
+            self.xp.array(self.anchor_base), self.feat_stride, hh, ww)
+        n_anchor = anchor.shape[0] // (hh * ww)
         h = F.relu(self.conv1(x))
 
         rpn_locs = self.loc(h)
@@ -139,19 +139,19 @@ def __call__(self, x, img_size, scale=1., test=True):
         return rpn_locs, rpn_scores, rois, roi_indices, anchor
 
 
-def _enumerate_shifted_anchor(anchor_base, feat_stride, width, height):
+def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
     # Enumerate all shifted anchors:
     #
     # add A anchors (1, A, 4) to
     # cell K shifts (K, 1, 4) to get
     # shift anchors (K, A, 4)
     # reshape to (K*A, 4) shifted anchors
     xp = cuda.get_array_module(anchor_base)
-    shift_x = xp.arange(0, width * feat_stride, feat_stride)
     shift_y = xp.arange(0, height * feat_stride, feat_stride)
+    shift_x = xp.arange(0, width * feat_stride, feat_stride)
     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
-    shift = xp.stack((shift_x.ravel(), shift_y.ravel(),
-                      shift_x.ravel(), shift_y.ravel()), axis=1)
+    shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
+                      shift_y.ravel(), shift_x.ravel()), axis=1)
 
     A = anchor_base.shape[0]
     K = shift.shape[0]

diff --git a/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py b/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
@@ -57,7 +57,7 @@ def __call__(self, bbox, anchor, img_size):
                 :math:`(R, 4)`.
             anchor (array): Coordinates of anchors. Its shape is
                 :math:`(S, 4)`.
-            img_size (tuple of ints): A tuple :obj:`W, H`, which
+            img_size (tuple of ints): A tuple :obj:`H, W`, which
                 is a tuple of height and width of an image.
 
         Returns:
@@ -74,10 +74,10 @@ def __call__(self, bbox, anchor, img_size):
         bbox = cuda.to_cpu(bbox)
         anchor = cuda.to_cpu(anchor)
 
-        img_W, img_H = img_size
+        img_H, img_W = img_size
 
         n_anchor = len(anchor)
-        inside_index = _get_inside_index(anchor, img_W, img_H)
+        inside_index = _get_inside_index(anchor, img_H, img_W)
         anchor = anchor[inside_index]
         argmax_ious, label = self._create_label(
             inside_index, anchor, bbox)
@@ -156,15 +156,15 @@ def _unmap(data, count, index, fill=0):
     return ret
 
 
-def _get_inside_index(anchor, W, H):
+def _get_inside_index(anchor, H, W):
     # Calc indices of anchors which are located completely inside of the image
     # whose size is specified.
     xp = cuda.get_array_module(anchor)
 
     index_inside = xp.where(
         (anchor[:, 0] >= 0) &
         (anchor[:, 1] >= 0) &
-        (anchor[:, 2] <= W) &  # width
-        (anchor[:, 3] <= H)  # height
+        (anchor[:, 2] <= H) &
+        (anchor[:, 3] <= W)
     )[0]
     return index_inside
diff --git a/chainercv/links/model/faster_rcnn/utils/bbox2loc.py b/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
@@ -6,15 +6,16 @@ def bbox2loc(src_bbox, dst_bbox):
 
     Given bounding boxes, this function computes offsets and scales
     to match the source bounding boxes to the target bounding boxes.
-    Mathematcially, given a bounding box whose center is :math:`p_x, p_y` and
-    size :math:`p_w, p_h` and the target bounding box whose center is
-    :math:`g_x, g_y` and size :math:`g_w, g_h`, the offsets and scales
-    :math:`t_x, t_y, t_w, t_h` can be computed by the following formulas.
+    Mathematically, given a bounding box whose center is
+    :math:`(y, x) = p_y, p_x` and
+    size :math:`p_h, p_w` and the target bounding box whose center is
+    :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales
+    :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas.
 
-    * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
     * :math:`t_y = \\frac{(g_y - p_y)} {p_h}`
-    * :math:`t_w = \\log(\\frac{g_w} {p_w})`
+    * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
     * :math:`t_h = \\log(\\frac{g_h} {p_h})`
+    * :math:`t_w = \\log(\\frac{g_w} {p_w})`
 
     The output is same type as the type of the inputs.
     The encoding formulas are used in works such as R-CNN [#]_.
@@ -26,35 +27,35 @@ def bbox2loc(src_bbox, dst_bbox):
     Args:
         src_bbox (array): An image coordinate array whose shape is
             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
-            These coordinates are used to compute :math:`p_x, p_y, p_w, p_h`.
+            These coordinates are used to compute :math:`p_y, p_x, p_h, p_w`.
         dst_bbox (array): An image coordinate array whose shape is
             :math:`(R, 4)`.
-            These coordinates are used to compute :math:`g_x, g_y, g_w, g_h`.
+            These coordinates are used to compute :math:`g_y, g_x, g_h, g_w`.
 
     Returns:
         array:
         Bounding box offsets and scales from :obj:`src_bbox` \
         to :obj:`dst_bbox`. \
         This has shape :math:`(R, 4)`.
-        The second axis contains four values :math:`t_x, t_y, t_w, t_h`.
+        The second axis contains four values :math:`t_y, t_x, t_h, t_w`.
 
     """
     xp = cuda.get_array_module(src_bbox)
 
-    width = src_bbox[:, 2] - src_bbox[:, 0]
-    height = src_bbox[:, 3] - src_bbox[:, 1]
-    ctr_x = src_bbox[:, 0] + 0.5 * width
-    ctr_y = src_bbox[:, 1] + 0.5 * height
+    height = src_bbox[:, 2] - src_bbox[:, 0]
+    width = src_bbox[:, 3] - src_bbox[:, 1]
+    ctr_y = src_bbox[:, 0] + 0.5 * height
+    ctr_x = src_bbox[:, 1] + 0.5 * width
 
-    base_width = dst_bbox[:, 2] - dst_bbox[:, 0]
-    base_height = dst_bbox[:, 3] - dst_bbox[:, 1]
-    base_ctr_x = dst_bbox[:, 0] + 0.5 * base_width
-    base_ctr_y = dst_bbox[:, 1] + 0.5 * base_height
+    base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
+    base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
+    base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
+    base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width
 
-    dx = (base_ctr_x - ctr_x) / width
     dy = (base_ctr_y - ctr_y) / height
-    dw = xp.log(base_width / width)
+    dx = (base_ctr_x - ctr_x) / width
     dh = xp.log(base_height / height)
+    dw = xp.log(base_width / width)
 
-    loc = xp.vstack((dx, dy, dw, dh)).transpose()
+    loc = xp.vstack((dy, dx, dh, dw)).transpose()
     return loc