diff --git a/chainercv/datasets/cub/cub_keypoint_dataset.py b/chainercv/datasets/cub/cub_keypoint_dataset.py
index 1ef0a92cdb..53fbb02295 100644
--- a/chainercv/datasets/cub/cub_keypoint_dataset.py
+++ b/chainercv/datasets/cub/cub_keypoint_dataset.py
@@ -29,7 +29,7 @@ class CUBKeypointDataset(CUBDatasetBase):
     Note that :math:`K=15` in CUB dataset. Also note that not all fifteen
     keypoints are visible in an image. When a keypoint is not visible,
     the values stored for that keypoint are undefined. The second axis
-    corresponds to the :math:`x` and :math:`y` coordinates of the
+    corresponds to the :math:`y` and :math:`x` coordinates of the
     keypoints in the image.

     A keypoint mask array indicates whether a keypoint is visible in the
@@ -74,7 +74,8 @@ def __init__(self, data_dir='auto', crop_bbox=True,
             if id_ not in self.kp_mask_dict:
                 self.kp_mask_dict[id_] = []

-            keypoint = [float(v) for v in values[2:4]]
+            # (y, x) order
+            keypoint = [float(v) for v in values[2:4][::-1]]
             kp_mask = bool(int(values[4]))

             self.kp_dict[id_].append(keypoint)
@@ -92,9 +93,9 @@ def get_example(self, i):
         kp_mask = np.array(self.kp_mask_dict[i], dtype=np.bool)

         if self.crop_bbox:
-            bbox = self.bboxes[i]  # (x, y, width, height)
-            img =\
-                img[:, bbox[1]: bbox[1] + bbox[3], bbox[0]: bbox[0] + bbox[2]]
+            # (y_min, x_min, y_max, x_max)
+            bbox = self.bboxes[i].astype(np.int32)
+            img = img[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
             keypoint[:, :2] = keypoint[:, :2] - np.array([bbox[0], bbox[1]])

         if not self.return_mask:
@@ -105,8 +106,6 @@ def get_example(self, i):
             dtype=np.uint8, color=False)

         if self.crop_bbox:
-            mask = mask[:,
-                        bbox[1]: bbox[1] + bbox[3],
-                        bbox[0]: bbox[0] + bbox[2]]
+            mask = mask[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]

         return img, keypoint, kp_mask, mask
diff --git a/chainercv/datasets/cub/cub_label_dataset.py b/chainercv/datasets/cub/cub_label_dataset.py
index 8c1c811595..e17b19cc8f 100644
--- a/chainercv/datasets/cub/cub_label_dataset.py
+++ b/chainercv/datasets/cub/cub_label_dataset.py
@@ -1,3 +1,4 @@
+import numpy as np
 import os

 from chainercv.datasets.cub.cub_utils import CUBDatasetBase
@@ -50,9 +51,8 @@ def get_example(self, i):
             os.path.join(self.data_dir, 'images', self.fns[i]), color=True)

         if self.crop_bbox:
-            bbox = self.bboxes[i]  # (x, y, width, height)
-            img = img[:,
-                      bbox[1]: bbox[1] + bbox[3],
-                      bbox[0]: bbox[0] + bbox[2]]
+            # (y_min, x_min, y_max, x_max)
+            bbox = self.bboxes[i].astype(np.int32)
+            img = img[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
         label = self._data_labels[i]
         return img, label
diff --git a/chainercv/datasets/cub/cub_utils.py b/chainercv/datasets/cub/cub_utils.py
index 9d2ea2f9e8..7eb4178365 100644
--- a/chainercv/datasets/cub/cub_utils.py
+++ b/chainercv/datasets/cub/cub_utils.py
@@ -1,3 +1,4 @@
+import numpy as np
 import os

 import chainer
@@ -58,8 +59,13 @@ def __init__(self, data_dir='auto', mask_dir='auto', crop_bbox=True):
         bboxes_file = os.path.join(data_dir, 'bounding_boxes.txt')

         self.fns = [fn.strip().split()[1] for fn in open(images_file)]
-        bboxes = [bbox.split()[1:] for bbox in open(bboxes_file)]
-        self.bboxes = [[int(float(elem)) for elem in bbox] for bbox in bboxes]
+        y_min = np.array([float(bb.split()[2]) for bb in open(bboxes_file)])
+        x_min = np.array([float(bb.split()[1]) for bb in open(bboxes_file)])
+        height = np.array([float(bb.split()[4]) for bb in open(bboxes_file)])
+        width = np.array([float(bb.split()[3]) for bb in open(bboxes_file)])
+        self.bboxes = np.stack(
+            (y_min, x_min, y_min + height, x_min + width),
+            axis=1).astype(np.float32)

         self.crop_bbox = crop_bbox
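The (y_min, x_min, y_max, x_max) layout adopted above drives all of the indexing changes in this patch: in a CHW image, axis 1 is indexed by y and axis 2 by x. A minimal standalone sketch (the array sizes here are made up, independent of the CUB data):

    import numpy as np

    img = np.zeros((3, 6, 8), dtype=np.float32)       # CHW image
    bbox = np.array([1., 2., 5., 7.])                 # (y_min, x_min, y_max, x_max)

    y_min, x_min, y_max, x_max = bbox.astype(np.int32)
    crop = img[:, y_min:y_max, x_min:x_max]           # rows by y, columns by x
    assert crop.shape == (3, 4, 5)                    # (C, y_max - y_min, x_max - x_min)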
diff --git a/chainercv/datasets/voc/voc_detection_dataset.py b/chainercv/datasets/voc/voc_detection_dataset.py
index 778e12287e..dfc08ad2c4 100644
--- a/chainercv/datasets/voc/voc_detection_dataset.py
+++ b/chainercv/datasets/voc/voc_detection_dataset.py
@@ -28,7 +28,7 @@ class VOCDetectionDataset(chainer.dataset.DatasetMixin):
     The bounding boxes are packed into a two dimensional tensor of shape
     :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
     the image. The second axis represents attributes of the bounding box.
-    They are :obj:`(x_min, y_min, x_max, y_max)`, where the
+    They are :obj:`(y_min, x_min, y_max, x_max)`, where the
     four attributes are coordinates of the top left and the
     bottom right vertices.

@@ -119,7 +119,7 @@ def get_example(self, i):
             # subtract 1 to make pixel indexes 0-based
             bbox.append([
                 int(bndbox_anno.find(tag).text) - 1
-                for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
+                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
             name = obj.find('name').text.lower().strip()
             label.append(voc_utils.voc_detection_label_names.index(name))
         bbox = np.stack(bbox).astype(np.float32)
diff --git a/chainercv/evaluations/eval_detection_voc_ap.py b/chainercv/evaluations/eval_detection_voc_ap.py
index 88d17ec303..e734ce5ea3 100644
--- a/chainercv/evaluations/eval_detection_voc_ap.py
+++ b/chainercv/evaluations/eval_detection_voc_ap.py
@@ -26,7 +26,7 @@ def eval_detection_voc_ap(
             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
             where :math:`R` corresponds to the number of bounding boxes,
             which may vary among images.
-            The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+            The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
             of a bounding box.
         pred_labels (iterable of numpy.ndarray): An iterable of labels.
             Similar to :obj:`pred_bboxes`, its index corresponds to an
@@ -94,7 +94,7 @@ def calc_detection_voc_prec_rec(
             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
             where :math:`R` corresponds to the number of bounding boxes,
             which may vary among images.
-            The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+            The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
             of a bounding box.
         pred_labels (iterable of numpy.ndarray): An iterable of labels.
             Similar to :obj:`pred_bboxes`, its index corresponds to an
diff --git a/chainercv/evaluations/eval_pck.py b/chainercv/evaluations/eval_pck.py
index ac335f238b..6ee04d22ca 100644
--- a/chainercv/evaluations/eval_pck.py
+++ b/chainercv/evaluations/eval_pck.py
@@ -12,13 +12,13 @@ def eval_pck(pred, expected, alpha, L):
     :math:`0 < \\alpha < 1` is a variable we control.
     :math:`L` is determined differently depending on the context. For
     example, in evaluation of keypoint matching for CUB dataset,
-    :math:`L=\\sqrt{w^2 + h^2}` is used.
+    :math:`L=\\sqrt{h^2 + w^2}` is used.

     Args:
         pred (~numpy.ndarray): An array of shape :math:`(K, 2)`.
             :math:`K` is the number of keypoints to be evaluated. The
-            two elements of the second axis corresponds to :math:`x`
-            and :math:`y` coordinate of the keypoint.
+            two elements of the second axis correspond to :math:`y`
+            and :math:`x` coordinates of the keypoint.
         expected (~numpy.ndarray): Same kind of array as :obj:`pred`.
             This contains ground truth location of the keypoints that
             the user tries to predict.
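For reference, the PCK metric documented in eval_pck can be computed in a few lines. This is only a sketch with made-up keypoints and a made-up threshold, not the library implementation:

    import numpy as np

    pred = np.array([[10., 20.], [40., 42.]])      # (K, 2) in (y, x) order
    expected = np.array([[12., 21.], [60., 70.]])  # ground truth keypoints
    alpha, L = 0.1, 100.                           # e.g. L = sqrt(h**2 + w**2)

    dist = np.sqrt(((pred - expected) ** 2).sum(axis=1))
    pck = (dist <= alpha * L).mean()
    # pck == 0.5 here: the first keypoint lies within alpha * L of the
    # ground truth, the second does not.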
diff --git a/chainercv/extensions/detection/detection_vis_report.py b/chainercv/extensions/detection/detection_vis_report.py
index 216c0165b0..445bdcbd71 100644
--- a/chainercv/extensions/detection/detection_vis_report.py
+++ b/chainercv/extensions/detection/detection_vis_report.py
@@ -52,7 +52,7 @@ class DetectionVisReport(chainer.training.extension.Extension):
         :obj:`gt_bbox` and :obj:`pred_bbox` are float arrays
         of shape :math:`(R, 4)`, where :math:`R` is the number
         of bounding boxes in the image. Each bounding box is organized
-        by :obj:`(x_min, y_min, x_max, y_max)` in the second axis.
+        by :obj:`(y_min, x_min, y_max, x_max)` in the second axis.

         :obj:`gt_label` and :obj:`pred_label` are integer arrays
         of shape :math:`(R,)`. Each label indicates the class of
diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn.py b/chainercv/links/model/faster_rcnn/faster_rcnn.py
index 76f6869b53..eef3546b95 100644
--- a/chainercv/links/model/faster_rcnn/faster_rcnn.py
+++ b/chainercv/links/model/faster_rcnn/faster_rcnn.py
@@ -149,7 +149,7 @@ def __call__(self, x, scale=1., test=True):
             :math:`(R',)`.

         """
-        img_size = x.shape[2:][::-1]
+        img_size = x.shape[2:]

         h = self.extractor(x, test=test)
         rpn_locs, rpn_scores, rois, roi_indices, anchor =\
@@ -214,7 +214,7 @@ def prepare(self, img):
         if scale * max(H, W) > self.max_size:
             scale = self.max_size / max(H, W)

-        img = resize(img, (int(W * scale), int(H * scale)))
+        img = resize(img, (int(H * scale), int(W * scale)))

         img = (img - self.mean).astype(np.float32, copy=False)
         return img
@@ -259,7 +259,7 @@ def predict(self, imgs):
            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
                where :math:`R` is the number of bounding boxes in an image. \
                Each bounding box is organized by \
-               :obj:`(x_min, y_min, x_max, y_max)` \
+               :obj:`(y_min, x_min, y_max, x_max)` \
                in the second axis.
            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
                Each value indicates the class of the bounding box. \
@@ -305,9 +305,9 @@ def predict(self, imgs):
             cls_bbox = cls_bbox.reshape(-1, self.n_class * 4)
             # clip bounding box
             cls_bbox[:, slice(0, 4, 2)] = self.xp.clip(
-                cls_bbox[:, slice(0, 4, 2)], 0, W / scale)
+                cls_bbox[:, slice(0, 4, 2)], 0, H / scale)
             cls_bbox[:, slice(1, 4, 2)] = self.xp.clip(
-                cls_bbox[:, slice(1, 4, 2)], 0, H / scale)
+                cls_bbox[:, slice(1, 4, 2)], 0, W / scale)

             prob = F.softmax(roi_score).data
diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py b/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
index 041dcc6102..4027911e4f 100644
--- a/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
+++ b/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
@@ -101,7 +101,7 @@ def __call__(self, imgs, bboxes, labels, scale):
             raise ValueError('Currently only batch size 1 is supported.')

         _, _, H, W = imgs.shape
-        img_size = (W, H)
+        img_size = (H, W)

         features = self.faster_rcnn.extractor(imgs, test=not self.train)
         rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py b/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
index ac12e23381..762ab92357 100644
--- a/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
+++ b/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
@@ -82,7 +82,7 @@ class FasterRCNNVGG16(FasterRCNN):
         'voc07': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/faster_rcnn_vgg16_voc07_2017_05_24.npz'
+            'download/0.0.3/faster_rcnn_vgg16_voc07_2017_06_06.npz'
         }
     }
     feat_stride = 16
@@ -227,10 +227,11 @@ def __call__(self, x, rois, roi_indices, test=True):

         """
         roi_indices = roi_indices.astype(np.float32)
-        rois = self.xp.concatenate(
+        indices_and_rois = self.xp.concatenate(
             (roi_indices[:, None], rois), axis=1)
-        pool = F.roi_pooling_2d(
-            x, rois, self.roi_size, self.roi_size, self.spatial_scale)
+        pool = _roi_pooling_2d_yx(
+            x, indices_and_rois, self.roi_size, self.roi_size,
+            self.spatial_scale)

         fc6 = _relu(self.fc6(pool))
         fc7 = _relu(self.fc7(fc6))
@@ -291,5 +292,12 @@ def __call__(self, x, test=True):
         return h


+def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
+    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
+    pool = F.roi_pooling_2d(
+        x, xy_indices_and_rois, outh, outw, spatial_scale)
+    return pool
+
+
 def _max_pooling_2d(x):
     return F.max_pooling_2d(x, ksize=2)
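The column permutation in _roi_pooling_2d_yx is the crux of the change above: each row of indices_and_rois is (batch_index, y_min, x_min, y_max, x_max), while F.roi_pooling_2d expects (batch_index, x_min, y_min, x_max, y_max), so the batch index stays in place and only the coordinate pairs swap. A small numpy check with a made-up RoI:

    import numpy as np

    indices_and_rois = np.array([[0., 10., 20., 50., 60.]], dtype=np.float32)
    # keep the batch index, swap y_min/x_min and y_max/x_max
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
    print(xy_indices_and_rois)  # [[ 0. 20. 10. 60. 50.]]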
diff --git a/chainercv/links/model/faster_rcnn/region_proposal_network.py b/chainercv/links/model/faster_rcnn/region_proposal_network.py
index 6ac193bc94..15f463619d 100644
--- a/chainercv/links/model/faster_rcnn/region_proposal_network.py
+++ b/chainercv/links/model/faster_rcnn/region_proposal_network.py
@@ -80,7 +80,7 @@ def __call__(self, x, img_size, scale=1., test=True):
         Args:
             x (~chainer.Variable): The features extracted from images.
                 Its shape is :math:`(N, C, H, W)`.
-            img_size (tuple of ints): A tuple :obj:`width, height`,
+            img_size (tuple of ints): A tuple :obj:`height, width`,
                 which contains image size after scaling.
             scale (float): The amount of scaling done to the input images
                 after reading them from files.
@@ -110,8 +110,8 @@ def __call__(self, x, img_size, scale=1., test=True):
         """
         n, _, hh, ww = x.shape
         anchor = _enumerate_shifted_anchor(
-            self.xp.array(self.anchor_base), self.feat_stride, ww, hh)
-        n_anchor = anchor.shape[0] // (ww * hh)
+            self.xp.array(self.anchor_base), self.feat_stride, hh, ww)
+        n_anchor = anchor.shape[0] // (hh * ww)
         h = F.relu(self.conv1(x))

         rpn_locs = self.loc(h)
@@ -139,7 +139,7 @@ def __call__(self, x, img_size, scale=1., test=True):
     return rpn_locs, rpn_scores, rois, roi_indices, anchor


-def _enumerate_shifted_anchor(anchor_base, feat_stride, width, height):
+def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
     # Enumerate all shifted anchors:
     #
     # add A anchors (1, A, 4) to
@@ -147,11 +147,11 @@ def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
     # shift anchors (K, A, 4)
     # reshape to (K*A, 4) shifted anchors
     xp = cuda.get_array_module(anchor_base)
-    shift_x = xp.arange(0, width * feat_stride, feat_stride)
     shift_y = xp.arange(0, height * feat_stride, feat_stride)
+    shift_x = xp.arange(0, width * feat_stride, feat_stride)
     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
-    shift = xp.stack((shift_x.ravel(), shift_y.ravel(),
-                      shift_x.ravel(), shift_y.ravel()), axis=1)
+    shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
+                      shift_y.ravel(), shift_x.ravel()), axis=1)

     A = anchor_base.shape[0]
     K = shift.shape[0]
diff --git a/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py b/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
index 31c36afa78..4fbf681b5e 100644
--- a/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
+++ b/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
@@ -57,7 +57,7 @@ def __call__(self, bbox, anchor, img_size):
                 :math:`(R, 4)`.
             anchor (array): Coordinates of anchors. Its shape is
                 :math:`(S, 4)`.
-            img_size (tuple of ints): A tuple :obj:`W, H`, which
+            img_size (tuple of ints): A tuple :obj:`H, W`, which
                 is a tuple of height and width of an image.

         Returns:
@@ -74,10 +74,10 @@ def __call__(self, bbox, anchor, img_size):
         bbox = cuda.to_cpu(bbox)
         anchor = cuda.to_cpu(anchor)

-        img_W, img_H = img_size
+        img_H, img_W = img_size

         n_anchor = len(anchor)
-        inside_index = _get_inside_index(anchor, img_W, img_H)
+        inside_index = _get_inside_index(anchor, img_H, img_W)
         anchor = anchor[inside_index]
         argmax_ious, label = self._create_label(
             inside_index, anchor, bbox)
@@ -156,7 +156,7 @@ def _unmap(data, count, index, fill=0):
     return ret


-def _get_inside_index(anchor, W, H):
+def _get_inside_index(anchor, H, W):
     # Calc indices of anchors which are located completely inside of the image
     # whose size is specified.
     xp = cuda.get_array_module(anchor)
@@ -164,7 +164,7 @@ def _get_inside_index(anchor, W, H):
     index_inside = xp.where(
         (anchor[:, 0] >= 0) &
         (anchor[:, 1] >= 0) &
-        (anchor[:, 2] <= W) &  # width
-        (anchor[:, 3] <= H)  # height
+        (anchor[:, 2] <= H) &
+        (anchor[:, 3] <= W)
     )[0]
     return index_inside
diff --git a/chainercv/links/model/faster_rcnn/utils/bbox2loc.py b/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
index de968dc3dc..6270f6d122 100644
--- a/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
+++ b/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
@@ -6,15 +6,16 @@ def bbox2loc(src_bbox, dst_bbox):

     Given bounding boxes, this function computes offsets and scales
     to match the source bounding boxes to the target bounding boxes.
-    Mathematcially, given a bounding box whose center is :math:`p_x, p_y` and
-    size :math:`p_w, p_h` and the target bounding box whose center is
-    :math:`g_x, g_y` and size :math:`g_w, g_h`, the offsets and scales
-    :math:`t_x, t_y, t_w, t_h` can be computed by the following formulas.
+    Mathematically, given a bounding box whose center is
+    :math:`(y, x) = p_y, p_x` and
+    size :math:`p_h, p_w` and the target bounding box whose center is
+    :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales
+    :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas.

-    * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
     * :math:`t_y = \\frac{(g_y - p_y)} {p_h}`
-    * :math:`t_w = \\log(\\frac{g_w} {p_w})`
+    * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
     * :math:`t_h = \\log(\\frac{g_h} {p_h})`
+    * :math:`t_w = \\log(\\frac{g_w} {p_w})`

     The output is the same type as the inputs.
     The encoding formulas are used in works such as R-CNN [#]_.
@@ -26,35 +27,35 @@ def bbox2loc(src_bbox, dst_bbox):
     Args:
         src_bbox (array): An image coordinate array whose shape is
             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
-            These coordinates are used to compute :math:`p_x, p_y, p_w, p_h`.
+            These coordinates are used to compute :math:`p_y, p_x, p_h, p_w`.
         dst_bbox (array): An image coordinate array whose shape is
             :math:`(R, 4)`.
-            These coordinates are used to compute :math:`g_x, g_y, g_w, g_h`.
+            These coordinates are used to compute :math:`g_y, g_x, g_h, g_w`.

     Returns:
         array:
         Bounding box offsets and scales from :obj:`src_bbox` \
        to :obj:`dst_bbox`. \
        This has shape :math:`(R, 4)`.
-        The second axis contains four values :math:`t_x, t_y, t_w, t_h`.
+        The second axis contains four values :math:`t_y, t_x, t_h, t_w`.

     """
     xp = cuda.get_array_module(src_bbox)

-    width = src_bbox[:, 2] - src_bbox[:, 0]
-    height = src_bbox[:, 3] - src_bbox[:, 1]
-    ctr_x = src_bbox[:, 0] + 0.5 * width
-    ctr_y = src_bbox[:, 1] + 0.5 * height
+    height = src_bbox[:, 2] - src_bbox[:, 0]
+    width = src_bbox[:, 3] - src_bbox[:, 1]
+    ctr_y = src_bbox[:, 0] + 0.5 * height
+    ctr_x = src_bbox[:, 1] + 0.5 * width

-    base_width = dst_bbox[:, 2] - dst_bbox[:, 0]
-    base_height = dst_bbox[:, 3] - dst_bbox[:, 1]
-    base_ctr_x = dst_bbox[:, 0] + 0.5 * base_width
-    base_ctr_y = dst_bbox[:, 1] + 0.5 * base_height
+    base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
+    base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
+    base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
+    base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width

-    dx = (base_ctr_x - ctr_x) / width
     dy = (base_ctr_y - ctr_y) / height
-    dw = xp.log(base_width / width)
+    dx = (base_ctr_x - ctr_x) / width
     dh = xp.log(base_height / height)
+    dw = xp.log(base_width / width)

-    loc = xp.vstack((dx, dy, dw, dh)).transpose()
+    loc = xp.vstack((dy, dx, dh, dw)).transpose()
     return loc
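A hand-computed check of the encoding formulas above, with one made-up pair of boxes in (y_min, x_min, y_max, x_max) order:

    import numpy as np

    src = np.array([10., 20., 50., 60.])  # p_h = p_w = 40, center (30, 40)
    dst = np.array([20., 30., 60., 70.])  # g_h = g_w = 40, center (40, 50)

    p_h, p_w = src[2] - src[0], src[3] - src[1]
    p_y, p_x = src[0] + 0.5 * p_h, src[1] + 0.5 * p_w
    g_h, g_w = dst[2] - dst[0], dst[3] - dst[1]
    g_y, g_x = dst[0] + 0.5 * g_h, dst[1] + 0.5 * g_w

    t_y, t_x = (g_y - p_y) / p_h, (g_x - p_x) / p_w  # 0.25, 0.25
    t_h, t_w = np.log(g_h / p_h), np.log(g_w / p_w)  # 0.0, 0.0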
diff --git a/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py b/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py
index 666b3e7e0a..c29b3ef2c9 100644
--- a/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py
+++ b/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py
@@ -35,23 +35,23 @@ def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
         ~numpy.ndarray:
         An array of shape :math:`(R, 4)`.
         Each element is a set of coordinates of a bounding box.
-        The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+        The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
         of a bounding box.

     """
-    px = base_size / 2.
     py = base_size / 2.
+    px = base_size / 2.

     anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
                            dtype=np.float32)
     for i in six.moves.range(len(ratios)):
         for j in six.moves.range(len(anchor_scales)):
-            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
             h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
+            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])

             index = i * len(anchor_scales) + j
-            anchor_base[index, 0] = px - w / 2.
-            anchor_base[index, 1] = py - h / 2.
-            anchor_base[index, 2] = px + w / 2.
-            anchor_base[index, 3] = py + h / 2.
+            anchor_base[index, 0] = py - h / 2.
+            anchor_base[index, 1] = px - w / 2.
+            anchor_base[index, 2] = py + h / 2.
+            anchor_base[index, 3] = px + w / 2.
     return anchor_base
diff --git a/chainercv/links/model/faster_rcnn/utils/loc2bbox.py b/chainercv/links/model/faster_rcnn/utils/loc2bbox.py
index 96bfcbc7ac..00d111ac75 100644
--- a/chainercv/links/model/faster_rcnn/utils/loc2bbox.py
+++ b/chainercv/links/model/faster_rcnn/utils/loc2bbox.py
@@ -8,16 +8,16 @@ def loc2bbox(src_bbox, loc):
     :meth:`bbox2loc`, this function decodes the representation to
     coordinates in 2D image coordinates.

-    Given scales and offsets :math:`t_x, t_y, t_w, t_h` and a bounding
-    box whose center is :math:`p_x, p_y` and size :math:`p_w, p_h`,
-    the decoded bounding box's center :math:`\\hat{g}_x`, :math:`\\hat{g}_y`
-    and size :math:`\\hat{g}_w`, :math:`\\hat{g}_h` are calculated
+    Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding
+    box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`,
+    the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x`
+    and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated
     by the following formulas.

-    * :math:`\\hat{g}_x = p_w t_x + p_x`
     * :math:`\\hat{g}_y = p_h t_y + p_y`
-    * :math:`\\hat{g}_w = p_w \\exp(t_w)`
+    * :math:`\\hat{g}_x = p_w t_x + p_x`
     * :math:`\\hat{g}_h = p_h \\exp(t_h)`
+    * :math:`\\hat{g}_w = p_w \\exp(t_w)`

     The decoding formulas are used in works such as R-CNN [#]_.
@@ -30,16 +30,16 @@ def loc2bbox(src_bbox, loc):
     Args:
         src_bbox (array): Coordinates of bounding boxes.
             Its shape is :math:`(R, 4)`. These coordinates are used to
-            compute :math:`p_x, p_y, p_w, p_h`.
+            compute :math:`p_y, p_x, p_h, p_w`.
         loc (array): An array with offsets and scales.
             The shapes of :obj:`src_bbox` and :obj:`loc` should be the same.
-            This contains values :math:`t_x, t_y, t_w, t_h`.
+            This contains values :math:`t_y, t_x, t_h, t_w`.

     Returns:
         array:
         Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \
        The second axis contains four values \
-        :math:`\\hat{g}_x, \\hat{g}_y, \\hat{g}_w, \\hat{g}_h`.
+        :math:`\\hat{g}_y, \\hat{g}_x, \\hat{g}_h, \\hat{g}_w`.
""" xp = cuda.get_array_module(src_bbox) @@ -49,25 +49,25 @@ def loc2bbox(src_bbox, loc): src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) - src_width = src_bbox[:, 2] - src_bbox[:, 0] - src_height = src_bbox[:, 3] - src_bbox[:, 1] - src_ctr_x = src_bbox[:, 0] + 0.5 * src_width - src_ctr_y = src_bbox[:, 1] + 0.5 * src_height + src_height = src_bbox[:, 2] - src_bbox[:, 0] + src_width = src_bbox[:, 3] - src_bbox[:, 1] + src_ctr_y = src_bbox[:, 0] + 0.5 * src_height + src_ctr_x = src_bbox[:, 1] + 0.5 * src_width - dx = loc[:, 0::4] - dy = loc[:, 1::4] - dw = loc[:, 2::4] - dh = loc[:, 3::4] + dy = loc[:, 0::4] + dx = loc[:, 1::4] + dh = loc[:, 2::4] + dw = loc[:, 3::4] - ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis] ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis] - w = xp.exp(dw) * src_width[:, xp.newaxis] + ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis] h = xp.exp(dh) * src_height[:, xp.newaxis] + w = xp.exp(dw) * src_width[:, xp.newaxis] dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype) - dst_bbox[:, 0::4] = ctr_x - 0.5 * w - dst_bbox[:, 1::4] = ctr_y - 0.5 * h - dst_bbox[:, 2::4] = ctr_x + 0.5 * w - dst_bbox[:, 3::4] = ctr_y + 0.5 * h + dst_bbox[:, 0::4] = ctr_y - 0.5 * h + dst_bbox[:, 1::4] = ctr_x - 0.5 * w + dst_bbox[:, 2::4] = ctr_y + 0.5 * h + dst_bbox[:, 3::4] = ctr_x + 0.5 * w return dst_bbox diff --git a/chainercv/links/model/faster_rcnn/utils/proposal_creator.py b/chainercv/links/model/faster_rcnn/utils/proposal_creator.py index 97c054a7c5..667e35a4db 100644 --- a/chainercv/links/model/faster_rcnn/utils/proposal_creator.py +++ b/chainercv/links/model/faster_rcnn/utils/proposal_creator.py @@ -82,7 +82,7 @@ def __call__(self, loc, score, Its shape is :math:`(R,)`. anchor (array): Coordinates of anchors. Its shape is :math:`(R, 4)`. - img_size (tuple of ints): A tuple :obj:`width, height`, + img_size (tuple of ints): A tuple :obj:`height, width`, which contains image size after scaling. scale (float): The scaling factor used to scale an image after reading it from a file. @@ -118,9 +118,9 @@ def __call__(self, loc, score, # Remove predicted boxes with either height or width < threshold. 
         min_size = self.min_size * scale
-        ws = roi[:, 2] - roi[:, 0]
-        hs = roi[:, 3] - roi[:, 1]
-        keep = np.where((ws >= min_size) & (hs >= min_size))[0]
+        hs = roi[:, 2] - roi[:, 0]
+        ws = roi[:, 3] - roi[:, 1]
+        keep = np.where((hs >= min_size) & (ws >= min_size))[0]
         roi = roi[keep, :]
         score = score[keep]
diff --git a/chainercv/links/model/segnet/segnet_basic.py b/chainercv/links/model/segnet/segnet_basic.py
index d975b8ab49..4bd4857fdb 100644
--- a/chainercv/links/model/segnet/segnet_basic.py
+++ b/chainercv/links/model/segnet/segnet_basic.py
@@ -177,7 +177,7 @@ def predict(self, imgs):
             score = chainer.cuda.to_cpu(score)
             if score.shape != (C, H, W):
                 dtype = score.dtype
-                score = resize(score, (W, H)).astype(dtype)
+                score = resize(score, (H, W)).astype(dtype)

             label = np.argmax(score, axis=0)
             labels.append(label)
diff --git a/chainercv/links/model/ssd/ssd.py b/chainercv/links/model/ssd/ssd.py
index f00689c680..c48b915c00 100644
--- a/chainercv/links/model/ssd/ssd.py
+++ b/chainercv/links/model/ssd/ssd.py
@@ -76,25 +76,25 @@ def __init__(
         super(SSD, self).__init__(extractor=extractor, multibox=multibox)

-        # the format of default_bbox is (center_x, center_y, width, height)
+        # the format of default_bbox is (center_y, center_x, height, width)
         self._default_bbox = list()
         for k, grid in enumerate(extractor.grids):
             for v, u in itertools.product(range(grid), repeat=2):
-                cx = (u + 0.5) * steps[k]
                 cy = (v + 0.5) * steps[k]
+                cx = (u + 0.5) * steps[k]

                 s = sizes[k]
-                self._default_bbox.append((cx, cy, s, s))
+                self._default_bbox.append((cy, cx, s, s))

                 s = np.sqrt(sizes[k] * sizes[k + 1])
-                self._default_bbox.append((cx, cy, s, s))
+                self._default_bbox.append((cy, cx, s, s))

                 s = sizes[k]
                 for ar in multibox.aspect_ratios[k]:
                     self._default_bbox.append(
-                        (cx, cy, s * np.sqrt(ar), s / np.sqrt(ar)))
+                        (cy, cx, s / np.sqrt(ar), s * np.sqrt(ar)))
                     self._default_bbox.append(
-                        (cx, cy, s / np.sqrt(ar), s * np.sqrt(ar)))
+                        (cy, cx, s * np.sqrt(ar), s / np.sqrt(ar)))

         self._default_bbox = np.stack(self._default_bbox)

     @property
@@ -141,13 +141,13 @@ def __call__(self, x):
     def _decode(self, loc, conf):
         xp = self.xp
-        # the format of bbox is (center_x, center_y, width, height)
+        # the format of bbox is (center_y, center_x, height, width)
         bboxes = xp.dstack((
             self._default_bbox[:, :2] +
             loc[:, :, :2] * self.variance[0] * self._default_bbox[:, 2:],
             self._default_bbox[:, 2:] *
             xp.exp(loc[:, :, 2:] * self.variance[1])))
-        # convert the format of bbox to (x_min, y_min, x_max, y_max)
+        # convert the format of bbox to (y_min, x_min, y_max, x_max)
         bboxes[:, :, :2] -= bboxes[:, :, 2:] / 2
         bboxes[:, :, 2:] += bboxes[:, :, :2]
         scores = xp.exp(conf)
@@ -236,7 +236,7 @@ def predict(self, imgs):
            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
                where :math:`R` is the number of bounding boxes in an image. \
                Each bounding box is organized by \
-               :obj:`(x_min, y_min, x_max, y_max)` \
+               :obj:`(y_min, x_min, y_max, x_max)` \
                in the second axis.
            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
                Each value indicates the class of the bounding box. \
@@ -253,7 +253,7 @@ def predict(self, imgs):
             _, H, W = img.shape
             img = self._prepare(img)
             x.append(self.xp.array(img))
-            sizes.append((W, H))
+            sizes.append((H, W))

         x = chainer.Variable(self.xp.stack(x), volatile=chainer.flag.ON)
         loc, conf = self(x)
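The last two in-place lines of _decode convert boxes from (center_y, center_x, height, width) form to corner form. A numpy check with one made-up box:

    import numpy as np

    bbox = np.array([[30., 40., 20., 10.]])  # (center_y, center_x, height, width)
    bbox[:, :2] -= bbox[:, 2:] / 2  # (y_min, x_min) = center - size / 2
    bbox[:, 2:] += bbox[:, :2]      # (y_max, x_max) = size + (y_min, x_min)
    print(bbox)  # [[20. 35. 40. 45.]]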
diff --git a/chainercv/links/model/ssd/ssd_vgg16.py b/chainercv/links/model/ssd/ssd_vgg16.py
index 5ed012b8a7..48e1f90812 100644
--- a/chainercv/links/model/ssd/ssd_vgg16.py
+++ b/chainercv/links/model/ssd/ssd_vgg16.py
@@ -282,7 +282,7 @@ class SSD300(SSD):
         'voc0712': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/ssd300_voc0712_2017_05_24.npz'
+            'download/0.0.3/ssd300_voc0712_2017_06_06.npz'
         }
     }

@@ -338,7 +338,7 @@ class SSD512(SSD):
         'voc0712': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/ssd512_voc0712_2017_05_24.npz'
+            'download/0.0.3/ssd512_voc0712_2017_06_06.npz'
         }
     }
diff --git a/chainercv/transforms/bbox/flip_bbox.py b/chainercv/transforms/bbox/flip_bbox.py
index 2968120acf..668c5e96a7 100644
--- a/chainercv/transforms/bbox/flip_bbox.py
+++ b/chainercv/transforms/bbox/flip_bbox.py
@@ -1,38 +1,38 @@
-def flip_bbox(bbox, size, x_flip=False, y_flip=False):
+def flip_bbox(bbox, size, y_flip=False, x_flip=False):
     """Flip bounding boxes accordingly.

     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

     Args:
         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
             :math:`R` is the number of bounding boxes.
-        size (tuple): A tuple of length 2. The width and the height
+        size (tuple): A tuple of length 2. The height and the width
             of the image.
-        x_flip (bool): Flip bounding box according to a horizontal flip of
-            an image.
         y_flip (bool): Flip bounding box according to a vertical flip of
             an image.
+        x_flip (bool): Flip bounding box according to a horizontal flip of
+            an image.

     Returns:
         ~numpy.ndarray:
         Bounding boxes flipped according to the given flips.

     """
-    W, H = size
+    H, W = size
     bbox = bbox.copy()
-    if x_flip:
-        x_max = W - 1 - bbox[:, 0]
-        x_min = W - 1 - bbox[:, 2]
-        bbox[:, 0] = x_min
-        bbox[:, 2] = x_max
     if y_flip:
-        y_max = H - 1 - bbox[:, 1]
-        y_min = H - 1 - bbox[:, 3]
-        bbox[:, 1] = y_min
-        bbox[:, 3] = y_max
+        y_max = H - 1 - bbox[:, 0]
+        y_min = H - 1 - bbox[:, 2]
+        bbox[:, 0] = y_min
+        bbox[:, 2] = y_max
+    if x_flip:
+        x_max = W - 1 - bbox[:, 1]
+        x_min = W - 1 - bbox[:, 3]
+        bbox[:, 1] = x_min
+        bbox[:, 3] = x_max
     return bbox
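A quick check of the new y_flip branch above, with one made-up box on a 100x80 image (this assumes the post-patch signature and the chainercv.transforms re-export):

    import numpy as np
    from chainercv.transforms import flip_bbox

    bbox = np.array([[10., 20., 30., 40.]])  # (y_min, x_min, y_max, x_max)
    flipped = flip_bbox(bbox, size=(100, 80), y_flip=True)
    # rows are renumbered as H - 1 - y, so the box becomes (69, 20, 89, 40);
    # the x coordinates are untouched
    print(flipped)  # [[69. 20. 89. 40.]]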
diff --git a/chainercv/transforms/bbox/resize_bbox.py b/chainercv/transforms/bbox/resize_bbox.py
index 59c1293858..d71de4de0a 100644
--- a/chainercv/transforms/bbox/resize_bbox.py
+++ b/chainercv/transforms/bbox/resize_bbox.py
@@ -4,16 +4,16 @@ def resize_bbox(bbox, in_size, out_size):
     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

     Args:
         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
             :math:`R` is the number of bounding boxes.
-        in_size (tuple): A tuple of length 2. The width and the height
+        in_size (tuple): A tuple of length 2. The height and the width
             of the image before being resized.
-        out_size (tuple): A tuple of length 2. The width and the height
+        out_size (tuple): A tuple of length 2. The height and the width
             of the image after being resized.

     Returns:
@@ -22,10 +22,10 @@ def resize_bbox(bbox, in_size, out_size):

     """
     bbox = bbox.copy()
-    x_scale = float(out_size[0]) / in_size[0]
-    y_scale = float(out_size[1]) / in_size[1]
-    bbox[:, 0] = x_scale * bbox[:, 0]
-    bbox[:, 2] = x_scale * bbox[:, 2]
-    bbox[:, 1] = y_scale * bbox[:, 1]
-    bbox[:, 3] = y_scale * bbox[:, 3]
+    y_scale = float(out_size[0]) / in_size[0]
+    x_scale = float(out_size[1]) / in_size[1]
+    bbox[:, 0] = y_scale * bbox[:, 0]
+    bbox[:, 2] = y_scale * bbox[:, 2]
+    bbox[:, 1] = x_scale * bbox[:, 1]
+    bbox[:, 3] = x_scale * bbox[:, 3]
     return bbox
diff --git a/chainercv/transforms/bbox/translate_bbox.py b/chainercv/transforms/bbox/translate_bbox.py
index 1afaf64958..c6410a8915 100644
--- a/chainercv/transforms/bbox/translate_bbox.py
+++ b/chainercv/transforms/bbox/translate_bbox.py
@@ -1,22 +1,23 @@
-def translate_bbox(bbox, x_offset=0, y_offset=0):
+def translate_bbox(bbox, y_offset=0, x_offset=0):
     """Translate bounding boxes.

     This method is mainly used together with image transforms, such as
     padding and cropping, which translates the top left point of the
     image from
-    coordinate :math:`(0, 0)` to coordinate :math:`(x\_offset, y\_offset)`.
+    coordinate :math:`(0, 0)` to coordinate
+    :math:`(y, x) = (y\_offset, x\_offset)`.

     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

     Args:
         bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
-        x_offset (int or float): The offset along x axis.
         y_offset (int or float): The offset along y axis.
+        x_offset (int or float): The offset along x axis.

     Returns:
         ~numpy.ndarray:
@@ -25,7 +26,7 @@ def translate_bbox(bbox, x_offset=0, y_offset=0):

     """
     out_bbox = bbox.copy()
-    out_bbox[:, :2] += (x_offset, y_offset)
-    out_bbox[:, 2:] += (x_offset, y_offset)
+    out_bbox[:, :2] += (y_offset, x_offset)
+    out_bbox[:, 2:] += (y_offset, x_offset)

     return out_bbox
diff --git a/chainercv/transforms/image/center_crop.py b/chainercv/transforms/image/center_crop.py
index d584b1f7ca..2301993ef8 100644
--- a/chainercv/transforms/image/center_crop.py
+++ b/chainercv/transforms/image/center_crop.py
@@ -8,7 +8,7 @@ def center_crop(img, size, return_param=False, copy=False):
         img (~numpy.ndarray): An image array to be cropped. This is in
             CHW format.
         size (tuple): The size of output image after cropping.
-            This value is :math:`(width, height)`.
+            This value is :math:`(height, width)`.
         return_param (bool): If :obj:`True`, this function returns
             information of slices.
         copy (bool): If :obj:`False`, a view of :obj:`img` is returned.
@@ -26,9 +26,9 @@ def center_crop(img, size, return_param=False, copy=False):
         contents are listed below with key, value-type and the description
         of the value.
-        * **x_slice** (*slice*): A slice used to crop the input image.\
-            The relation below holds together with :obj:`y_slice`.
-        * **y_slice** (*slice*): Similar to :obj:`x_slice`.
+        * **y_slice** (*slice*): A slice used to crop the input image.\
+            The relation below holds together with :obj:`x_slice`.
+        * **x_slice** (*slice*): Similar to :obj:`y_slice`.

     .. code::

         out_img = img[:, y_slice, x_slice]

     """
     _, H, W = img.shape
-    oW, oH = size
-    if oW > W or oH > H:
+    oH, oW = size
+    if oH > H or oW > W:
         raise ValueError('shape of image needs to be larger than size')

-    x_offset = int(round((W - oW) / 2.))
     y_offset = int(round((H - oH) / 2.))
+    x_offset = int(round((W - oW) / 2.))

-    x_slice = slice(x_offset, x_offset + oW)
     y_slice = slice(y_offset, y_offset + oH)
+    x_slice = slice(x_offset, x_offset + oW)

     img = img[:, y_slice, x_slice]

@@ -52,6 +52,6 @@ def center_crop(img, size, return_param=False, copy=False):
         img = img.copy()

     if return_param:
-        return img, {'x_slice': x_slice, 'y_slice': y_slice}
+        return img, {'y_slice': y_slice, 'x_slice': x_slice}
     else:
         return img
diff --git a/chainercv/transforms/image/flip.py b/chainercv/transforms/image/flip.py
index e0cab4b374..a377999964 100644
--- a/chainercv/transforms/image/flip.py
+++ b/chainercv/transforms/image/flip.py
@@ -1,20 +1,20 @@
-def flip(img, x_flip=False, y_flip=False, copy=False):
+def flip(img, y_flip=False, x_flip=False, copy=False):
     """Flip an image in vertical or horizontal direction as specified.

     Args:
         img (~numpy.ndarray): An array that gets flipped. This is in
             CHW format.
-        x_flip (bool): Flip in horizontal direction.
         y_flip (bool): Flip in vertical direction.
+        x_flip (bool): Flip in horizontal direction.
         copy (bool): If False, a view of :obj:`img` will be returned.

     Returns:
         Transformed :obj:`img` in CHW format.
     """
-    if x_flip:
-        img = img[:, :, ::-1]
     if y_flip:
         img = img[:, ::-1, :]
+    if x_flip:
+        img = img[:, :, ::-1]

     if copy:
         img = img.copy()
diff --git a/chainercv/transforms/image/random_crop.py b/chainercv/transforms/image/random_crop.py
index 9aebce8581..3a79c202e7 100644
--- a/chainercv/transforms/image/random_crop.py
+++ b/chainercv/transforms/image/random_crop.py
@@ -12,7 +12,7 @@ def random_crop(img, size, return_param=False, copy=False):
         img (~numpy.ndarray): An image array to be cropped. This is in
             CHW format.
         size (tuple): The size of output image after cropping.
-            This value is :math:`(width, height)`.
+            This value is :math:`(height, width)`.
         return_param (bool): If :obj:`True`, this function returns
             information of slices.
         copy (bool): If :obj:`False`, a view of :obj:`img` is returned.
@@ -30,24 +30,16 @@ def random_crop(img, size, return_param=False, copy=False):
         contents are listed below with key, value-type and the description
         of the value.

-        * **x_slice** (*slice*): A slice used to crop the input image.\
-            The relation below holds together with :obj:`y_slice`.
-        * **y_slice** (*slice*): Similar to :obj:`x_slice`.
+        * **y_slice** (*slice*): A slice used to crop the input image.\
+            The relation below holds together with :obj:`x_slice`.
+        * **x_slice** (*slice*): Similar to :obj:`y_slice`.

     .. code::
         out_img = img[:, y_slice, x_slice]

     """
-    W, H = size
-
-    if img.shape[2] == W:
-        x_offset = 0
-    elif img.shape[2] > W:
-        x_offset = random.choice(six.moves.range(img.shape[2] - W))
-    else:
-        raise ValueError('shape of image needs to be larger than output shape')
-    x_slice = slice(x_offset, x_offset + W)
+    H, W = size

     if img.shape[1] == H:
         y_offset = 0
@@ -57,12 +49,20 @@ def random_crop(img, size, return_param=False, copy=False):
         raise ValueError('shape of image needs to be larger than output shape')
     y_slice = slice(y_offset, y_offset + H)

+    if img.shape[2] == W:
+        x_offset = 0
+    elif img.shape[2] > W:
+        x_offset = random.choice(six.moves.range(img.shape[2] - W))
+    else:
+        raise ValueError('shape of image needs to be larger than output shape')
+    x_slice = slice(x_offset, x_offset + W)
+
     img = img[:, y_slice, x_slice]

     if copy:
         img = img.copy()

     if return_param:
-        return img, {'x_slice': x_slice, 'y_slice': y_slice}
+        return img, {'y_slice': y_slice, 'x_slice': x_slice}
     else:
         return img
diff --git a/chainercv/transforms/image/random_expand.py b/chainercv/transforms/image/random_expand.py
index 55160dfb9e..a05dc5735e 100644
--- a/chainercv/transforms/image/random_expand.py
+++ b/chainercv/transforms/image/random_expand.py
@@ -6,7 +6,7 @@ def random_expand(img, max_ratio=4, fill=0, return_param=False):
     """Expand an image randomly.

     This method randomly places the input image on a larger canvas. The size of
-    the canvas is :math:`(rW, rH)`, where :math:`(W, H)` is the size of the
+    the canvas is :math:`(rH, rW)`, where :math:`(H, W)` is the size of the
     input image and :math:`r` is a random ratio drawn from
     :math:`[1, max\_ratio]`. The canvas is filled by a value :obj:`fill`
     except for the region where the original image is placed.
@@ -39,16 +39,16 @@ def random_expand(img, max_ratio=4, fill=0, return_param=False):
         of the value.

         * **ratio** (*float*): The sampled value used to make the canvas.
-        * **x_offset** (*int*): The x coordinate of the top left corner\
-            of the image after placing on the canvas.
         * **y_offset** (*int*): The y coordinate of the top left corner of\
             the image after placing on the canvas.
+        * **x_offset** (*int*): The x coordinate of the top left corner\
+            of the image after placing on the canvas.

     """
     if max_ratio <= 1:
         if return_param:
-            return img, {'ratio': 1, 'x_offset': 0, 'y_offset': 0}
+            return img, {'ratio': 1, 'y_offset': 0, 'x_offset': 0}
         else:
             return img

@@ -57,15 +57,15 @@ def random_expand(img, max_ratio=4, fill=0, return_param=False):
     ratio = random.uniform(1, max_ratio)
     out_H, out_W = int(H * ratio), int(W * ratio)

-    x_offset = random.randint(0, out_W - W)
     y_offset = random.randint(0, out_H - H)
+    x_offset = random.randint(0, out_W - W)

     out_img = np.empty((C, out_H, out_W), dtype=img.dtype)
     out_img[:] = np.array(fill).reshape(-1, 1, 1)
     out_img[:, y_offset:y_offset + H, x_offset:x_offset + W] = img

     if return_param:
-        param = {'ratio': ratio, 'x_offset': x_offset, 'y_offset': y_offset}
+        param = {'ratio': ratio, 'y_offset': y_offset, 'x_offset': x_offset}
         return out_img, param
     else:
         return out_img
diff --git a/chainercv/transforms/image/random_flip.py b/chainercv/transforms/image/random_flip.py
index ce14051302..ff982a7262 100644
--- a/chainercv/transforms/image/random_flip.py
+++ b/chainercv/transforms/image/random_flip.py
@@ -1,15 +1,15 @@
 import random


-def random_flip(img, x_random=False, y_random=False,
+def random_flip(img, y_random=False, x_random=False,
                 return_param=False, copy=False):
     """Randomly flip an image in vertical or horizontal direction.
     Args:
         img (~numpy.ndarray): An array that gets flipped. This is in
             CHW format.
-        x_random (bool): Randomly flip in horizontal direction.
         y_random (bool): Randomly flip in vertical direction.
+        x_random (bool): Randomly flip in horizontal direction.
         return_param (bool): Returns information of flip.
         copy (bool): If False, a view of :obj:`img` will be returned.

@@ -25,27 +25,27 @@ def random_flip(img, x_random=False, y_random=False,
         contents are listed below with key, value-type and the description
         of the value.

-        * **x_flip** (*bool*): Whether the image was flipped in the\
-            horizontal direction or not.
         * **y_flip** (*bool*): Whether the image was flipped in the\
             vertical direction or not.
+        * **x_flip** (*bool*): Whether the image was flipped in the\
+            horizontal direction or not.

     """
-    x_flip, y_flip = False, False
-    if x_random:
-        x_flip = random.choice([True, False])
+    y_flip, x_flip = False, False
     if y_random:
         y_flip = random.choice([True, False])
+    if x_random:
+        x_flip = random.choice([True, False])

-    if x_flip:
-        img = img[:, :, ::-1]
     if y_flip:
         img = img[:, ::-1, :]
+    if x_flip:
+        img = img[:, :, ::-1]

     if copy:
         img = img.copy()

     if return_param:
-        return img, {'x_flip': x_flip, 'y_flip': y_flip}
+        return img, {'y_flip': y_flip, 'x_flip': x_flip}
     else:
         return img
diff --git a/chainercv/transforms/image/resize.py b/chainercv/transforms/image/resize.py
index e29cc5c8b1..e056f1437e 100644
--- a/chainercv/transforms/image/resize.py
+++ b/chainercv/transforms/image/resize.py
@@ -16,7 +16,8 @@ def _resize(img, size, interpolation):
             cv_interpolation = cv2.INTER_CUBIC
         elif interpolation == PIL.Image.LANCZOS:
             cv_interpolation = cv2.INTER_LANCZOS4
-        img = cv2.resize(img, dsize=size, interpolation=cv_interpolation)
+        H, W = size
+        img = cv2.resize(img, dsize=(W, H), interpolation=cv_interpolation)

         # If input is a grayscale image, cv2 returns a two-dimensional array.
         if len(img.shape) == 2:
@@ -32,11 +33,11 @@ def _resize(img, size, interpolation):

     def _resize(img, size, interpolation):
         C = img.shape[0]
-        W, H = size
+        H, W = size
         out = np.empty((C, H, W), dtype=img.dtype)
         for ch, out_ch in zip(img, out):
             ch = PIL.Image.fromarray(ch, mode='F')
-            out_ch[:] = ch.resize(size, resample=interpolation)
+            out_ch[:] = ch.resize((W, H), resample=interpolation)
         return out

@@ -57,7 +58,7 @@ def resize(img, size, interpolation=PIL.Image.BILINEAR):
         img (~numpy.ndarray): An array to be transformed.
             This is in CHW format and the type should be :obj:`numpy.float32`.
         size (tuple): This is a tuple of length 2. Its elements are
-            ordered as (width, height).
+            ordered as (height, width).
         interpolation (int): Determines sampling strategy. This is one of
             :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BILINEAR`,
             :obj:`PIL.Image.BICUBIC`, :obj:`PIL.Image.LANCZOS`.
diff --git a/chainercv/transforms/image/resize_contain.py b/chainercv/transforms/image/resize_contain.py
index c124342236..600f9cd041 100644
--- a/chainercv/transforms/image/resize_contain.py
+++ b/chainercv/transforms/image/resize_contain.py
@@ -6,8 +6,8 @@ def resize_contain(img, size, fill=0, return_param=False):
     """Resize the image to fit in the given area while keeping aspect ratio.

-    If both the width and the height in :obj:`size` are larger than the
-    width and the height of the :obj:`img`, the :obj:`img` is placed on
+    If both the height and the width in :obj:`size` are larger than
+    the height and the width of the :obj:`img`, the :obj:`img` is placed on
     the center with an appropriate padding to match :obj:`size`.
     Otherwise, the input image is scaled to fit in a canvas whose size
@@ -17,7 +17,7 @@ def resize_contain(img, size, fill=0, return_param=False):
         img (~numpy.ndarray): An array to be transformed. This is in
             CHW format.
         size (tuple of two ints): A tuple of two elements:
-            :obj:`width, height`. The size of the image after resizing.
+            :obj:`height, width`. The size of the image after resizing.
         fill (float, tuple or ~numpy.ndarray): The value of padded pixels.
         return_param (bool): Returns information of resizing and offsetting.

@@ -33,30 +33,30 @@ def resize_contain(img, size, fill=0, return_param=False):
         contents are listed below with key, value-type and the description
         of the value.

-        * **x_offset** (*int*): The x coordinate of the top left corner\
-            of the image after placing on the canvas.
         * **y_offset** (*int*): The y coordinate of the top left corner of\
             the image after placing on the canvas.
+        * **x_offset** (*int*): The x coordinate of the top left corner\
+            of the image after placing on the canvas.
         * **scaled_size** (*tuple*): The size to which the image is scaled\
             to before placing it on a canvas. This is a tuple of two elements:\
-            :obj:`width, height`.
+            :obj:`height, width`.

     """
     C, H, W = img.shape
-    out_W, out_H = size
+    out_H, out_W = size
     scale_h = out_H / float(H)
     scale_w = out_W / float(W)
     scale = min(min(scale_h, scale_w), 1.)
-    scaled_size = (int(W * scale), int(H * scale))
+    scaled_size = (int(H * scale), int(W * scale))

     if scale < 1.:
         img = resize(img, scaled_size)

-    x_slice, y_slice = _get_pad_slice(img, size=size)
+    y_slice, x_slice = _get_pad_slice(img, size=size)

     out_img = np.empty((C, out_H, out_W), dtype=img.dtype)
     out_img[:] = np.array(fill).reshape(-1, 1, 1)
     out_img[:, y_slice, x_slice] = img

     if return_param:
-        param = {'x_offset': x_slice.start, 'y_offset': y_slice.start,
+        param = {'y_offset': y_slice.start, 'x_offset': x_slice.start,
                  'scaled_size': scaled_size}
         return out_img, param
     else:
@@ -68,27 +68,28 @@ def _get_pad_slice(img, size):

     Args:
         img (~numpy.ndarray): This image is in format CHW.
-        size (tuple of two ints): (max_W, max_H).
+        size (tuple of two ints): (max_H, max_W).

     """
     _, H, W = img.shape

-    if W < size[0]:
-        diff_x = size[0] - W
-        margin_x = diff_x / 2
-        if diff_x % 2 == 0:
-            x_slice = slice(int(margin_x), int(size[0] - margin_x))
+    if H < size[0]:
+        diff_y = size[0] - H
+        margin_y = diff_y / 2
+        if diff_y % 2 == 0:
+            y_slice = slice(int(margin_y), int(size[0] - margin_y))
         else:
-            x_slice = slice(int(margin_x), int(size[0] - margin_x - 1))
+            y_slice = slice(int(margin_y), int(size[0] - margin_y - 1))
     else:
-        x_slice = slice(0, int(size[0]))
+        y_slice = slice(0, int(size[0]))

-    if H < size[1]:
-        diff_y = size[1] - H
-        margin_y = diff_y / 2
-        if diff_y % 2 == 0:
-            y_slice = slice(int(margin_y), int(size[1] - margin_y))
+    if W < size[1]:
+        diff_x = size[1] - W
+        margin_x = diff_x / 2
+        if diff_x % 2 == 0:
+            x_slice = slice(int(margin_x), int(size[1] - margin_x))
         else:
-            y_slice = slice(int(margin_y), int(size[1] - margin_y - 1))
+            x_slice = slice(int(margin_x), int(size[1] - margin_x - 1))
     else:
-        y_slice = slice(0, int(size[1]))
-    return x_slice, y_slice
+        x_slice = slice(0, int(size[1]))
+
+    return y_slice, x_slice
diff --git a/chainercv/transforms/image/scale.py b/chainercv/transforms/image/scale.py
index 0700a9b258..32b1c13a7c 100644
--- a/chainercv/transforms/image/scale.py
+++ b/chainercv/transforms/image/scale.py
@@ -27,19 +27,21 @@ def scale(img, size, fit_short=True):
     _, H, W = img.shape

     # If resizing is not necessary, return the input as is.
-    if fit_short and (W <= H and W == size) or (H <= W and H == size):
+    if fit_short and (H <= W and H == size) or (W <= H and W == size):
         return img
-    if not fit_short and (W >= H and W == size) or (H >= W and H == size):
+    if not fit_short and (H >= W and H == size) or (W >= H and W == size):
         return img

     if fit_short:
-        if W < H:
-            out_size = (size, int(size * H / W))
+        if H < W:
+            out_size = (size, int(size * W / H))
         else:
-            out_size = (int(size * W / H), size)
+            out_size = (int(size * H / W), size)
+
     else:
-        if W < H:
-            out_size = (int(size * W / H), size)
+        if H < W:
+            out_size = (int(size * H / W), size)
         else:
-            out_size = (size, int(size * H / W))
+            out_size = (size, int(size * W / H))
+
     return resize(img, out_size)
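A worked example of the fit_short behaviour above, with made-up numbers: when fit_short is true the short side is matched to size and the aspect ratio is preserved; otherwise the long side is matched.

    H, W, size = 300, 600, 150

    # fit_short: the short side (H) becomes `size`, the other side scales
    out_size = (size, int(size * W / H))
    assert out_size == (150, 300)

    # fit long (fit_short=False): the long side (W) becomes `size`
    out_size = (int(size * H / W), size)
    assert out_size == (75, 150)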
diff --git a/chainercv/transforms/image/ten_crop.py b/chainercv/transforms/image/ten_crop.py
index 1f363ba07d..79a6029785 100644
--- a/chainercv/transforms/image/ten_crop.py
+++ b/chainercv/transforms/image/ten_crop.py
@@ -25,16 +25,16 @@ def ten_crop(img, size):
         img (~numpy.ndarray): An image array to be cropped. This is in
             CHW format.
         size (tuple): The size of output images after cropping.
-            This value is :math:`(width, height)`.
+            This value is :math:`(height, width)`.

     Returns:
         The cropped arrays. The shape of tensor is :math:`(10, C, H, W)`.

     """
-    W, H = size
+    H, W = size
     iH, iW = img.shape[1:3]

-    if iW < W or iH < H:
+    if iH < H or iW < W:
         raise ValueError('shape of image needs to be larger than output shape')

     crops = np.stack((
diff --git a/chainercv/transforms/keypoint/flip_keypoint.py b/chainercv/transforms/keypoint/flip_keypoint.py
index 749c82eb04..76a89befb3 100644
--- a/chainercv/transforms/keypoint/flip_keypoint.py
+++ b/chainercv/transforms/keypoint/flip_keypoint.py
@@ -1,28 +1,28 @@
-def flip_keypoint(keypoint, size, x_flip=False, y_flip=False):
+def flip_keypoint(keypoint, size, y_flip=False, x_flip=False):
     """Modify keypoints according to image flips.

     Args:
         keypoint (~numpy.ndarray): Keypoints in the image.
             The shape of this array is :math:`(K, 2)`. :math:`K` is the number
             of keypoints in the image.
-            The last dimension is composed of :math:`x` and :math:`y`
+            The last dimension is composed of :math:`y` and :math:`x`
             coordinates of the keypoints.
-        size (tuple): A tuple of length 2. The width and the height
+        size (tuple): A tuple of length 2. The height and the width
             of the image which is associated with the keypoints.
-        x_flip (bool): Modify keypoints according to a horizontal flip of
-            an image.
         y_flip (bool): Modify keypoints according to a vertical flip of
             an image.
+        x_flip (bool): Modify keypoints according to a horizontal flip of
+            an image.

     Returns:
         ~numpy.ndarray:
         Keypoints modified according to image flips.

     """
-    W, H = size
+    H, W = size
     keypoint = keypoint.copy()
-    if x_flip:
-        keypoint[:, 0] = W - 1 - keypoint[:, 0]
     if y_flip:
-        keypoint[:, 1] = H - 1 - keypoint[:, 1]
+        keypoint[:, 0] = H - 1 - keypoint[:, 0]
+    if x_flip:
+        keypoint[:, 1] = W - 1 - keypoint[:, 1]
     return keypoint
diff --git a/chainercv/transforms/keypoint/resize_keypoint.py b/chainercv/transforms/keypoint/resize_keypoint.py
index 9b79e2ca35..568b08088e 100644
--- a/chainercv/transforms/keypoint/resize_keypoint.py
+++ b/chainercv/transforms/keypoint/resize_keypoint.py
@@ -5,11 +5,11 @@ def resize_keypoint(keypoint, in_size, out_size):
         keypoint (~numpy.ndarray): Keypoints in the image.
             The shape of this array is :math:`(K, 2)`. :math:`K` is the
             number of keypoints in the image.
-            The last dimension is composed of :math:`x` and :math:`y`
+            The last dimension is composed of :math:`y` and :math:`x`
             coordinates of the keypoints.
-        in_size (tuple): A tuple of length 2. The width and the height
+        in_size (tuple): A tuple of length 2. The height and the width
             of the image before being resized.
-        out_size (tuple): A tuple of length 2. The width and the height
+        out_size (tuple): A tuple of length 2. The height and the width
             of the image after being resized.

     Returns:
@@ -18,8 +18,8 @@ def resize_keypoint(keypoint, in_size, out_size):

     """
     keypoint = keypoint.copy()
-    x_scale = float(out_size[0]) / in_size[0]
-    y_scale = float(out_size[1]) / in_size[1]
-    keypoint[:, 0] = x_scale * keypoint[:, 0]
-    keypoint[:, 1] = y_scale * keypoint[:, 1]
+    y_scale = float(out_size[0]) / in_size[0]
+    x_scale = float(out_size[1]) / in_size[1]
+    keypoint[:, 0] = y_scale * keypoint[:, 0]
+    keypoint[:, 1] = x_scale * keypoint[:, 1]
     return keypoint
diff --git a/chainercv/transforms/keypoint/translate_keypoint.py b/chainercv/transforms/keypoint/translate_keypoint.py
index 1719474915..639617ddc5 100644
--- a/chainercv/transforms/keypoint/translate_keypoint.py
+++ b/chainercv/transforms/keypoint/translate_keypoint.py
@@ -1,19 +1,18 @@
-def translate_keypoint(keypoint, x_offset=0, y_offset=0):
+def translate_keypoint(keypoint, y_offset=0, x_offset=0):
     """Translate keypoints.

     This method is mainly used together with image transforms, such as padding
-    and cropping, which translates the top left point of the image from
-    coordinate :math:`(0, 0)` to coordinate :math:`(x\_offset, y\_offset)`.
-
+    and cropping, which translates the top left point of the image
+    to the coordinate :math:`(y, x) = (y\_offset, x\_offset)`.
     Args:
         keypoint (~numpy.ndarray): Keypoints in the image.
             The shape of this array is :math:`(K, 2)`. :math:`K` is the number
             of keypoints in the image.
-            The last dimension is composed of :math:`x` and :math:`y`
+            The last dimension is composed of :math:`y` and :math:`x`
             coordinates of the keypoints.
-        x_offset (int or float): The offset along x axis.
         y_offset (int or float): The offset along y axis.
+        x_offset (int or float): The offset along x axis.
     Returns:
         ~numpy.ndarray:
@@ -23,7 +22,7 @@ def translate_keypoint(keypoint, x_offset=0, y_offset=0):

     out_keypoint = keypoint.copy()

-    out_keypoint[:, 0] += x_offset
-    out_keypoint[:, 1] += y_offset
+    out_keypoint[:, 0] += y_offset
+    out_keypoint[:, 1] += x_offset

     return out_keypoint
diff --git a/chainercv/utils/bbox/bbox_iou.py b/chainercv/utils/bbox/bbox_iou.py
index db286bb624..79f4a40b84 100644
--- a/chainercv/utils/bbox/bbox_iou.py
+++ b/chainercv/utils/bbox/bbox_iou.py
@@ -32,12 +32,12 @@ def bbox_iou(bbox_a, bbox_b):
         raise IndexError

     xp = cuda.get_array_module(bbox_a)

-    # left top
-    lt = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
-    # right bottom
-    rb = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
+    # top left
+    tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
+    # bottom right
+    br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])

-    area_i = xp.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
+    area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2)
     area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
     area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
     return area_i / (area_a[:, None] + area_b - area_i)
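The renamed tl/br variables compute the same IoU as before; only the corner naming now matches the (y, x) ordering. A hand check with two made-up boxes:

    import numpy as np

    a = np.array([[0., 0., 4., 4.]])  # area 16
    b = np.array([[0., 2., 4., 6.]])  # area 16

    tl = np.maximum(a[:, None, :2], b[:, :2])                 # (0, 2)
    br = np.minimum(a[:, None, 2:], b[:, 2:])                 # (4, 4)
    inter = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)  # 8

    iou = inter / (16. + 16. - inter)
    print(iou)  # [[0.33333333]]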
diff --git a/chainercv/utils/bbox/non_maximum_suppression.py b/chainercv/utils/bbox/non_maximum_suppression.py
index 1d28a99dc7..cc9382d967 100644
--- a/chainercv/utils/bbox/non_maximum_suppression.py
+++ b/chainercv/utils/bbox/non_maximum_suppression.py
@@ -31,7 +31,7 @@ def non_maximum_suppression(bbox, thresh, score=None,
     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

@@ -80,9 +80,9 @@ def _non_maximum_suppression_cpu(bbox, thresh, score=None, limit=None):
     selec = np.zeros(bbox.shape[0], dtype=bool)

     for i, b in enumerate(bbox):
-        lt = np.maximum(b[:2], bbox[selec, :2])
-        rb = np.minimum(b[2:], bbox[selec, 2:])
-        area = np.prod(rb - lt, axis=1) * (lt < rb).all(axis=1)
+        tl = np.maximum(b[:2], bbox[selec, :2])
+        br = np.minimum(b[2:], bbox[selec, 2:])
+        area = np.prod(br - tl, axis=1) * (tl < br).all(axis=1)

         iou = area / (bbox_area[i] + bbox_area[selec] - area)
         if (iou >= thresh).any():
@@ -127,13 +127,13 @@ def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None):
     __device__
     inline float devIoU(float const *const bbox_a, float const *const bbox_b) {
-      float left = max(bbox_a[0], bbox_b[0]);
-      float right = min(bbox_a[2], bbox_b[2]);
-      float top = max(bbox_a[1], bbox_b[1]);
-      float bottom = min(bbox_a[3], bbox_b[3]);
-      float width = max(right - left, 0.f);
+      float top = max(bbox_a[0], bbox_b[0]);
+      float bottom = min(bbox_a[2], bbox_b[2]);
+      float left = max(bbox_a[1], bbox_b[1]);
+      float right = min(bbox_a[3], bbox_b[3]);
       float height = max(bottom - top, 0.f);
-      float area_i = width * height;
+      float width = max(right - left, 0.f);
+      float area_i = height * width;
       float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]);
       float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]);
       return area_i / (area_a + area_b - area_i);
diff --git a/chainercv/utils/testing/generate_random_bbox.py b/chainercv/utils/testing/generate_random_bbox.py
index 95f39b7026..fef310d9f4 100644
--- a/chainercv/utils/testing/generate_random_bbox.py
+++ b/chainercv/utils/testing/generate_random_bbox.py
@@ -6,7 +6,7 @@ def generate_random_bbox(n, img_size, min_length, max_length):

     Args:
         n (int): The number of bounding boxes.
-        img_size (tuple): A tuple of length 2. The width and the height
+        img_size (tuple): A tuple of length 2. The height and the width
             of the image on which bounding boxes locate.
         min_length (float): The minimum length of edges of bounding boxes.
         max_length (float): The maximum length of edges of bounding boxes.
@@ -15,17 +15,17 @@ def generate_random_bbox(n, img_size, min_length, max_length):
         numpy.ndarray:
         Coordinates of bounding boxes.
         Its shape is :math:`(R, 4)`. \
        Here, :math:`R` equals :obj:`n`.
-        The second axis contains :math:`x_{min}, y_{min}, x_{max}, y_{max}`,
+        The second axis contains :math:`y_{min}, x_{min}, y_{max}, x_{max}`,
        where
-        :math:`min\_length \\leq x_{max} - x_{min} < max\_length`
-        and :math:`min\_length \\leq y_{max} - y_{min} < max\_length`.
+ and + :math:`min\_length \\leq x_{max} - x_{min} < max\_length` """ - W, H = img_size - x_min = np.random.uniform(0, W - max_length, size=(n,)) + H, W = img_size y_min = np.random.uniform(0, H - max_length, size=(n,)) - x_max = x_min + np.random.uniform(min_length, max_length, size=(n,)) + x_min = np.random.uniform(0, W - max_length, size=(n,)) y_max = y_min + np.random.uniform(min_length, max_length, size=(n,)) - bbox = np.stack((x_min, y_min, x_max, y_max), axis=1).astype(np.float32) + x_max = x_min + np.random.uniform(min_length, max_length, size=(n,)) + bbox = np.stack((y_min, x_min, y_max, x_max), axis=1).astype(np.float32) return bbox diff --git a/chainercv/visualizations/vis_bbox.py b/chainercv/visualizations/vis_bbox.py index 016a98949c..e70110a90e 100644 --- a/chainercv/visualizations/vis_bbox.py +++ b/chainercv/visualizations/vis_bbox.py @@ -23,7 +23,7 @@ def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None): bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in the image. Each element is organized - by :obj:`(x_min, y_min, x_max, y_max)` in the second axis. + by :obj:`(y_min, x_min, y_max, x_max)` in the second axis. label (~numpy.ndarray): An integer array of shape :math:`(R,)`. The values correspond to id for label names stored in :obj:`label_names`. This is optional. @@ -55,9 +55,9 @@ def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None): return ax for i, bb in enumerate(bbox): - xy = (bb[0], bb[1]) - width = bb[2] - bb[0] - height = bb[3] - bb[1] + xy = (bb[1], bb[0]) + height = bb[2] - bb[0] + width = bb[3] - bb[1] ax.add_patch(plot.Rectangle( xy, width, height, fill=False, edgecolor='red', linewidth=3)) @@ -73,7 +73,7 @@ def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None): caption.append('{:.2f}'.format(sc)) if len(caption) > 0: - ax.text(bb[0], bb[1], + ax.text(bb[1], bb[0], ': '.join(caption), style='italic', bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 10}) diff --git a/chainercv/visualizations/vis_keypoint.py b/chainercv/visualizations/vis_keypoint.py index 6c34a63bd3..6974593652 100644 --- a/chainercv/visualizations/vis_keypoint.py +++ b/chainercv/visualizations/vis_keypoint.py @@ -24,7 +24,7 @@ def vis_keypoint(img, keypoint, kp_mask=None, ax=None): keypoint (~numpy.ndarray): An array with keypoint pairs whose shape is :math:`(K, 2)`, where :math:`K` is the number of keypoints in the array. - The second axis corresponds to :math:`x` and :math:`y` coordinates + The second axis corresponds to :math:`y` and :math:`x` coordinates of the keypoint. kp_mask (~numpy.ndarray, optional): A boolean array whose shape is :math:`(K,)`. 
            :math:`(K,)`. If :math:`i` th index is :obj:`True`, the
@@ -53,7 +53,7 @@ def vis_keypoint(img, keypoint, kp_mask=None, ax=None):

     for i in range(n_kp):
         if kp_mask[i]:
-            ax.scatter(keypoint[i][0], keypoint[i][1], c=colors[i], s=100)
+            ax.scatter(keypoint[i][1], keypoint[i][0], c=colors[i], s=100)

     ax.set_xlim(left=0, right=W)
     ax.set_ylim(bottom=H - 1, top=0)
diff --git a/examples/faster_rcnn/train.py b/examples/faster_rcnn/train.py
index bd3e6ddd6f..c4a09816ad 100644
--- a/examples/faster_rcnn/train.py
+++ b/examples/faster_rcnn/train.py
@@ -57,12 +57,13 @@ def transform(in_data):
         img = faster_rcnn.prepare(img)
         _, o_H, o_W = img.shape
         scale = o_H / H
-        bbox = transforms.resize_bbox(bbox, (W, H), (o_W, o_H))
+        bbox = transforms.resize_bbox(bbox, (H, W), (o_H, o_W))

         # horizontally flip
         img, params = transforms.random_flip(
             img, x_random=True, return_param=True)
-        bbox = transforms.flip_bbox(bbox, (o_W, o_H), params['x_flip'])
+        bbox = transforms.flip_bbox(
+            bbox, (o_H, o_W), x_flip=params['x_flip'])

         return img, bbox, label, scale

     train_data = TransformDataset(train_data, transform)
diff --git a/examples/ssd/caffe2npz.py b/examples/ssd/caffe2npz.py
index f72b193c8a..bbedf0abbf 100644
--- a/examples/ssd/caffe2npz.py
+++ b/examples/ssd/caffe2npz.py
@@ -67,6 +67,18 @@ def _skip_layer(self, _):
         pass


+def convert_xy_conv(l):
+    b = l.b.data.reshape(-1, 4)
+    b = b[:, [1, 0, 3, 2]]
+
+    out_C, in_C, kh, kw = l.W.shape
+    W = l.W.data.reshape(-1, 4, in_C, kh, kw)
+    W = W[:, [1, 0, 3, 2]]
+
+    l.b.data[:] = b.reshape(-1)
+    l.W.data[:] = W.reshape(-1, in_C, kh, kw)
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('caffemodel')
@@ -78,6 +90,14 @@ def main():
     # Convert weights so that they accept RGB images.
     model['extractor/conv1_1'].W.data[:] =\
         model['extractor/conv1_1'].W.data[:, ::-1]
+
+    # The pretrained model outputs coordinates in xy convention.
+    # This needs to be changed to yx convention, which is used
+    # in ChainerCV.
+    for child in model.children():
+        if child.name.startswith('multibox/loc'):
+            convert_xy_conv(model[child.name])
+
     serializers.save_npz(args.output, model)
diff --git a/tests/evaluations_tests/test_eval_detection_voc_ap.py b/tests/evaluations_tests/test_eval_detection_voc_ap.py
index 62cd3c0166..22f11d9c47 100644
--- a/tests/evaluations_tests/test_eval_detection_voc_ap.py
+++ b/tests/evaluations_tests/test_eval_detection_voc_ap.py
@@ -181,14 +181,14 @@ class TestEvalDetectionVOCAP(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         base_url = 'https://github.com/yuyu2172/' \
-            'share-weights/releases/download/0.0.2'
+            'share-weights/releases/download/0.0.3'

         cls.dataset = np.load(request.urlretrieve(os.path.join(
             base_url,
-            'voc_detection_dataset_2007_test_truncated_2017_06_02.npz'))[0])
+            'voc_detection_dataset_2007_test_truncated_2017_06_06.npz'))[0])
         cls.result = np.load(request.urlretrieve(os.path.join(
             base_url,
-            'voc_detection_result_2007_test_truncated_2017_06_02.npz'))[0])
+            'voc_detection_result_2007_test_truncated_2017_06_06.npz'))[0])

     def test_eval_detection_voc_ap(self):
         pred_bboxes = self.result['bboxes']
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py b/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py
index fa6a4a7fd5..b9decac22d 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py
@@ -59,10 +59,10 @@ def __call__(self, x, img_size, scale, test=False):
         rpn_locs = _random_array(self.xp, (B, n_anchor, 4))
         rpn_cls_scores = _random_array(self.xp, (B, n_anchor, 2))
         rois = self.xp.asarray(generate_random_bbox(
-            self.n_roi, img_size[::-1], 16, min(img_size)))
+            self.n_roi, img_size, 16, min(img_size)))
         roi_indices = self.xp.zeros((len(rois),), dtype=np.int32)
         anchor = self.xp.asarray(generate_random_bbox(
-            n_anchor, img_size[::-1], 16, min(img_size)))
+            n_anchor, img_size, 16, min(img_size)))
         return (chainer.Variable(rpn_locs),
                 chainer.Variable(rpn_cls_scores), rois, roi_indices, anchor)
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py b/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py
index 445920cfd4..571347d2d8 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py
@@ -41,7 +41,7 @@ def check_call(self):
         x = chainer.Variable(
             xp.random.uniform(
                 low=-1., high=1.,
-                size=(self.B, 3, feat_size[1] * 16, feat_size[0] * 16)
+                size=(self.B, 3, feat_size[0] * 16, feat_size[1] * 16)
             ).astype(np.float32), volatile=chainer.flag.ON)
         roi_cls_locs, roi_scores, rois, roi_indices = self.link(
             x, test=not self.train)
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py b/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py
index b800c5b264..d4b60238e8 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py
@@ -33,7 +33,7 @@ def setUp(self):
             proposal_creator_params=self.proposal_creator_params
         )
         self.x = np.random.uniform(size=(self.B, C, H, W)).astype(np.float32)
-        self.img_size = (W * feat_stride, H * feat_stride)
+        self.img_size = (H * feat_stride, W * feat_stride)

     def _check_call(self, x, img_size, test):
         _, _, H, W = x.shape
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py
index 84bab3352c..97ec20d8e8 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py
@@ -60,7 +60,7 @@ def test_loc2bbox_gpu(self):
 class TestDeltaEncodeDecodeConsistency(unittest.TestCase):

     def setUp(self):
-        self.src_bbox = generate_random_bbox(8, (32, 64), 4, 16)
+        self.src_bbox = generate_random_bbox(8, (64, 32), 4, 16)
         self.dst_bbox = self.src_bbox + 1

     def check_bbox_loc_conversions_consistency(
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py
index adaca1c220..4003ef4ad9 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py
@@ -13,15 +13,15 @@ class TestGenerateAnchorBase(unittest.TestCase):

     def test_generaete_anchor_base(self):
         gt = np.array(
-            [[-120., -24., 136., 40.],
-             [-248., -56., 264., 72.],
-             [-504., -120., 520., 136.],
+            [[-24., -120., 40., 136.],
+             [-56., -248., 72., 264.],
+             [-120., -504., 136., 520.],
              [-56., -56., 72., 72.],
              [-120., -120., 136., 136.],
              [-248., -248., 264., 264.],
-             [-24., -120., 40., 136.],
-             [-56., -248., 72., 264.],
-             [-120., -504., 136., 520.]])
+             [-120., -24., 136., 40.],
+             [-248., -56., 264., 72.],
+             [-504., -120., 520., 136.]])

         base_size = 16
         anchor_scales = [8, 16, 32]
diff --git a/tests/transforms_tests/bbox_tests/test_flip_bbox.py b/tests/transforms_tests/bbox_tests/test_flip_bbox.py
index 39285cdc1c..3cdfc5ee9a 100644
--- a/tests/transforms_tests/bbox_tests/test_flip_bbox.py
+++ b/tests/transforms_tests/bbox_tests/test_flip_bbox.py
@@ -12,16 +12,16 @@ def test_flip_bbox(self):
         bbox = np.random.uniform(
             low=0., high=32., size=(10, 4))

-        out = flip_bbox(bbox, size=(32, 34), x_flip=True)
+        out = flip_bbox(bbox, size=(34, 32), y_flip=True)
         bbox_expected = bbox.copy()
-        bbox_expected[:, 0] = 31 - bbox[:, 2]
-        bbox_expected[:, 2] = 31 - bbox[:, 0]
+        bbox_expected[:, 0] = 33 - bbox[:, 2]
+        bbox_expected[:, 2] = 33 - bbox[:, 0]
         np.testing.assert_equal(out, bbox_expected)

-        out = flip_bbox(bbox, size=(32, 34), y_flip=True)
+        out = flip_bbox(bbox, size=(34, 32), x_flip=True)
         bbox_expected = bbox.copy()
-        bbox_expected[:, 1] = 33 - bbox[:, 3]
-        bbox_expected[:, 3] = 33 - bbox[:, 1]
+        bbox_expected[:, 1] = 31 - bbox[:, 3]
+        bbox_expected[:, 3] = 31 - bbox[:, 1]
         np.testing.assert_equal(out, bbox_expected)
diff --git a/tests/transforms_tests/bbox_tests/test_resize_bbox.py b/tests/transforms_tests/bbox_tests/test_resize_bbox.py
index c9dfe5b085..effd847adc 100644
--- a/tests/transforms_tests/bbox_tests/test_resize_bbox.py
+++ b/tests/transforms_tests/bbox_tests/test_resize_bbox.py
@@ -12,12 +12,12 @@ def test_resize_bbox(self):
         bbox = np.random.uniform(
             low=0., high=32., size=(10, 4))

-        out = resize_bbox(bbox, in_size=(32, 32), out_size=(128, 64))
+        out = resize_bbox(bbox, in_size=(32, 32), out_size=(64, 128))
         bbox_expected = bbox.copy()
-        bbox_expected[:, 0] = bbox[:, 0] * 4
-        bbox_expected[:, 1] = bbox[:, 1] * 2
-        bbox_expected[:, 2] = bbox[:, 2] * 4
-        bbox_expected[:, 3] = bbox[:, 3] * 2
+        bbox_expected[:, 0] = bbox[:, 0] * 2
+        bbox_expected[:, 1] = bbox[:, 1] * 4
+        bbox_expected[:, 2] = bbox[:, 2] * 2
+        bbox_expected[:, 3] = bbox[:, 3] * 4
         np.testing.assert_equal(out, bbox_expected)
diff --git a/tests/transforms_tests/bbox_tests/test_translate_bbox.py b/tests/transforms_tests/bbox_tests/test_translate_bbox.py
index e021ee4e6a..ffca0982a8 100644
--- a/tests/transforms_tests/bbox_tests/test_translate_bbox.py
+++ b/tests/transforms_tests/bbox_tests/test_translate_bbox.py
@@ -12,12 +12,12 @@ def test_translate_bbox(self):
         bbox = np.random.uniform(
             low=0., high=32., size=(10, 4))

-        out = translate_bbox(bbox, x_offset=3, y_offset=5)
+        out = translate_bbox(bbox, y_offset=5, x_offset=3)
         bbox_expected = np.empty_like(bbox)
-        bbox_expected[:, 0] = bbox[:, 0] + 3
-        bbox_expected[:, 1] = bbox[:, 1] + 5
-        bbox_expected[:, 2] = bbox[:, 2] + 3
-        bbox_expected[:, 3] = bbox[:, 3] + 5
+        bbox_expected[:, 0] = bbox[:, 0] + 5
+        bbox_expected[:, 1] = bbox[:, 1] + 3
+        bbox_expected[:, 2] = bbox[:, 2] + 5
+        bbox_expected[:, 3] = bbox[:, 3] + 3

         np.testing.assert_equal(out, bbox_expected)
diff --git a/tests/transforms_tests/image_tests/test_center_crop.py b/tests/transforms_tests/image_tests/test_center_crop.py
index 059f8f242b..b3505fcf7b 100644
--- a/tests/transforms_tests/image_tests/test_center_crop.py
+++ b/tests/transforms_tests/image_tests/test_center_crop.py
@@ -11,13 +11,13 @@ class TestCenterCrop(unittest.TestCase):
     def test_center_crop(self):
         img = np.random.uniform(size=(3, 48, 32))

-        out, param = center_crop(img, (16, 24), return_param=True)
-        x_slice = param['x_slice']
+        out, param = center_crop(img, (24, 16), return_param=True)
         y_slice = param['y_slice']
+        x_slice = param['x_slice']

         np.testing.assert_equal(out, img[:, y_slice, x_slice])
-        self.assertEqual(x_slice, slice(8, 24))
         self.assertEqual(y_slice, slice(12, 36))
+        self.assertEqual(x_slice, slice(8, 24))


 testing.run_module(__name__, __file__)
diff --git a/tests/transforms_tests/image_tests/test_flip_transform.py b/tests/transforms_tests/image_tests/test_flip_transform.py
index 1efa8d77ee..e2e3161fd5 100644
--- a/tests/transforms_tests/image_tests/test_flip_transform.py
+++ b/tests/transforms_tests/image_tests/test_flip_transform.py
@@ -11,7 +11,7 @@ class TestRandomFlip(unittest.TestCase):
     def test_random_flip(self):
         img = np.random.uniform(size=(3, 24, 24))

-        out = flip(img, x_flip=True, y_flip=True)
+        out = flip(img, y_flip=True, x_flip=True)

         expected = img
         expected = expected[:, :, ::-1]
@@ -21,7 +21,7 @@ def test_random_flip(self):
     def test_random_flip_vertical(self):
         img = np.random.uniform(size=(3, 24, 24))

-        out = flip(img, x_flip=False, y_flip=True)
+        out = flip(img, y_flip=True, x_flip=False)

         expected = img
         expected = expected[:, ::-1, :]
diff --git a/tests/transforms_tests/image_tests/test_random_crop.py b/tests/transforms_tests/image_tests/test_random_crop.py
index bf814ece37..a4081b3a5d 100644
--- a/tests/transforms_tests/image_tests/test_random_crop.py
+++ b/tests/transforms_tests/image_tests/test_random_crop.py
@@ -11,14 +11,14 @@ class TestRandomCrop(unittest.TestCase):
     def test_random_crop(self):
         img = np.random.uniform(size=(3, 48, 32))

-        out, param = random_crop(img, (32, 48), return_param=True)
-        x_slice = param['x_slice']
+        out, param = random_crop(img, (48, 32), return_param=True)
         y_slice = param['y_slice']
+        x_slice = param['x_slice']

         np.testing.assert_equal(out, img)
-        self.assertEqual(x_slice, slice(0, 32))
         self.assertEqual(y_slice, slice(0, 48))
+        self.assertEqual(x_slice, slice(0, 32))

-        out = random_crop(img, (12, 24))
+        out = random_crop(img, (24, 12))
         self.assertEqual(out.shape[1:], (24, 12))


 testing.run_module(__name__, __file__)
diff --git a/tests/transforms_tests/image_tests/test_random_expand.py b/tests/transforms_tests/image_tests/test_random_expand.py
index b52ab01e40..77e2d9778b 100644
--- a/tests/transforms_tests/image_tests/test_random_expand.py
+++ b/tests/transforms_tests/image_tests/test_random_expand.py
@@ -23,8 +23,8 @@ def test_random_expand(self):
         out, param = random_expand(
             img, max_ratio=self.max_ratio, return_param=True)
         ratio = param['ratio']
-        x_offset = param['x_offset']
         y_offset = param['y_offset']
+        x_offset = param['x_offset']
         np.testing.assert_equal(
             out[:, y_offset:y_offset + 64, x_offset:x_offset + 32], img)
         self.assertGreaterEqual(ratio, 1)
@@ -47,9 +47,9 @@ def test_random_expand_fill(self):
         while True:
             out, param = random_expand(img, fill=self.fill, return_param=True)
-            x_offset = param['x_offset']
             y_offset = param['y_offset']
-            if x_offset > 0 or y_offset > 0:
+            x_offset = param['x_offset']
+            if y_offset > 0 or x_offset > 0:
                 break

         if isinstance(self.fill, int):
diff --git a/tests/transforms_tests/image_tests/test_random_flip_transform.py b/tests/transforms_tests/image_tests/test_random_flip.py
similarity index 90%
rename from tests/transforms_tests/image_tests/test_random_flip_transform.py
rename to tests/transforms_tests/image_tests/test_random_flip.py
index 712195249a..2574c41c29 100644
--- a/tests/transforms_tests/image_tests/test_random_flip_transform.py
+++ b/tests/transforms_tests/image_tests/test_random_flip.py
@@ -12,15 +12,15 @@ def test_random_flip(self):
         img = np.random.uniform(size=(3, 24, 24))

         out, param = random_flip(
-            img, x_random=True, y_random=True, return_param=True)
-        x_flip = param['x_flip']
+            img, y_random=True, x_random=True, return_param=True)
         y_flip = param['y_flip']
+        x_flip = param['x_flip']

         expected = img
-        if x_flip:
-            expected = expected[:, :, ::-1]
         if y_flip:
             expected = expected[:, ::-1, :]
+        if x_flip:
+            expected = expected[:, :, ::-1]

         np.testing.assert_equal(out, expected)
diff --git a/tests/transforms_tests/image_tests/test_resize.py b/tests/transforms_tests/image_tests/test_resize.py
index 853a28066a..805f796198 100644
--- a/tests/transforms_tests/image_tests/test_resize.py
+++ b/tests/transforms_tests/image_tests/test_resize.py
@@ -17,12 +17,12 @@ class TestResize(unittest.TestCase):
     def test_resize_color(self):
         img = np.random.uniform(size=(3, 24, 32))

-        out = resize(img, size=(64, 32), interpolation=self.interpolation)
+        out = resize(img, size=(32, 64), interpolation=self.interpolation)
         self.assertEqual(out.shape, (3, 32, 64))

     def test_resize_grayscale(self):
         img = np.random.uniform(size=(1, 24, 32))

-        out = resize(img, size=(64, 32), interpolation=self.interpolation)
+        out = resize(img, size=(32, 64), interpolation=self.interpolation)
         self.assertEqual(out.shape, (1, 32, 64))
diff --git a/tests/transforms_tests/image_tests/test_resize_contain.py b/tests/transforms_tests/image_tests/test_resize_contain.py
index a7462d4db9..17442c2b66 100644
--- a/tests/transforms_tests/image_tests/test_resize_contain.py
+++ b/tests/transforms_tests/image_tests/test_resize_contain.py
@@ -17,31 +17,31 @@ def test_resize_contain(self):
         img = np.random.uniform(size=(3, 32, 64))

         out, param = resize_contain(
-            img, (96, 48), fill=self.fill, return_param=True)
+            img, (48, 96), fill=self.fill, return_param=True)

         np.testing.assert_array_equal(img, out[:, 8:40, 16:80])
         np.testing.assert_array_equal(self.fill, out[:, 0, 0])
-        self.assertEqual(param['scaled_size'], (64, 32))
-        self.assertEqual(param['x_offset'], 16)
+        self.assertEqual(param['scaled_size'], (32, 64))
         self.assertEqual(param['y_offset'], 8)
+        self.assertEqual(param['x_offset'], 16)

     def test_resize_contain_canvas_small_x(self):
         img = np.random.uniform(size=(3, 32, 64))

         out, param = resize_contain(
-            img, (68, 16), fill=self.fill, return_param=True)
-        self.assertEqual(param['scaled_size'], (32, 16))
-        self.assertEqual(param['x_offset'], 18)
+            img, (16, 68), fill=self.fill, return_param=True)
+        self.assertEqual(param['scaled_size'], (16, 32))
         self.assertEqual(param['y_offset'], 0)
+        self.assertEqual(param['x_offset'], 18)

     def test_resize_contain_canvas_small_y(self):
         img = np.random.uniform(size=(3, 32, 64))

         out, param = resize_contain(
-            img, (16, 24), fill=self.fill, return_param=True)
-        self.assertEqual(param['scaled_size'], (16, 8))
-        self.assertEqual(param['x_offset'], 0)
+            img, (24, 16), fill=self.fill, return_param=True)
+        self.assertEqual(param['scaled_size'], (8, 16))
         self.assertEqual(param['y_offset'], 8)
+        self.assertEqual(param['x_offset'], 0)


 testing.run_module(__name__, __file__)
diff --git a/tests/transforms_tests/image_tests/test_ten_crop.py b/tests/transforms_tests/image_tests/test_ten_crop.py
index a220470dee..e09c4a2d20 100644
--- a/tests/transforms_tests/image_tests/test_ten_crop.py
+++ b/tests/transforms_tests/image_tests/test_ten_crop.py
@@ -11,14 +11,14 @@ class TestTenCrop(unittest.TestCase):
     def test_ten_crop(self):
         img = np.random.uniform(size=(3, 48, 32))

-        out = ten_crop(img, (32, 48))
+        out = ten_crop(img, (48, 32))
         self.assertEqual(out.shape, (10, 3, 48, 32))
         for crop in out[:5]:
             np.testing.assert_equal(crop, img)
         for crop in out[5:]:
             np.testing.assert_equal(crop[:, :, ::-1], img)

-        out = ten_crop(img, (12, 24))
+        out = ten_crop(img, (24, 12))
         self.assertEqual(out.shape, (10, 3, 24, 12))
diff --git a/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py b/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py
index d60d46ac2c..f059b47567 100644
--- a/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py
+++ b/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py
@@ -12,14 +12,14 @@ def test_flip_keypoint(self):
         keypoint = np.random.uniform(
             low=0., high=32., size=(12, 2))

-        out = flip_keypoint(keypoint, size=(32, 34), x_flip=True)
+        out = flip_keypoint(keypoint, size=(34, 32), y_flip=True)
         keypoint_expected = keypoint.copy()
-        keypoint_expected[:, 0] = 31 - keypoint[:, 0]
+        keypoint_expected[:, 0] = 33 - keypoint[:, 0]
         np.testing.assert_equal(out, keypoint_expected)

-        out = flip_keypoint(keypoint, size=(32, 34), y_flip=True)
+        out = flip_keypoint(keypoint, size=(34, 32), x_flip=True)
         keypoint_expected = keypoint.copy()
-        keypoint_expected[:, 1] = 33 - keypoint[:, 1]
+        keypoint_expected[:, 1] = 31 - keypoint[:, 1]
         np.testing.assert_equal(out, keypoint_expected)
diff --git a/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py b/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py
index 8f70a98d8e..5ed73951bc 100644
--- a/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py
+++ b/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py
@@ -12,8 +12,9 @@ def test_resize_keypoint(self):
         keypoint = np.random.uniform(
             low=0., high=32., size=(12, 2))

-        out = resize_keypoint(keypoint, in_size=(32, 32), out_size=(64, 64))
-        keypoint[:, :2] *= 2
+        out = resize_keypoint(keypoint, in_size=(16, 32), out_size=(8, 64))
+        keypoint[:, 0] *= 0.5
+        keypoint[:, 1] *= 2
         np.testing.assert_equal(out, keypoint)
diff --git a/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py b/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py
index 5e95fe5783..4978dd22cd 100644
--- a/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py
+++ b/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py
@@ -12,7 +12,7 @@ def test_translate_keypoint(self):
         keypoint = np.random.uniform(
             low=0., high=32., size=(10, 2))

-        out = translate_keypoint(keypoint, x_offset=3, y_offset=5)
+        out = translate_keypoint(keypoint, y_offset=3, x_offset=5)
         expected = np.empty_like(keypoint)
         expected[:, 0] = keypoint[:, 0] + 3
         expected[:, 1] = keypoint[:, 1] + 5
diff --git a/tests/utils_tests/testing_tests/test_geenerate_random_bbox.py b/tests/utils_tests/testing_tests/test_generate_random_bbox.py
similarity index 92%
rename from tests/utils_tests/testing_tests/test_geenerate_random_bbox.py
rename to tests/utils_tests/testing_tests/test_generate_random_bbox.py
index d28c956991..c59cfdd788 100644
--- a/tests/utils_tests/testing_tests/test_geenerate_random_bbox.py
+++ b/tests/utils_tests/testing_tests/test_generate_random_bbox.py
@@ -23,12 +23,12 @@ def test_generate_random_bbox(self):
         self.assertTrue(np.all(bbox[:, [1, 3]] < img_size[1]))
         self.assertTrue(np.all(bbox[:, [1, 3]] >= 0))

-        w = bbox[:, 2] - bbox[:, 0]
-        h = bbox[:, 3] - bbox[:, 1]
-        self.assertTrue(np.all(w < max_length))
-        self.assertTrue(np.all(w >= min_length))
+        h = bbox[:, 2] - bbox[:, 0]
+        w = bbox[:, 3] - bbox[:, 1]
         self.assertTrue(np.all(h < max_length))
         self.assertTrue(np.all(h >= min_length))
+        self.assertTrue(np.all(w < max_length))
+        self.assertTrue(np.all(w >= min_length))


 testing.run_module(__name__, __file__)
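Note: boxes loaded from sources outside ChainerCV will typically still be in `(x_min, y_min, x_max, y_max)` order. The following is a minimal sketch of the one-time conversion such callers need; `xy_to_yx` is a hypothetical helper, not part of this patch, but it applies the same `[1, 0, 3, 2]` column permutation that `convert_xy_conv` above applies to the SSD localization weights.

    import numpy as np

    # Hypothetical helper: swap (x_min, y_min, x_max, y_max) columns into
    # the (y_min, x_min, y_max, x_max) order used throughout ChainerCV.
    def xy_to_yx(bbox):
        return bbox[:, [1, 0, 3, 2]]

    bbox_xy = np.array([[10., 20., 50., 60.]], dtype=np.float32)
    print(xy_to_yx(bbox_xy))  # [[20. 10. 60. 50.]]

The permutation is its own inverse, so the same helper also converts yx boxes back to xy order.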