diff --git a/chainercv/datasets/cub/cub_keypoint_dataset.py b/chainercv/datasets/cub/cub_keypoint_dataset.py
index 1ef0a92cdb..53fbb02295 100644
--- a/chainercv/datasets/cub/cub_keypoint_dataset.py
+++ b/chainercv/datasets/cub/cub_keypoint_dataset.py
@@ -29,7 +29,7 @@ class CUBKeypointDataset(CUBDatasetBase):
     Note that :math:`K=15` in CUB dataset. Also note that not all fifteen
     keypoints are visible in an image. When a keypoint is not visible,
     the values stored for that keypoint are undefined. The second axis
-    corresponds to the :math:`x` and :math:`y` coordinates of the
+    corresponds to the :math:`y` and :math:`x` coordinates of the
     keypoints in the image.

     A keypoint mask array indicates whether a keypoint is visible in the
@@ -74,7 +74,8 @@ def __init__(self, data_dir='auto', crop_bbox=True,
             if id_ not in self.kp_mask_dict:
                 self.kp_mask_dict[id_] = []

-            keypoint = [float(v) for v in values[2:4]]
+            # (y, x) order
+            keypoint = [float(v) for v in values[2:4][::-1]]
             kp_mask = bool(int(values[4]))

             self.kp_dict[id_].append(keypoint)
@@ -92,9 +93,9 @@ def get_example(self, i):
         kp_mask = np.array(self.kp_mask_dict[i], dtype=np.bool)

         if self.crop_bbox:
-            bbox = self.bboxes[i]  # (x, y, width, height)
-            img =\
-                img[:, bbox[1]: bbox[1] + bbox[3], bbox[0]: bbox[0] + bbox[2]]
+            # (y_min, x_min, y_max, x_max)
+            bbox = self.bboxes[i].astype(np.int32)
+            img = img[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
             keypoint[:, :2] = keypoint[:, :2] - np.array([bbox[0], bbox[1]])

         if not self.return_mask:
@@ -105,8 +106,6 @@ def get_example(self, i):
             dtype=np.uint8, color=False)

         if self.crop_bbox:
-            mask = mask[:,
-                        bbox[1]: bbox[1] + bbox[3],
-                        bbox[0]: bbox[0] + bbox[2]]
+            mask = mask[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]

         return img, keypoint, kp_mask, mask
diff --git a/chainercv/datasets/cub/cub_label_dataset.py b/chainercv/datasets/cub/cub_label_dataset.py
index 8c1c811595..e17b19cc8f 100644
--- a/chainercv/datasets/cub/cub_label_dataset.py
+++ b/chainercv/datasets/cub/cub_label_dataset.py
@@ -1,3 +1,4 @@
+import numpy as np
 import os

 from chainercv.datasets.cub.cub_utils import CUBDatasetBase
@@ -50,9 +51,8 @@ def get_example(self, i):
             os.path.join(self.data_dir, 'images', self.fns[i]), color=True)

         if self.crop_bbox:
-            bbox = self.bboxes[i]  # (x, y, width, height)
-            img = img[:,
-                      bbox[1]: bbox[1] + bbox[3],
-                      bbox[0]: bbox[0] + bbox[2]]
+            # (y_min, x_min, y_max, x_max)
+            bbox = self.bboxes[i].astype(np.int32)
+            img = img[:, bbox[0]: bbox[2], bbox[1]: bbox[3]]
         label = self._data_labels[i]
         return img, label
diff --git a/chainercv/datasets/cub/cub_utils.py b/chainercv/datasets/cub/cub_utils.py
index 9d2ea2f9e8..7eb4178365 100644
--- a/chainercv/datasets/cub/cub_utils.py
+++ b/chainercv/datasets/cub/cub_utils.py
@@ -1,3 +1,4 @@
+import numpy as np
 import os

 import chainer
@@ -58,8 +59,13 @@ def __init__(self, data_dir='auto', mask_dir='auto', crop_bbox=True):
         bboxes_file = os.path.join(data_dir, 'bounding_boxes.txt')

         self.fns = [fn.strip().split()[1] for fn in open(images_file)]
-        bboxes = [bbox.split()[1:] for bbox in open(bboxes_file)]
-        self.bboxes = [[int(float(elem)) for elem in bbox] for bbox in bboxes]
+        y_min = np.array([float(bb.split()[2]) for bb in open(bboxes_file)])
+        x_min = np.array([float(bb.split()[1]) for bb in open(bboxes_file)])
+        height = np.array([float(bb.split()[4]) for bb in open(bboxes_file)])
+        width = np.array([float(bb.split()[3]) for bb in open(bboxes_file)])
+        self.bboxes = np.stack(
+            (y_min, x_min, y_min + height, x_min + width),
+            axis=1).astype(np.float32)

         self.crop_bbox = crop_bbox
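The (y_min, x_min, y_max, x_max) layout adopted above drives all of the indexing changes in this patch: in a CHW image, axis 1 is indexed by y and axis 2 by x. A minimal standalone sketch (the array sizes here are made up, independent of the CUB data):

    import numpy as np

    img = np.zeros((3, 6, 8), dtype=np.float32)       # CHW image
    bbox = np.array([1., 2., 5., 7.])                 # (y_min, x_min, y_max, x_max)

    y_min, x_min, y_max, x_max = bbox.astype(np.int32)
    crop = img[:, y_min:y_max, x_min:x_max]           # rows by y, columns by x
    assert crop.shape == (3, 4, 5)                    # (C, y_max - y_min, x_max - x_min)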
diff --git a/chainercv/datasets/voc/voc_detection_dataset.py b/chainercv/datasets/voc/voc_detection_dataset.py
index 778e12287e..dfc08ad2c4 100644
--- a/chainercv/datasets/voc/voc_detection_dataset.py
+++ b/chainercv/datasets/voc/voc_detection_dataset.py
@@ -28,7 +28,7 @@ class VOCDetectionDataset(chainer.dataset.DatasetMixin):
     The bounding boxes are packed into a two dimensional tensor of shape
     :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
     the image. The second axis represents attributes of the bounding box.
-    They are :obj:`(x_min, y_min, x_max, y_max)`, where the
+    They are :obj:`(y_min, x_min, y_max, x_max)`, where the
     four attributes are coordinates of the top left and the
     bottom right vertices.

@@ -119,7 +119,7 @@ def get_example(self, i):
             # subtract 1 to make pixel indexes 0-based
             bbox.append([
                 int(bndbox_anno.find(tag).text) - 1
-                for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
+                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
             name = obj.find('name').text.lower().strip()
             label.append(voc_utils.voc_detection_label_names.index(name))
         bbox = np.stack(bbox).astype(np.float32)
diff --git a/chainercv/evaluations/eval_detection_voc_ap.py b/chainercv/evaluations/eval_detection_voc_ap.py
index 88d17ec303..e734ce5ea3 100644
--- a/chainercv/evaluations/eval_detection_voc_ap.py
+++ b/chainercv/evaluations/eval_detection_voc_ap.py
@@ -26,7 +26,7 @@ def eval_detection_voc_ap(
             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
             where :math:`R` corresponds to the number of bounding boxes,
             which may vary among images.
-            The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+            The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
             of a bounding box.
         pred_labels (iterable of numpy.ndarray): An iterable of labels.
             Similar to :obj:`pred_bboxes`, its index corresponds to an
@@ -94,7 +94,7 @@ def calc_detection_voc_prec_rec(
             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
             where :math:`R` corresponds to the number of bounding boxes,
             which may vary among images.
-            The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+            The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
             of a bounding box.
         pred_labels (iterable of numpy.ndarray): An iterable of labels.
             Similar to :obj:`pred_bboxes`, its index corresponds to an
diff --git a/chainercv/evaluations/eval_pck.py b/chainercv/evaluations/eval_pck.py
index ac335f238b..6ee04d22ca 100644
--- a/chainercv/evaluations/eval_pck.py
+++ b/chainercv/evaluations/eval_pck.py
@@ -12,13 +12,13 @@ def eval_pck(pred, expected, alpha, L):
     :math:`0 < \\alpha < 1` is a variable we control.
     :math:`L` is determined differently depending on the context. For
     example, in evaluation of keypoint matching for CUB dataset,
-    :math:`L=\\sqrt{w^2 + h^2}` is used.
+    :math:`L=\\sqrt{h^2 + w^2}` is used.

     Args:
         pred (~numpy.ndarray): An array of shape :math:`(K, 2)`.
             :math:`K` is the number of keypoints to be evaluated. The
-            two elements of the second axis corresponds to :math:`x`
-            and :math:`y` coordinate of the keypoint.
+            two elements of the second axis correspond to :math:`y`
+            and :math:`x` coordinates of the keypoint.
         expected (~numpy.ndarray): Same kind of array as :obj:`pred`.
             This contains ground truth location of the keypoints that
             the user tries to predict.
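For reference, the PCK metric documented in eval_pck can be computed in a few lines. This is only a sketch with made-up keypoints and a made-up threshold, not the library implementation:

    import numpy as np

    pred = np.array([[10., 20.], [40., 42.]])      # (K, 2) in (y, x) order
    expected = np.array([[12., 21.], [60., 70.]])  # ground truth keypoints
    alpha, L = 0.1, 100.                           # e.g. L = sqrt(h**2 + w**2)

    dist = np.sqrt(((pred - expected) ** 2).sum(axis=1))
    pck = (dist <= alpha * L).mean()
    # pck == 0.5 here: the first keypoint lies within alpha * L of the
    # ground truth, the second does not.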
diff --git a/chainercv/extensions/detection/detection_vis_report.py b/chainercv/extensions/detection/detection_vis_report.py
index 216c0165b0..445bdcbd71 100644
--- a/chainercv/extensions/detection/detection_vis_report.py
+++ b/chainercv/extensions/detection/detection_vis_report.py
@@ -52,7 +52,7 @@ class DetectionVisReport(chainer.training.extension.Extension):
         :obj:`gt_bbox` and :obj:`pred_bbox` are float arrays
         of shape :math:`(R, 4)`, where :math:`R` is the number
         of bounding boxes in the image. Each bounding box is organized
-        by :obj:`(x_min, y_min, x_max, y_max)` in the second axis.
+        by :obj:`(y_min, x_min, y_max, x_max)` in the second axis.

         :obj:`gt_label` and :obj:`pred_label` are integer arrays
         of shape :math:`(R,)`. Each label indicates the class of
diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn.py b/chainercv/links/model/faster_rcnn/faster_rcnn.py
index 76f6869b53..eef3546b95 100644
--- a/chainercv/links/model/faster_rcnn/faster_rcnn.py
+++ b/chainercv/links/model/faster_rcnn/faster_rcnn.py
@@ -149,7 +149,7 @@ def __call__(self, x, scale=1., test=True):
             :math:`(R',)`.

         """
-        img_size = x.shape[2:][::-1]
+        img_size = x.shape[2:]

         h = self.extractor(x, test=test)
         rpn_locs, rpn_scores, rois, roi_indices, anchor =\
@@ -214,7 +214,7 @@ def prepare(self, img):
         if scale * max(H, W) > self.max_size:
             scale = self.max_size / max(H, W)

-        img = resize(img, (int(W * scale), int(H * scale)))
+        img = resize(img, (int(H * scale), int(W * scale)))

         img = (img - self.mean).astype(np.float32, copy=False)
         return img
@@ -259,7 +259,7 @@ def predict(self, imgs):
            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
                where :math:`R` is the number of bounding boxes in an image. \
                Each bounding box is organized by \
-               :obj:`(x_min, y_min, x_max, y_max)` \
+               :obj:`(y_min, x_min, y_max, x_max)` \
                in the second axis.
            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
                Each value indicates the class of the bounding box. \
@@ -305,9 +305,9 @@ def predict(self, imgs):
             cls_bbox = cls_bbox.reshape(-1, self.n_class * 4)
             # clip bounding box
             cls_bbox[:, slice(0, 4, 2)] = self.xp.clip(
-                cls_bbox[:, slice(0, 4, 2)], 0, W / scale)
+                cls_bbox[:, slice(0, 4, 2)], 0, H / scale)
             cls_bbox[:, slice(1, 4, 2)] = self.xp.clip(
-                cls_bbox[:, slice(1, 4, 2)], 0, H / scale)
+                cls_bbox[:, slice(1, 4, 2)], 0, W / scale)

             prob = F.softmax(roi_score).data
diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py b/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
index 041dcc6102..4027911e4f 100644
--- a/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
+++ b/chainercv/links/model/faster_rcnn/faster_rcnn_train_chain.py
@@ -101,7 +101,7 @@ def __call__(self, imgs, bboxes, labels, scale):
             raise ValueError('Currently only batch size 1 is supported.')

         _, _, H, W = imgs.shape
-        img_size = (W, H)
+        img_size = (H, W)

         features = self.faster_rcnn.extractor(imgs, test=not self.train)
         rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
diff --git a/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py b/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
index ac12e23381..762ab92357 100644
--- a/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
+++ b/chainercv/links/model/faster_rcnn/faster_rcnn_vgg.py
@@ -82,7 +82,7 @@ class FasterRCNNVGG16(FasterRCNN):
         'voc07': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/faster_rcnn_vgg16_voc07_2017_05_24.npz'
+            'download/0.0.3/faster_rcnn_vgg16_voc07_2017_06_06.npz'
         }
     }
     feat_stride = 16
@@ -227,10 +227,11 @@ def __call__(self, x, rois, roi_indices, test=True):

         """
         roi_indices = roi_indices.astype(np.float32)
-        rois = self.xp.concatenate(
+        indices_and_rois = self.xp.concatenate(
             (roi_indices[:, None], rois), axis=1)
-        pool = F.roi_pooling_2d(
-            x, rois, self.roi_size, self.roi_size, self.spatial_scale)
+        pool = _roi_pooling_2d_yx(
+            x, indices_and_rois, self.roi_size, self.roi_size,
+            self.spatial_scale)

         fc6 = _relu(self.fc6(pool))
         fc7 = _relu(self.fc7(fc6))
@@ -291,5 +292,12 @@ def __call__(self, x, test=True):
         return h


+def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
+    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
+    pool = F.roi_pooling_2d(
+        x, xy_indices_and_rois, outh, outw, spatial_scale)
+    return pool
+
+
 def _max_pooling_2d(x):
     return F.max_pooling_2d(x, ksize=2)
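The column permutation in _roi_pooling_2d_yx is the crux of the change above: each row of indices_and_rois is (batch_index, y_min, x_min, y_max, x_max), while F.roi_pooling_2d expects (batch_index, x_min, y_min, x_max, y_max), so the batch index stays in place and only the coordinate pairs swap. A small numpy check with a made-up RoI:

    import numpy as np

    indices_and_rois = np.array([[0., 10., 20., 50., 60.]], dtype=np.float32)
    # keep the batch index, swap y_min/x_min and y_max/x_max
    xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
    print(xy_indices_and_rois)  # [[ 0. 20. 10. 60. 50.]]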
diff --git a/chainercv/links/model/faster_rcnn/region_proposal_network.py b/chainercv/links/model/faster_rcnn/region_proposal_network.py
index 6ac193bc94..15f463619d 100644
--- a/chainercv/links/model/faster_rcnn/region_proposal_network.py
+++ b/chainercv/links/model/faster_rcnn/region_proposal_network.py
@@ -80,7 +80,7 @@ def __call__(self, x, img_size, scale=1., test=True):
         Args:
             x (~chainer.Variable): The features extracted from images.
                 Its shape is :math:`(N, C, H, W)`.
-            img_size (tuple of ints): A tuple :obj:`width, height`,
+            img_size (tuple of ints): A tuple :obj:`height, width`,
                 which contains image size after scaling.
             scale (float): The amount of scaling done to the input images
                 after reading them from files.
@@ -110,8 +110,8 @@ def __call__(self, x, img_size, scale=1., test=True):
         """
         n, _, hh, ww = x.shape
         anchor = _enumerate_shifted_anchor(
-            self.xp.array(self.anchor_base), self.feat_stride, ww, hh)
-        n_anchor = anchor.shape[0] // (ww * hh)
+            self.xp.array(self.anchor_base), self.feat_stride, hh, ww)
+        n_anchor = anchor.shape[0] // (hh * ww)
         h = F.relu(self.conv1(x))

         rpn_locs = self.loc(h)
@@ -139,7 +139,7 @@ def __call__(self, x, img_size, scale=1., test=True):
     return rpn_locs, rpn_scores, rois, roi_indices, anchor


-def _enumerate_shifted_anchor(anchor_base, feat_stride, width, height):
+def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
     # Enumerate all shifted anchors:
     #
     # add A anchors (1, A, 4) to
@@ -147,11 +147,11 @@ def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
     # shift anchors (K, A, 4)
     # reshape to (K*A, 4) shifted anchors
     xp = cuda.get_array_module(anchor_base)
-    shift_x = xp.arange(0, width * feat_stride, feat_stride)
     shift_y = xp.arange(0, height * feat_stride, feat_stride)
+    shift_x = xp.arange(0, width * feat_stride, feat_stride)
     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
-    shift = xp.stack((shift_x.ravel(), shift_y.ravel(),
-                      shift_x.ravel(), shift_y.ravel()), axis=1)
+    shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
+                      shift_y.ravel(), shift_x.ravel()), axis=1)

     A = anchor_base.shape[0]
     K = shift.shape[0]
diff --git a/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py b/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
index 31c36afa78..4fbf681b5e 100644
--- a/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
+++ b/chainercv/links/model/faster_rcnn/utils/anchor_target_creator.py
@@ -57,7 +57,7 @@ def __call__(self, bbox, anchor, img_size):
                 :math:`(R, 4)`.
             anchor (array): Coordinates of anchors. Its shape is
                 :math:`(S, 4)`.
-            img_size (tuple of ints): A tuple :obj:`W, H`, which
+            img_size (tuple of ints): A tuple :obj:`H, W`, which
                 is a tuple of height and width of an image.

         Returns:
@@ -74,10 +74,10 @@ def __call__(self, bbox, anchor, img_size):
         bbox = cuda.to_cpu(bbox)
         anchor = cuda.to_cpu(anchor)

-        img_W, img_H = img_size
+        img_H, img_W = img_size

         n_anchor = len(anchor)
-        inside_index = _get_inside_index(anchor, img_W, img_H)
+        inside_index = _get_inside_index(anchor, img_H, img_W)
         anchor = anchor[inside_index]
         argmax_ious, label = self._create_label(
             inside_index, anchor, bbox)
@@ -156,7 +156,7 @@ def _unmap(data, count, index, fill=0):
     return ret


-def _get_inside_index(anchor, W, H):
+def _get_inside_index(anchor, H, W):
     # Calc indices of anchors which are located completely inside of the image
     # whose size is specified.
     xp = cuda.get_array_module(anchor)
@@ -164,7 +164,7 @@ def _get_inside_index(anchor, W, H):
     index_inside = xp.where(
         (anchor[:, 0] >= 0) &
         (anchor[:, 1] >= 0) &
-        (anchor[:, 2] <= W) &  # width
-        (anchor[:, 3] <= H)  # height
+        (anchor[:, 2] <= H) &
+        (anchor[:, 3] <= W)
     )[0]
     return index_inside
diff --git a/chainercv/links/model/faster_rcnn/utils/bbox2loc.py b/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
index de968dc3dc..6270f6d122 100644
--- a/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
+++ b/chainercv/links/model/faster_rcnn/utils/bbox2loc.py
@@ -6,15 +6,16 @@ def bbox2loc(src_bbox, dst_bbox):

     Given bounding boxes, this function computes offsets and scales
     to match the source bounding boxes to the target bounding boxes.
-    Mathematcially, given a bounding box whose center is :math:`p_x, p_y` and
-    size :math:`p_w, p_h` and the target bounding box whose center is
-    :math:`g_x, g_y` and size :math:`g_w, g_h`, the offsets and scales
-    :math:`t_x, t_y, t_w, t_h` can be computed by the following formulas.
+    Mathematically, given a bounding box whose center is
+    :math:`(y, x) = p_y, p_x` and
+    size :math:`p_h, p_w` and the target bounding box whose center is
+    :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales
+    :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas.

-    * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
     * :math:`t_y = \\frac{(g_y - p_y)} {p_h}`
-    * :math:`t_w = \\log(\\frac{g_w} {p_w})`
+    * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
     * :math:`t_h = \\log(\\frac{g_h} {p_h})`
+    * :math:`t_w = \\log(\\frac{g_w} {p_w})`

     The output is the same type as the inputs.
     The encoding formulas are used in works such as R-CNN [#]_.
@@ -26,35 +27,35 @@ def bbox2loc(src_bbox, dst_bbox):
     Args:
         src_bbox (array): An image coordinate array whose shape is
             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
-            These coordinates are used to compute :math:`p_x, p_y, p_w, p_h`.
+            These coordinates are used to compute :math:`p_y, p_x, p_h, p_w`.
         dst_bbox (array): An image coordinate array whose shape is
             :math:`(R, 4)`.
-            These coordinates are used to compute :math:`g_x, g_y, g_w, g_h`.
+            These coordinates are used to compute :math:`g_y, g_x, g_h, g_w`.

     Returns:
         array:
         Bounding box offsets and scales from :obj:`src_bbox` \
        to :obj:`dst_bbox`. \
        This has shape :math:`(R, 4)`.
-        The second axis contains four values :math:`t_x, t_y, t_w, t_h`.
+        The second axis contains four values :math:`t_y, t_x, t_h, t_w`.

     """
     xp = cuda.get_array_module(src_bbox)

-    width = src_bbox[:, 2] - src_bbox[:, 0]
-    height = src_bbox[:, 3] - src_bbox[:, 1]
-    ctr_x = src_bbox[:, 0] + 0.5 * width
-    ctr_y = src_bbox[:, 1] + 0.5 * height
+    height = src_bbox[:, 2] - src_bbox[:, 0]
+    width = src_bbox[:, 3] - src_bbox[:, 1]
+    ctr_y = src_bbox[:, 0] + 0.5 * height
+    ctr_x = src_bbox[:, 1] + 0.5 * width

-    base_width = dst_bbox[:, 2] - dst_bbox[:, 0]
-    base_height = dst_bbox[:, 3] - dst_bbox[:, 1]
-    base_ctr_x = dst_bbox[:, 0] + 0.5 * base_width
-    base_ctr_y = dst_bbox[:, 1] + 0.5 * base_height
+    base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
+    base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
+    base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
+    base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width

-    dx = (base_ctr_x - ctr_x) / width
     dy = (base_ctr_y - ctr_y) / height
-    dw = xp.log(base_width / width)
+    dx = (base_ctr_x - ctr_x) / width
     dh = xp.log(base_height / height)
+    dw = xp.log(base_width / width)

-    loc = xp.vstack((dx, dy, dw, dh)).transpose()
+    loc = xp.vstack((dy, dx, dh, dw)).transpose()
     return loc
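A hand-computed check of the encoding formulas above, with one made-up pair of boxes in (y_min, x_min, y_max, x_max) order:

    import numpy as np

    src = np.array([10., 20., 50., 60.])  # p_h = p_w = 40, center (30, 40)
    dst = np.array([20., 30., 60., 70.])  # g_h = g_w = 40, center (40, 50)

    p_h, p_w = src[2] - src[0], src[3] - src[1]
    p_y, p_x = src[0] + 0.5 * p_h, src[1] + 0.5 * p_w
    g_h, g_w = dst[2] - dst[0], dst[3] - dst[1]
    g_y, g_x = dst[0] + 0.5 * g_h, dst[1] + 0.5 * g_w

    t_y, t_x = (g_y - p_y) / p_h, (g_x - p_x) / p_w  # 0.25, 0.25
    t_h, t_w = np.log(g_h / p_h), np.log(g_w / p_w)  # 0.0, 0.0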
diff --git a/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py b/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py
index 666b3e7e0a..c29b3ef2c9 100644
--- a/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py
+++ b/chainercv/links/model/faster_rcnn/utils/generate_anchor_base.py
@@ -35,23 +35,23 @@ def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
         ~numpy.ndarray:
         An array of shape :math:`(R, 4)`.
         Each element is a set of coordinates of a bounding box.
-        The second axis corresponds to :obj:`x_min, y_min, x_max, y_max`
+        The second axis corresponds to :obj:`y_min, x_min, y_max, x_max`
         of a bounding box.

     """
-    px = base_size / 2.
     py = base_size / 2.
+    px = base_size / 2.

     anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
                            dtype=np.float32)
     for i in six.moves.range(len(ratios)):
         for j in six.moves.range(len(anchor_scales)):
-            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
             h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
+            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])

             index = i * len(anchor_scales) + j
-            anchor_base[index, 0] = px - w / 2.
-            anchor_base[index, 1] = py - h / 2.
-            anchor_base[index, 2] = px + w / 2.
-            anchor_base[index, 3] = py + h / 2.
+            anchor_base[index, 0] = py - h / 2.
+            anchor_base[index, 1] = px - w / 2.
+            anchor_base[index, 2] = py + h / 2.
+            anchor_base[index, 3] = px + w / 2.
     return anchor_base
diff --git a/chainercv/links/model/faster_rcnn/utils/loc2bbox.py b/chainercv/links/model/faster_rcnn/utils/loc2bbox.py
index 96bfcbc7ac..00d111ac75 100644
--- a/chainercv/links/model/faster_rcnn/utils/loc2bbox.py
+++ b/chainercv/links/model/faster_rcnn/utils/loc2bbox.py
@@ -8,16 +8,16 @@ def loc2bbox(src_bbox, loc):
     :meth:`bbox2loc`, this function decodes the representation to
     coordinates in 2D image coordinates.

-    Given scales and offsets :math:`t_x, t_y, t_w, t_h` and a bounding
-    box whose center is :math:`p_x, p_y` and size :math:`p_w, p_h`,
-    the decoded bounding box's center :math:`\\hat{g}_x`, :math:`\\hat{g}_y`
-    and size :math:`\\hat{g}_w`, :math:`\\hat{g}_h` are calculated
+    Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding
+    box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`,
+    the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x`
+    and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated
     by the following formulas.

-    * :math:`\\hat{g}_x = p_w t_x + p_x`
     * :math:`\\hat{g}_y = p_h t_y + p_y`
-    * :math:`\\hat{g}_w = p_w \\exp(t_w)`
+    * :math:`\\hat{g}_x = p_w t_x + p_x`
     * :math:`\\hat{g}_h = p_h \\exp(t_h)`
+    * :math:`\\hat{g}_w = p_w \\exp(t_w)`

     The decoding formulas are used in works such as R-CNN [#]_.
@@ -30,16 +30,16 @@ def loc2bbox(src_bbox, loc):
     Args:
         src_bbox (array): Coordinates of bounding boxes.
             Its shape is :math:`(R, 4)`. These coordinates are used to
-            compute :math:`p_x, p_y, p_w, p_h`.
+            compute :math:`p_y, p_x, p_h, p_w`.
         loc (array): An array with offsets and scales.
             The shapes of :obj:`src_bbox` and :obj:`loc` should be the same.
-            This contains values :math:`t_x, t_y, t_w, t_h`.
+            This contains values :math:`t_y, t_x, t_h, t_w`.

     Returns:
         array:
         Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \
        The second axis contains four values \
-        :math:`\\hat{g}_x, \\hat{g}_y, \\hat{g}_w, \\hat{g}_h`.
+        :math:`\\hat{g}_y, \\hat{g}_x, \\hat{g}_h, \\hat{g}_w`.
""" xp = cuda.get_array_module(src_bbox) @@ -49,25 +49,25 @@ def loc2bbox(src_bbox, loc): src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) - src_width = src_bbox[:, 2] - src_bbox[:, 0] - src_height = src_bbox[:, 3] - src_bbox[:, 1] - src_ctr_x = src_bbox[:, 0] + 0.5 * src_width - src_ctr_y = src_bbox[:, 1] + 0.5 * src_height + src_height = src_bbox[:, 2] - src_bbox[:, 0] + src_width = src_bbox[:, 3] - src_bbox[:, 1] + src_ctr_y = src_bbox[:, 0] + 0.5 * src_height + src_ctr_x = src_bbox[:, 1] + 0.5 * src_width - dx = loc[:, 0::4] - dy = loc[:, 1::4] - dw = loc[:, 2::4] - dh = loc[:, 3::4] + dy = loc[:, 0::4] + dx = loc[:, 1::4] + dh = loc[:, 2::4] + dw = loc[:, 3::4] - ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis] ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis] - w = xp.exp(dw) * src_width[:, xp.newaxis] + ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis] h = xp.exp(dh) * src_height[:, xp.newaxis] + w = xp.exp(dw) * src_width[:, xp.newaxis] dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype) - dst_bbox[:, 0::4] = ctr_x - 0.5 * w - dst_bbox[:, 1::4] = ctr_y - 0.5 * h - dst_bbox[:, 2::4] = ctr_x + 0.5 * w - dst_bbox[:, 3::4] = ctr_y + 0.5 * h + dst_bbox[:, 0::4] = ctr_y - 0.5 * h + dst_bbox[:, 1::4] = ctr_x - 0.5 * w + dst_bbox[:, 2::4] = ctr_y + 0.5 * h + dst_bbox[:, 3::4] = ctr_x + 0.5 * w return dst_bbox diff --git a/chainercv/links/model/faster_rcnn/utils/proposal_creator.py b/chainercv/links/model/faster_rcnn/utils/proposal_creator.py index 97c054a7c5..667e35a4db 100644 --- a/chainercv/links/model/faster_rcnn/utils/proposal_creator.py +++ b/chainercv/links/model/faster_rcnn/utils/proposal_creator.py @@ -82,7 +82,7 @@ def __call__(self, loc, score, Its shape is :math:`(R,)`. anchor (array): Coordinates of anchors. Its shape is :math:`(R, 4)`. - img_size (tuple of ints): A tuple :obj:`width, height`, + img_size (tuple of ints): A tuple :obj:`height, width`, which contains image size after scaling. scale (float): The scaling factor used to scale an image after reading it from a file. @@ -118,9 +118,9 @@ def __call__(self, loc, score, # Remove predicted boxes with either height or width < threshold. 
         min_size = self.min_size * scale
-        ws = roi[:, 2] - roi[:, 0]
-        hs = roi[:, 3] - roi[:, 1]
-        keep = np.where((ws >= min_size) & (hs >= min_size))[0]
+        hs = roi[:, 2] - roi[:, 0]
+        ws = roi[:, 3] - roi[:, 1]
+        keep = np.where((hs >= min_size) & (ws >= min_size))[0]
         roi = roi[keep, :]
         score = score[keep]
diff --git a/chainercv/links/model/segnet/segnet_basic.py b/chainercv/links/model/segnet/segnet_basic.py
index d975b8ab49..4bd4857fdb 100644
--- a/chainercv/links/model/segnet/segnet_basic.py
+++ b/chainercv/links/model/segnet/segnet_basic.py
@@ -177,7 +177,7 @@ def predict(self, imgs):
             score = chainer.cuda.to_cpu(score)
             if score.shape != (C, H, W):
                 dtype = score.dtype
-                score = resize(score, (W, H)).astype(dtype)
+                score = resize(score, (H, W)).astype(dtype)

             label = np.argmax(score, axis=0)
             labels.append(label)
diff --git a/chainercv/links/model/ssd/ssd.py b/chainercv/links/model/ssd/ssd.py
index f00689c680..c48b915c00 100644
--- a/chainercv/links/model/ssd/ssd.py
+++ b/chainercv/links/model/ssd/ssd.py
@@ -76,25 +76,25 @@ def __init__(
         super(SSD, self).__init__(extractor=extractor, multibox=multibox)

-        # the format of default_bbox is (center_x, center_y, width, height)
+        # the format of default_bbox is (center_y, center_x, height, width)
         self._default_bbox = list()
         for k, grid in enumerate(extractor.grids):
             for v, u in itertools.product(range(grid), repeat=2):
-                cx = (u + 0.5) * steps[k]
                 cy = (v + 0.5) * steps[k]
+                cx = (u + 0.5) * steps[k]

                 s = sizes[k]
-                self._default_bbox.append((cx, cy, s, s))
+                self._default_bbox.append((cy, cx, s, s))

                 s = np.sqrt(sizes[k] * sizes[k + 1])
-                self._default_bbox.append((cx, cy, s, s))
+                self._default_bbox.append((cy, cx, s, s))

                 s = sizes[k]
                 for ar in multibox.aspect_ratios[k]:
                     self._default_bbox.append(
-                        (cx, cy, s * np.sqrt(ar), s / np.sqrt(ar)))
+                        (cy, cx, s / np.sqrt(ar), s * np.sqrt(ar)))
                     self._default_bbox.append(
-                        (cx, cy, s / np.sqrt(ar), s * np.sqrt(ar)))
+                        (cy, cx, s * np.sqrt(ar), s / np.sqrt(ar)))

         self._default_bbox = np.stack(self._default_bbox)

     @property
@@ -141,13 +141,13 @@ def __call__(self, x):
     def _decode(self, loc, conf):
         xp = self.xp
-        # the format of bbox is (center_x, center_y, width, height)
+        # the format of bbox is (center_y, center_x, height, width)
         bboxes = xp.dstack((
             self._default_bbox[:, :2] +
             loc[:, :, :2] * self.variance[0] * self._default_bbox[:, 2:],
             self._default_bbox[:, 2:] *
             xp.exp(loc[:, :, 2:] * self.variance[1])))
-        # convert the format of bbox to (x_min, y_min, x_max, y_max)
+        # convert the format of bbox to (y_min, x_min, y_max, x_max)
         bboxes[:, :, :2] -= bboxes[:, :, 2:] / 2
         bboxes[:, :, 2:] += bboxes[:, :, :2]
         scores = xp.exp(conf)
@@ -236,7 +236,7 @@ def predict(self, imgs):
            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
                where :math:`R` is the number of bounding boxes in an image. \
                Each bounding box is organized by \
-               :obj:`(x_min, y_min, x_max, y_max)` \
+               :obj:`(y_min, x_min, y_max, x_max)` \
                in the second axis.
            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
                Each value indicates the class of the bounding box. \
@@ -253,7 +253,7 @@ def predict(self, imgs):
             _, H, W = img.shape
             img = self._prepare(img)
             x.append(self.xp.array(img))
-            sizes.append((W, H))
+            sizes.append((H, W))

         x = chainer.Variable(self.xp.stack(x), volatile=chainer.flag.ON)
         loc, conf = self(x)
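The last two in-place lines of _decode convert boxes from (center_y, center_x, height, width) form to corner form. A numpy check with one made-up box:

    import numpy as np

    bbox = np.array([[30., 40., 20., 10.]])  # (center_y, center_x, height, width)
    bbox[:, :2] -= bbox[:, 2:] / 2  # (y_min, x_min) = center - size / 2
    bbox[:, 2:] += bbox[:, :2]      # (y_max, x_max) = size + (y_min, x_min)
    print(bbox)  # [[20. 35. 40. 45.]]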
diff --git a/chainercv/links/model/ssd/ssd_vgg16.py b/chainercv/links/model/ssd/ssd_vgg16.py
index 5ed012b8a7..48e1f90812 100644
--- a/chainercv/links/model/ssd/ssd_vgg16.py
+++ b/chainercv/links/model/ssd/ssd_vgg16.py
@@ -282,7 +282,7 @@ class SSD300(SSD):
         'voc0712': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/ssd300_voc0712_2017_05_24.npz'
+            'download/0.0.3/ssd300_voc0712_2017_06_06.npz'
         }
     }

@@ -338,7 +338,7 @@ class SSD512(SSD):
         'voc0712': {
             'n_fg_class': 20,
             'url': 'https://github.com/yuyu2172/share-weights/releases/'
-            'download/0.0.2/ssd512_voc0712_2017_05_24.npz'
+            'download/0.0.3/ssd512_voc0712_2017_06_06.npz'
         }
     }
diff --git a/chainercv/transforms/bbox/flip_bbox.py b/chainercv/transforms/bbox/flip_bbox.py
index 2968120acf..668c5e96a7 100644
--- a/chainercv/transforms/bbox/flip_bbox.py
+++ b/chainercv/transforms/bbox/flip_bbox.py
@@ -1,38 +1,38 @@
-def flip_bbox(bbox, size, x_flip=False, y_flip=False):
+def flip_bbox(bbox, size, y_flip=False, x_flip=False):
     """Flip bounding boxes accordingly.

     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

     Args:
         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
             :math:`R` is the number of bounding boxes.
-        size (tuple): A tuple of length 2. The width and the height
+        size (tuple): A tuple of length 2. The height and the width
             of the image.
-        x_flip (bool): Flip bounding box according to a horizontal flip of
-            an image.
         y_flip (bool): Flip bounding box according to a vertical flip of
             an image.
+        x_flip (bool): Flip bounding box according to a horizontal flip of
+            an image.

     Returns:
         ~numpy.ndarray:
         Bounding boxes flipped according to the given flips.

     """
-    W, H = size
+    H, W = size
     bbox = bbox.copy()
-    if x_flip:
-        x_max = W - 1 - bbox[:, 0]
-        x_min = W - 1 - bbox[:, 2]
-        bbox[:, 0] = x_min
-        bbox[:, 2] = x_max
     if y_flip:
-        y_max = H - 1 - bbox[:, 1]
-        y_min = H - 1 - bbox[:, 3]
-        bbox[:, 1] = y_min
-        bbox[:, 3] = y_max
+        y_max = H - 1 - bbox[:, 0]
+        y_min = H - 1 - bbox[:, 2]
+        bbox[:, 0] = y_min
+        bbox[:, 2] = y_max
+    if x_flip:
+        x_max = W - 1 - bbox[:, 1]
+        x_min = W - 1 - bbox[:, 3]
+        bbox[:, 1] = x_min
+        bbox[:, 3] = x_max
     return bbox
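A quick check of the new y_flip branch above, with one made-up box on a 100x80 image (this assumes the post-patch signature and the chainercv.transforms re-export):

    import numpy as np
    from chainercv.transforms import flip_bbox

    bbox = np.array([[10., 20., 30., 40.]])  # (y_min, x_min, y_max, x_max)
    flipped = flip_bbox(bbox, size=(100, 80), y_flip=True)
    # rows are renumbered as H - 1 - y, so the box becomes (69, 20, 89, 40);
    # the x coordinates are untouched
    print(flipped)  # [[69. 20. 89. 40.]]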
diff --git a/chainercv/transforms/bbox/resize_bbox.py b/chainercv/transforms/bbox/resize_bbox.py
index 59c1293858..d71de4de0a 100644
--- a/chainercv/transforms/bbox/resize_bbox.py
+++ b/chainercv/transforms/bbox/resize_bbox.py
@@ -4,16 +4,16 @@ def resize_bbox(bbox, in_size, out_size):
     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

     Args:
         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
             :math:`R` is the number of bounding boxes.
-        in_size (tuple): A tuple of length 2. The width and the height
+        in_size (tuple): A tuple of length 2. The height and the width
             of the image before being resized.
-        out_size (tuple): A tuple of length 2. The width and the height
+        out_size (tuple): A tuple of length 2. The height and the width
             of the image after being resized.

     Returns:
@@ -22,10 +22,10 @@ def resize_bbox(bbox, in_size, out_size):

     """
     bbox = bbox.copy()
-    x_scale = float(out_size[0]) / in_size[0]
-    y_scale = float(out_size[1]) / in_size[1]
-    bbox[:, 0] = x_scale * bbox[:, 0]
-    bbox[:, 2] = x_scale * bbox[:, 2]
-    bbox[:, 1] = y_scale * bbox[:, 1]
-    bbox[:, 3] = y_scale * bbox[:, 3]
+    y_scale = float(out_size[0]) / in_size[0]
+    x_scale = float(out_size[1]) / in_size[1]
+    bbox[:, 0] = y_scale * bbox[:, 0]
+    bbox[:, 2] = y_scale * bbox[:, 2]
+    bbox[:, 1] = x_scale * bbox[:, 1]
+    bbox[:, 3] = x_scale * bbox[:, 3]
     return bbox
diff --git a/chainercv/transforms/bbox/translate_bbox.py b/chainercv/transforms/bbox/translate_bbox.py
index 1afaf64958..c6410a8915 100644
--- a/chainercv/transforms/bbox/translate_bbox.py
+++ b/chainercv/transforms/bbox/translate_bbox.py
@@ -1,22 +1,23 @@
-def translate_bbox(bbox, x_offset=0, y_offset=0):
+def translate_bbox(bbox, y_offset=0, x_offset=0):
     """Translate bounding boxes.

     This method is mainly used together with image transforms, such as
     padding and cropping, which translates the top left point of the
     image from
-    coordinate :math:`(0, 0)` to coordinate :math:`(x\_offset, y\_offset)`.
+    coordinate :math:`(0, 0)` to coordinate
+    :math:`(y, x) = (y\_offset, x\_offset)`.

     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

     Args:
         bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
-        x_offset (int or float): The offset along x axis.
         y_offset (int or float): The offset along y axis.
+        x_offset (int or float): The offset along x axis.

     Returns:
         ~numpy.ndarray:
@@ -25,7 +26,7 @@ def translate_bbox(bbox, x_offset=0, y_offset=0):

     """
     out_bbox = bbox.copy()
-    out_bbox[:, :2] += (x_offset, y_offset)
-    out_bbox[:, 2:] += (x_offset, y_offset)
+    out_bbox[:, :2] += (y_offset, x_offset)
+    out_bbox[:, 2:] += (y_offset, x_offset)

     return out_bbox
diff --git a/chainercv/transforms/image/center_crop.py b/chainercv/transforms/image/center_crop.py
index d584b1f7ca..2301993ef8 100644
--- a/chainercv/transforms/image/center_crop.py
+++ b/chainercv/transforms/image/center_crop.py
@@ -8,7 +8,7 @@ def center_crop(img, size, return_param=False, copy=False):
         img (~numpy.ndarray): An image array to be cropped. This is in
             CHW format.
         size (tuple): The size of output image after cropping.
-            This value is :math:`(width, height)`.
+            This value is :math:`(height, width)`.
         return_param (bool): If :obj:`True`, this function returns
             information of slices.
         copy (bool): If :obj:`False`, a view of :obj:`img` is returned.
@@ -26,9 +26,9 @@ def center_crop(img, size, return_param=False, copy=False):
         contents are listed below with key, value-type and the description
         of the value.
-        * **x_slice** (*slice*): A slice used to crop the input image.\
-            The relation below holds together with :obj:`y_slice`.
-        * **y_slice** (*slice*): Similar to :obj:`x_slice`.
+        * **y_slice** (*slice*): A slice used to crop the input image.\
+            The relation below holds together with :obj:`x_slice`.
+        * **x_slice** (*slice*): Similar to :obj:`y_slice`.

     .. code::

         out_img = img[:, y_slice, x_slice]

     """
     _, H, W = img.shape
-    oW, oH = size
-    if oW > W or oH > H:
+    oH, oW = size
+    if oH > H or oW > W:
         raise ValueError('shape of image needs to be larger than size')

-    x_offset = int(round((W - oW) / 2.))
     y_offset = int(round((H - oH) / 2.))
+    x_offset = int(round((W - oW) / 2.))

-    x_slice = slice(x_offset, x_offset + oW)
     y_slice = slice(y_offset, y_offset + oH)
+    x_slice = slice(x_offset, x_offset + oW)

     img = img[:, y_slice, x_slice]

@@ -52,6 +52,6 @@ def center_crop(img, size, return_param=False, copy=False):
         img = img.copy()

     if return_param:
-        return img, {'x_slice': x_slice, 'y_slice': y_slice}
+        return img, {'y_slice': y_slice, 'x_slice': x_slice}
     else:
         return img
diff --git a/chainercv/transforms/image/flip.py b/chainercv/transforms/image/flip.py
index e0cab4b374..a377999964 100644
--- a/chainercv/transforms/image/flip.py
+++ b/chainercv/transforms/image/flip.py
@@ -1,20 +1,20 @@
-def flip(img, x_flip=False, y_flip=False, copy=False):
+def flip(img, y_flip=False, x_flip=False, copy=False):
     """Flip an image in vertical or horizontal direction as specified.

     Args:
         img (~numpy.ndarray): An array that gets flipped. This is in
             CHW format.
-        x_flip (bool): Flip in horizontal direction.
         y_flip (bool): Flip in vertical direction.
+        x_flip (bool): Flip in horizontal direction.
         copy (bool): If False, a view of :obj:`img` will be returned.

     Returns:
         Transformed :obj:`img` in CHW format.
     """
-    if x_flip:
-        img = img[:, :, ::-1]
     if y_flip:
         img = img[:, ::-1, :]
+    if x_flip:
+        img = img[:, :, ::-1]

     if copy:
         img = img.copy()
diff --git a/chainercv/transforms/image/random_crop.py b/chainercv/transforms/image/random_crop.py
index 9aebce8581..3a79c202e7 100644
--- a/chainercv/transforms/image/random_crop.py
+++ b/chainercv/transforms/image/random_crop.py
@@ -12,7 +12,7 @@ def random_crop(img, size, return_param=False, copy=False):
         img (~numpy.ndarray): An image array to be cropped. This is in
             CHW format.
         size (tuple): The size of output image after cropping.
-            This value is :math:`(width, height)`.
+            This value is :math:`(height, width)`.
         return_param (bool): If :obj:`True`, this function returns
             information of slices.
         copy (bool): If :obj:`False`, a view of :obj:`img` is returned.
@@ -30,24 +30,16 @@ def random_crop(img, size, return_param=False, copy=False):
         contents are listed below with key, value-type and the description
         of the value.

-        * **x_slice** (*slice*): A slice used to crop the input image.\
-            The relation below holds together with :obj:`y_slice`.
-        * **y_slice** (*slice*): Similar to :obj:`x_slice`.
+        * **y_slice** (*slice*): A slice used to crop the input image.\
+            The relation below holds together with :obj:`x_slice`.
+        * **x_slice** (*slice*): Similar to :obj:`y_slice`.

     .. code::
         out_img = img[:, y_slice, x_slice]

     """
-    W, H = size
-
-    if img.shape[2] == W:
-        x_offset = 0
-    elif img.shape[2] > W:
-        x_offset = random.choice(six.moves.range(img.shape[2] - W))
-    else:
-        raise ValueError('shape of image needs to be larger than output shape')
-    x_slice = slice(x_offset, x_offset + W)
+    H, W = size

     if img.shape[1] == H:
         y_offset = 0
@@ -57,12 +49,20 @@ def random_crop(img, size, return_param=False, copy=False):
         raise ValueError('shape of image needs to be larger than output shape')
     y_slice = slice(y_offset, y_offset + H)

+    if img.shape[2] == W:
+        x_offset = 0
+    elif img.shape[2] > W:
+        x_offset = random.choice(six.moves.range(img.shape[2] - W))
+    else:
+        raise ValueError('shape of image needs to be larger than output shape')
+    x_slice = slice(x_offset, x_offset + W)
+
     img = img[:, y_slice, x_slice]

     if copy:
         img = img.copy()

     if return_param:
-        return img, {'x_slice': x_slice, 'y_slice': y_slice}
+        return img, {'y_slice': y_slice, 'x_slice': x_slice}
     else:
         return img
diff --git a/chainercv/transforms/image/random_expand.py b/chainercv/transforms/image/random_expand.py
index 55160dfb9e..a05dc5735e 100644
--- a/chainercv/transforms/image/random_expand.py
+++ b/chainercv/transforms/image/random_expand.py
@@ -6,7 +6,7 @@ def random_expand(img, max_ratio=4, fill=0, return_param=False):
     """Expand an image randomly.

     This method randomly places the input image on a larger canvas. The size of
-    the canvas is :math:`(rW, rH)`, where :math:`(W, H)` is the size of the
+    the canvas is :math:`(rH, rW)`, where :math:`(H, W)` is the size of the
     input image and :math:`r` is a random ratio drawn from
     :math:`[1, max\_ratio]`. The canvas is filled by a value :obj:`fill`
     except for the region where the original image is placed.
@@ -39,16 +39,16 @@ def random_expand(img, max_ratio=4, fill=0, return_param=False):
         of the value.

         * **ratio** (*float*): The sampled value used to make the canvas.
-        * **x_offset** (*int*): The x coordinate of the top left corner\
-            of the image after placing on the canvas.
         * **y_offset** (*int*): The y coordinate of the top left corner of\
             the image after placing on the canvas.
+        * **x_offset** (*int*): The x coordinate of the top left corner\
+            of the image after placing on the canvas.

     """
     if max_ratio <= 1:
         if return_param:
-            return img, {'ratio': 1, 'x_offset': 0, 'y_offset': 0}
+            return img, {'ratio': 1, 'y_offset': 0, 'x_offset': 0}
         else:
             return img

@@ -57,15 +57,15 @@ def random_expand(img, max_ratio=4, fill=0, return_param=False):
     ratio = random.uniform(1, max_ratio)
     out_H, out_W = int(H * ratio), int(W * ratio)

-    x_offset = random.randint(0, out_W - W)
     y_offset = random.randint(0, out_H - H)
+    x_offset = random.randint(0, out_W - W)

     out_img = np.empty((C, out_H, out_W), dtype=img.dtype)
     out_img[:] = np.array(fill).reshape(-1, 1, 1)
     out_img[:, y_offset:y_offset + H, x_offset:x_offset + W] = img

     if return_param:
-        param = {'ratio': ratio, 'x_offset': x_offset, 'y_offset': y_offset}
+        param = {'ratio': ratio, 'y_offset': y_offset, 'x_offset': x_offset}
         return out_img, param
     else:
         return out_img
diff --git a/chainercv/transforms/image/random_flip.py b/chainercv/transforms/image/random_flip.py
index ce14051302..ff982a7262 100644
--- a/chainercv/transforms/image/random_flip.py
+++ b/chainercv/transforms/image/random_flip.py
@@ -1,15 +1,15 @@
 import random


-def random_flip(img, x_random=False, y_random=False,
+def random_flip(img, y_random=False, x_random=False,
                 return_param=False, copy=False):
     """Randomly flip an image in vertical or horizontal direction.
     Args:
         img (~numpy.ndarray): An array that gets flipped. This is in
             CHW format.
-        x_random (bool): Randomly flip in horizontal direction.
         y_random (bool): Randomly flip in vertical direction.
+        x_random (bool): Randomly flip in horizontal direction.
         return_param (bool): Returns information of flip.
         copy (bool): If False, a view of :obj:`img` will be returned.

@@ -25,27 +25,27 @@ def random_flip(img, x_random=False, y_random=False,
         contents are listed below with key, value-type and the description
         of the value.

-        * **x_flip** (*bool*): Whether the image was flipped in the\
-            horizontal direction or not.
         * **y_flip** (*bool*): Whether the image was flipped in the\
             vertical direction or not.
+        * **x_flip** (*bool*): Whether the image was flipped in the\
+            horizontal direction or not.

     """
-    x_flip, y_flip = False, False
-    if x_random:
-        x_flip = random.choice([True, False])
+    y_flip, x_flip = False, False
     if y_random:
         y_flip = random.choice([True, False])
+    if x_random:
+        x_flip = random.choice([True, False])

-    if x_flip:
-        img = img[:, :, ::-1]
     if y_flip:
         img = img[:, ::-1, :]
+    if x_flip:
+        img = img[:, :, ::-1]

     if copy:
         img = img.copy()

     if return_param:
-        return img, {'x_flip': x_flip, 'y_flip': y_flip}
+        return img, {'y_flip': y_flip, 'x_flip': x_flip}
     else:
         return img
diff --git a/chainercv/transforms/image/resize.py b/chainercv/transforms/image/resize.py
index e29cc5c8b1..e056f1437e 100644
--- a/chainercv/transforms/image/resize.py
+++ b/chainercv/transforms/image/resize.py
@@ -16,7 +16,8 @@ def _resize(img, size, interpolation):
             cv_interpolation = cv2.INTER_CUBIC
         elif interpolation == PIL.Image.LANCZOS:
             cv_interpolation = cv2.INTER_LANCZOS4
-        img = cv2.resize(img, dsize=size, interpolation=cv_interpolation)
+        H, W = size
+        img = cv2.resize(img, dsize=(W, H), interpolation=cv_interpolation)

         # If input is a grayscale image, cv2 returns a two-dimensional array.
         if len(img.shape) == 2:
@@ -32,11 +33,11 @@ def _resize(img, size, interpolation):

     def _resize(img, size, interpolation):
         C = img.shape[0]
-        W, H = size
+        H, W = size
         out = np.empty((C, H, W), dtype=img.dtype)
         for ch, out_ch in zip(img, out):
             ch = PIL.Image.fromarray(ch, mode='F')
-            out_ch[:] = ch.resize(size, resample=interpolation)
+            out_ch[:] = ch.resize((W, H), resample=interpolation)
         return out

@@ -57,7 +58,7 @@ def resize(img, size, interpolation=PIL.Image.BILINEAR):
         img (~numpy.ndarray): An array to be transformed.
             This is in CHW format and the type should be :obj:`numpy.float32`.
         size (tuple): This is a tuple of length 2. Its elements are
-            ordered as (width, height).
+            ordered as (height, width).
         interpolation (int): Determines sampling strategy. This is one of
             :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BILINEAR`,
             :obj:`PIL.Image.BICUBIC`, :obj:`PIL.Image.LANCZOS`.
diff --git a/chainercv/transforms/image/resize_contain.py b/chainercv/transforms/image/resize_contain.py
index c124342236..600f9cd041 100644
--- a/chainercv/transforms/image/resize_contain.py
+++ b/chainercv/transforms/image/resize_contain.py
@@ -6,8 +6,8 @@ def resize_contain(img, size, fill=0, return_param=False):
     """Resize the image to fit in the given area while keeping aspect ratio.

-    If both the width and the height in :obj:`size` are larger than the
-    width and the height of the :obj:`img`, the :obj:`img` is placed on
+    If both the height and the width in :obj:`size` are larger than
+    the height and the width of the :obj:`img`, the :obj:`img` is placed on
     the center with an appropriate padding to match :obj:`size`.
     Otherwise, the input image is scaled to fit in a canvas whose size
@@ -17,7 +17,7 @@ def resize_contain(img, size, fill=0, return_param=False):
         img (~numpy.ndarray): An array to be transformed. This is in
             CHW format.
         size (tuple of two ints): A tuple of two elements:
-            :obj:`width, height`. The size of the image after resizing.
+            :obj:`height, width`. The size of the image after resizing.
         fill (float, tuple or ~numpy.ndarray): The value of padded pixels.
         return_param (bool): Returns information of resizing and offsetting.

@@ -33,30 +33,30 @@ def resize_contain(img, size, fill=0, return_param=False):
         contents are listed below with key, value-type and the description
         of the value.

-        * **x_offset** (*int*): The x coordinate of the top left corner\
-            of the image after placing on the canvas.
         * **y_offset** (*int*): The y coordinate of the top left corner of\
             the image after placing on the canvas.
+        * **x_offset** (*int*): The x coordinate of the top left corner\
+            of the image after placing on the canvas.
         * **scaled_size** (*tuple*): The size to which the image is scaled\
             to before placing it on a canvas. This is a tuple of two elements:\
-            :obj:`width, height`.
+            :obj:`height, width`.

     """
     C, H, W = img.shape
-    out_W, out_H = size
+    out_H, out_W = size
     scale_h = out_H / float(H)
     scale_w = out_W / float(W)
     scale = min(min(scale_h, scale_w), 1.)
-    scaled_size = (int(W * scale), int(H * scale))
+    scaled_size = (int(H * scale), int(W * scale))

     if scale < 1.:
         img = resize(img, scaled_size)

-    x_slice, y_slice = _get_pad_slice(img, size=size)
+    y_slice, x_slice = _get_pad_slice(img, size=size)

     out_img = np.empty((C, out_H, out_W), dtype=img.dtype)
     out_img[:] = np.array(fill).reshape(-1, 1, 1)
     out_img[:, y_slice, x_slice] = img

     if return_param:
-        param = {'x_offset': x_slice.start, 'y_offset': y_slice.start,
+        param = {'y_offset': y_slice.start, 'x_offset': x_slice.start,
                  'scaled_size': scaled_size}
         return out_img, param
     else:
@@ -68,27 +68,28 @@ def _get_pad_slice(img, size):

     Args:
         img (~numpy.ndarray): This image is in format CHW.
-        size (tuple of two ints): (max_W, max_H).
+        size (tuple of two ints): (max_H, max_W).

     """
     _, H, W = img.shape

-    if W < size[0]:
-        diff_x = size[0] - W
-        margin_x = diff_x / 2
-        if diff_x % 2 == 0:
-            x_slice = slice(int(margin_x), int(size[0] - margin_x))
+    if H < size[0]:
+        diff_y = size[0] - H
+        margin_y = diff_y / 2
+        if diff_y % 2 == 0:
+            y_slice = slice(int(margin_y), int(size[0] - margin_y))
         else:
-            x_slice = slice(int(margin_x), int(size[0] - margin_x - 1))
+            y_slice = slice(int(margin_y), int(size[0] - margin_y - 1))
     else:
-        x_slice = slice(0, int(size[0]))
+        y_slice = slice(0, int(size[0]))

-    if H < size[1]:
-        diff_y = size[1] - H
-        margin_y = diff_y / 2
-        if diff_y % 2 == 0:
-            y_slice = slice(int(margin_y), int(size[1] - margin_y))
+    if W < size[1]:
+        diff_x = size[1] - W
+        margin_x = diff_x / 2
+        if diff_x % 2 == 0:
+            x_slice = slice(int(margin_x), int(size[1] - margin_x))
         else:
-            y_slice = slice(int(margin_y), int(size[1] - margin_y - 1))
+            x_slice = slice(int(margin_x), int(size[1] - margin_x - 1))
     else:
-        y_slice = slice(0, int(size[1]))
-    return x_slice, y_slice
+        x_slice = slice(0, int(size[1]))
+
+    return y_slice, x_slice
diff --git a/chainercv/transforms/image/scale.py b/chainercv/transforms/image/scale.py
index 0700a9b258..32b1c13a7c 100644
--- a/chainercv/transforms/image/scale.py
+++ b/chainercv/transforms/image/scale.py
@@ -27,19 +27,21 @@ def scale(img, size, fit_short=True):
     _, H, W = img.shape

     # If resizing is not necessary, return the input as is.
-    if fit_short and (W <= H and W == size) or (H <= W and H == size):
+    if fit_short and (H <= W and H == size) or (W <= H and W == size):
         return img
-    if not fit_short and (W >= H and W == size) or (H >= W and H == size):
+    if not fit_short and (H >= W and H == size) or (W >= H and W == size):
         return img

     if fit_short:
-        if W < H:
-            out_size = (size, int(size * H / W))
+        if H < W:
+            out_size = (size, int(size * W / H))
         else:
-            out_size = (int(size * W / H), size)
+            out_size = (int(size * H / W), size)
+
     else:
-        if W < H:
-            out_size = (int(size * W / H), size)
+        if H < W:
+            out_size = (int(size * H / W), size)
         else:
-            out_size = (size, int(size * H / W))
+            out_size = (size, int(size * W / H))
+
     return resize(img, out_size)
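A worked example of the fit_short behaviour above, with made-up numbers: when fit_short is true the short side is matched to size and the aspect ratio is preserved; otherwise the long side is matched.

    H, W, size = 300, 600, 150

    # fit_short: the short side (H) becomes `size`, the other side scales
    out_size = (size, int(size * W / H))
    assert out_size == (150, 300)

    # fit long (fit_short=False): the long side (W) becomes `size`
    out_size = (int(size * H / W), size)
    assert out_size == (75, 150)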
diff --git a/chainercv/transforms/image/ten_crop.py b/chainercv/transforms/image/ten_crop.py
index 1f363ba07d..79a6029785 100644
--- a/chainercv/transforms/image/ten_crop.py
+++ b/chainercv/transforms/image/ten_crop.py
@@ -25,16 +25,16 @@ def ten_crop(img, size):
         img (~numpy.ndarray): An image array to be cropped. This is in
             CHW format.
         size (tuple): The size of output images after cropping.
-            This value is :math:`(width, height)`.
+            This value is :math:`(height, width)`.

     Returns:
         The cropped arrays. The shape of tensor is :math:`(10, C, H, W)`.

     """
-    W, H = size
+    H, W = size
     iH, iW = img.shape[1:3]

-    if iW < W or iH < H:
+    if iH < H or iW < W:
         raise ValueError('shape of image needs to be larger than output shape')

     crops = np.stack((
diff --git a/chainercv/transforms/keypoint/flip_keypoint.py b/chainercv/transforms/keypoint/flip_keypoint.py
index 749c82eb04..76a89befb3 100644
--- a/chainercv/transforms/keypoint/flip_keypoint.py
+++ b/chainercv/transforms/keypoint/flip_keypoint.py
@@ -1,28 +1,28 @@
-def flip_keypoint(keypoint, size, x_flip=False, y_flip=False):
+def flip_keypoint(keypoint, size, y_flip=False, x_flip=False):
     """Modify keypoints according to image flips.

     Args:
         keypoint (~numpy.ndarray): Keypoints in the image.
             The shape of this array is :math:`(K, 2)`. :math:`K` is the number
             of keypoints in the image.
-            The last dimension is composed of :math:`x` and :math:`y`
+            The last dimension is composed of :math:`y` and :math:`x`
             coordinates of the keypoints.
-        size (tuple): A tuple of length 2. The width and the height
+        size (tuple): A tuple of length 2. The height and the width
             of the image which is associated with the keypoints.
-        x_flip (bool): Modify keypoints according to a horizontal flip of
-            an image.
         y_flip (bool): Modify keypoints according to a vertical flip of
             an image.
+        x_flip (bool): Modify keypoints according to a horizontal flip of
+            an image.

     Returns:
         ~numpy.ndarray:
         Keypoints modified according to image flips.

     """
-    W, H = size
+    H, W = size
     keypoint = keypoint.copy()
-    if x_flip:
-        keypoint[:, 0] = W - 1 - keypoint[:, 0]
     if y_flip:
-        keypoint[:, 1] = H - 1 - keypoint[:, 1]
+        keypoint[:, 0] = H - 1 - keypoint[:, 0]
+    if x_flip:
+        keypoint[:, 1] = W - 1 - keypoint[:, 1]
     return keypoint
diff --git a/chainercv/transforms/keypoint/resize_keypoint.py b/chainercv/transforms/keypoint/resize_keypoint.py
index 9b79e2ca35..568b08088e 100644
--- a/chainercv/transforms/keypoint/resize_keypoint.py
+++ b/chainercv/transforms/keypoint/resize_keypoint.py
@@ -5,11 +5,11 @@ def resize_keypoint(keypoint, in_size, out_size):
         keypoint (~numpy.ndarray): Keypoints in the image.
             The shape of this array is :math:`(K, 2)`. :math:`K` is the
             number of keypoints in the image.
-            The last dimension is composed of :math:`x` and :math:`y`
+            The last dimension is composed of :math:`y` and :math:`x`
             coordinates of the keypoints.
-        in_size (tuple): A tuple of length 2. The width and the height
+        in_size (tuple): A tuple of length 2. The height and the width
             of the image before being resized.
-        out_size (tuple): A tuple of length 2. The width and the height
+        out_size (tuple): A tuple of length 2. The height and the width
             of the image after being resized.

     Returns:
@@ -18,8 +18,8 @@ def resize_keypoint(keypoint, in_size, out_size):

     """
     keypoint = keypoint.copy()
-    x_scale = float(out_size[0]) / in_size[0]
-    y_scale = float(out_size[1]) / in_size[1]
-    keypoint[:, 0] = x_scale * keypoint[:, 0]
-    keypoint[:, 1] = y_scale * keypoint[:, 1]
+    y_scale = float(out_size[0]) / in_size[0]
+    x_scale = float(out_size[1]) / in_size[1]
+    keypoint[:, 0] = y_scale * keypoint[:, 0]
+    keypoint[:, 1] = x_scale * keypoint[:, 1]
     return keypoint
diff --git a/chainercv/transforms/keypoint/translate_keypoint.py b/chainercv/transforms/keypoint/translate_keypoint.py
index 1719474915..639617ddc5 100644
--- a/chainercv/transforms/keypoint/translate_keypoint.py
+++ b/chainercv/transforms/keypoint/translate_keypoint.py
@@ -1,19 +1,18 @@
-def translate_keypoint(keypoint, x_offset=0, y_offset=0):
+def translate_keypoint(keypoint, y_offset=0, x_offset=0):
     """Translate keypoints.

     This method is mainly used together with image transforms, such as padding
-    and cropping, which translates the top left point of the image from
-    coordinate :math:`(0, 0)` to coordinate :math:`(x\_offset, y\_offset)`.
-
+    and cropping, which translates the top left point of the image
+    to the coordinate :math:`(y, x) = (y\_offset, x\_offset)`.
     Args:
         keypoint (~numpy.ndarray): Keypoints in the image.
             The shape of this array is :math:`(K, 2)`. :math:`K` is the number
             of keypoints in the image.
-            The last dimension is composed of :math:`x` and :math:`y`
+            The last dimension is composed of :math:`y` and :math:`x`
             coordinates of the keypoints.
-        x_offset (int or float): The offset along x axis.
         y_offset (int or float): The offset along y axis.
+        x_offset (int or float): The offset along x axis.
     Returns:
         ~numpy.ndarray:
@@ -23,7 +22,7 @@ def translate_keypoint(keypoint, x_offset=0, y_offset=0):

     out_keypoint = keypoint.copy()

-    out_keypoint[:, 0] += x_offset
-    out_keypoint[:, 1] += y_offset
+    out_keypoint[:, 0] += y_offset
+    out_keypoint[:, 1] += x_offset

     return out_keypoint
diff --git a/chainercv/utils/bbox/bbox_iou.py b/chainercv/utils/bbox/bbox_iou.py
index db286bb624..79f4a40b84 100644
--- a/chainercv/utils/bbox/bbox_iou.py
+++ b/chainercv/utils/bbox/bbox_iou.py
@@ -32,12 +32,12 @@ def bbox_iou(bbox_a, bbox_b):
         raise IndexError

     xp = cuda.get_array_module(bbox_a)

-    # left top
-    lt = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
-    # right bottom
-    rb = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
+    # top left
+    tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
+    # bottom right
+    br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])

-    area_i = xp.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
+    area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2)
     area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
     area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
     return area_i / (area_a[:, None] + area_b - area_i)
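The renamed tl/br variables compute the same IoU as before; only the corner naming now matches the (y, x) ordering. A hand check with two made-up boxes:

    import numpy as np

    a = np.array([[0., 0., 4., 4.]])  # area 16
    b = np.array([[0., 2., 4., 6.]])  # area 16

    tl = np.maximum(a[:, None, :2], b[:, :2])                 # (0, 2)
    br = np.minimum(a[:, None, 2:], b[:, 2:])                 # (4, 4)
    inter = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)  # 8

    iou = inter / (16. + 16. - inter)
    print(iou)  # [[0.33333333]]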
diff --git a/chainercv/utils/bbox/non_maximum_suppression.py b/chainercv/utils/bbox/non_maximum_suppression.py
index 1d28a99dc7..cc9382d967 100644
--- a/chainercv/utils/bbox/non_maximum_suppression.py
+++ b/chainercv/utils/bbox/non_maximum_suppression.py
@@ -31,7 +31,7 @@ def non_maximum_suppression(bbox, thresh, score=None,
     The bounding boxes are expected to be packed into a two dimensional
     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
     bounding boxes in the image. The second axis represents attributes of
-    the bounding box. They are :obj:`(x_min, y_min, x_max, y_max)`,
+    the bounding box. They are :obj:`(y_min, x_min, y_max, x_max)`,
     where the four attributes are coordinates of the top left and the
     bottom right vertices.

@@ -80,9 +80,9 @@ def _non_maximum_suppression_cpu(bbox, thresh, score=None, limit=None):
     selec = np.zeros(bbox.shape[0], dtype=bool)

     for i, b in enumerate(bbox):
-        lt = np.maximum(b[:2], bbox[selec, :2])
-        rb = np.minimum(b[2:], bbox[selec, 2:])
-        area = np.prod(rb - lt, axis=1) * (lt < rb).all(axis=1)
+        tl = np.maximum(b[:2], bbox[selec, :2])
+        br = np.minimum(b[2:], bbox[selec, 2:])
+        area = np.prod(br - tl, axis=1) * (tl < br).all(axis=1)

         iou = area / (bbox_area[i] + bbox_area[selec] - area)
         if (iou >= thresh).any():
@@ -127,13 +127,13 @@ def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None):
     __device__
     inline float devIoU(float const *const bbox_a, float const *const bbox_b) {
-      float left = max(bbox_a[0], bbox_b[0]);
-      float right = min(bbox_a[2], bbox_b[2]);
-      float top = max(bbox_a[1], bbox_b[1]);
-      float bottom = min(bbox_a[3], bbox_b[3]);
-      float width = max(right - left, 0.f);
+      float top = max(bbox_a[0], bbox_b[0]);
+      float bottom = min(bbox_a[2], bbox_b[2]);
+      float left = max(bbox_a[1], bbox_b[1]);
+      float right = min(bbox_a[3], bbox_b[3]);
       float height = max(bottom - top, 0.f);
-      float area_i = width * height;
+      float width = max(right - left, 0.f);
+      float area_i = height * width;
       float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]);
       float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]);
       return area_i / (area_a + area_b - area_i);
diff --git a/chainercv/utils/testing/generate_random_bbox.py b/chainercv/utils/testing/generate_random_bbox.py
index 95f39b7026..fef310d9f4 100644
--- a/chainercv/utils/testing/generate_random_bbox.py
+++ b/chainercv/utils/testing/generate_random_bbox.py
@@ -6,7 +6,7 @@ def generate_random_bbox(n, img_size, min_length, max_length):

     Args:
         n (int): The number of bounding boxes.
-        img_size (tuple): A tuple of length 2. The width and the height
+        img_size (tuple): A tuple of length 2. The height and the width
             of the image on which bounding boxes locate.
         min_length (float): The minimum length of edges of bounding boxes.
         max_length (float): The maximum length of edges of bounding boxes.
@@ -15,17 +15,17 @@ def generate_random_bbox(n, img_size, min_length, max_length):
         numpy.ndarray:
         Coordinates of bounding boxes.
         Its shape is :math:`(R, 4)`. \
        Here, :math:`R` equals :obj:`n`.
-        The second axis contains :math:`x_{min}, y_{min}, x_{max}, y_{max}`,
+        The second axis contains :math:`y_{min}, x_{min}, y_{max}, x_{max}`,
        where
-        :math:`min\_length \\leq x_{max} - x_{min} < max\_length`
-        and :math:`min\_length \\leq y_{max} - y_{min} < max\_length`.
+ and + :math:`min\_length \\leq x_{max} - x_{min} < max\_length` """ - W, H = img_size - x_min = np.random.uniform(0, W - max_length, size=(n,)) + H, W = img_size y_min = np.random.uniform(0, H - max_length, size=(n,)) - x_max = x_min + np.random.uniform(min_length, max_length, size=(n,)) + x_min = np.random.uniform(0, W - max_length, size=(n,)) y_max = y_min + np.random.uniform(min_length, max_length, size=(n,)) - bbox = np.stack((x_min, y_min, x_max, y_max), axis=1).astype(np.float32) + x_max = x_min + np.random.uniform(min_length, max_length, size=(n,)) + bbox = np.stack((y_min, x_min, y_max, x_max), axis=1).astype(np.float32) return bbox diff --git a/chainercv/visualizations/vis_bbox.py b/chainercv/visualizations/vis_bbox.py index 016a98949c..e70110a90e 100644 --- a/chainercv/visualizations/vis_bbox.py +++ b/chainercv/visualizations/vis_bbox.py @@ -23,7 +23,7 @@ def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None): bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in the image. Each element is organized - by :obj:`(x_min, y_min, x_max, y_max)` in the second axis. + by :obj:`(y_min, x_min, y_max, x_max)` in the second axis. label (~numpy.ndarray): An integer array of shape :math:`(R,)`. The values correspond to id for label names stored in :obj:`label_names`. This is optional. @@ -55,9 +55,9 @@ def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None): return ax for i, bb in enumerate(bbox): - xy = (bb[0], bb[1]) - width = bb[2] - bb[0] - height = bb[3] - bb[1] + xy = (bb[1], bb[0]) + height = bb[2] - bb[0] + width = bb[3] - bb[1] ax.add_patch(plot.Rectangle( xy, width, height, fill=False, edgecolor='red', linewidth=3)) @@ -73,7 +73,7 @@ def vis_bbox(img, bbox, label=None, score=None, label_names=None, ax=None): caption.append('{:.2f}'.format(sc)) if len(caption) > 0: - ax.text(bb[0], bb[1], + ax.text(bb[1], bb[0], ': '.join(caption), style='italic', bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 10}) diff --git a/chainercv/visualizations/vis_keypoint.py b/chainercv/visualizations/vis_keypoint.py index 6c34a63bd3..6974593652 100644 --- a/chainercv/visualizations/vis_keypoint.py +++ b/chainercv/visualizations/vis_keypoint.py @@ -24,7 +24,7 @@ def vis_keypoint(img, keypoint, kp_mask=None, ax=None): keypoint (~numpy.ndarray): An array with keypoint pairs whose shape is :math:`(K, 2)`, where :math:`K` is the number of keypoints in the array. - The second axis corresponds to :math:`x` and :math:`y` coordinates + The second axis corresponds to :math:`y` and :math:`x` coordinates of the keypoint. kp_mask (~numpy.ndarray, optional): A boolean array whose shape is :math:`(K,)`. 
            :math:`(K,)`. If :math:`i` th index is :obj:`True`, the
@@ -53,7 +53,7 @@ def vis_keypoint(img, keypoint, kp_mask=None, ax=None):

     for i in range(n_kp):
         if kp_mask[i]:
-            ax.scatter(keypoint[i][0], keypoint[i][1], c=colors[i], s=100)
+            ax.scatter(keypoint[i][1], keypoint[i][0], c=colors[i], s=100)

     ax.set_xlim(left=0, right=W)
     ax.set_ylim(bottom=H - 1, top=0)
diff --git a/examples/faster_rcnn/train.py b/examples/faster_rcnn/train.py
index bd3e6ddd6f..c4a09816ad 100644
--- a/examples/faster_rcnn/train.py
+++ b/examples/faster_rcnn/train.py
@@ -57,12 +57,13 @@ def transform(in_data):
         img = faster_rcnn.prepare(img)
         _, o_H, o_W = img.shape
         scale = o_H / H
-        bbox = transforms.resize_bbox(bbox, (W, H), (o_W, o_H))
+        bbox = transforms.resize_bbox(bbox, (H, W), (o_H, o_W))

         # horizontally flip
         img, params = transforms.random_flip(
             img, x_random=True, return_param=True)
-        bbox = transforms.flip_bbox(bbox, (o_W, o_H), params['x_flip'])
+        bbox = transforms.flip_bbox(
+            bbox, (o_H, o_W), x_flip=params['x_flip'])

         return img, bbox, label, scale

     train_data = TransformDataset(train_data, transform)
diff --git a/examples/ssd/caffe2npz.py b/examples/ssd/caffe2npz.py
index f72b193c8a..bbedf0abbf 100644
--- a/examples/ssd/caffe2npz.py
+++ b/examples/ssd/caffe2npz.py
@@ -67,6 +67,18 @@ def _skip_layer(self, _):
         pass


+def convert_xy_conv(l):
+    b = l.b.data.reshape(-1, 4)
+    b = b[:, [1, 0, 3, 2]]
+
+    out_C, in_C, kh, kw = l.W.shape
+    W = l.W.data.reshape(-1, 4, in_C, kh, kw)
+    W = W[:, [1, 0, 3, 2]]
+
+    l.b.data[:] = b.reshape(-1)
+    l.W.data[:] = W.reshape(-1, in_C, kh, kw)
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('caffemodel')
@@ -78,6 +90,14 @@ def main():
     # Convert weights so that they accept RGB images.
     model['extractor/conv1_1'].W.data[:] =\
         model['extractor/conv1_1'].W.data[:, ::-1]
+
+    # The pretrained model outputs coordinates in xy convention.
+    # This needs to be changed to yx convention, which is used
+    # in ChainerCV.
+    for child in model.children():
+        if child.name.startswith('multibox/loc'):
+            convert_xy_conv(model[child.name])
+
     serializers.save_npz(args.output, model)
diff --git a/tests/evaluations_tests/test_eval_detection_voc_ap.py b/tests/evaluations_tests/test_eval_detection_voc_ap.py
index 62cd3c0166..22f11d9c47 100644
--- a/tests/evaluations_tests/test_eval_detection_voc_ap.py
+++ b/tests/evaluations_tests/test_eval_detection_voc_ap.py
@@ -181,14 +181,14 @@ class TestEvalDetectionVOCAP(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         base_url = 'https://github.com/yuyu2172/' \
-            'share-weights/releases/download/0.0.2'
+            'share-weights/releases/download/0.0.3'

         cls.dataset = np.load(request.urlretrieve(os.path.join(
             base_url,
-            'voc_detection_dataset_2007_test_truncated_2017_06_02.npz'))[0])
+            'voc_detection_dataset_2007_test_truncated_2017_06_06.npz'))[0])
         cls.result = np.load(request.urlretrieve(os.path.join(
             base_url,
-            'voc_detection_result_2007_test_truncated_2017_06_02.npz'))[0])
+            'voc_detection_result_2007_test_truncated_2017_06_06.npz'))[0])

     def test_eval_detection_voc_ap(self):
         pred_bboxes = self.result['bboxes']
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py b/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py
index fa6a4a7fd5..b9decac22d 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/dummy_faster_rcnn.py
@@ -59,10 +59,10 @@ def __call__(self, x, img_size, scale, test=False):
         rpn_locs = _random_array(self.xp, (B, n_anchor, 4))
         rpn_cls_scores = _random_array(self.xp, (B, n_anchor, 2))
         rois = self.xp.asarray(generate_random_bbox(
-            self.n_roi, img_size[::-1], 16, min(img_size)))
+            self.n_roi, img_size, 16, min(img_size)))
         roi_indices = self.xp.zeros((len(rois),), dtype=np.int32)
         anchor = self.xp.asarray(generate_random_bbox(
-            n_anchor, img_size[::-1], 16, min(img_size)))
+            n_anchor, img_size, 16, min(img_size)))
         return (chainer.Variable(rpn_locs),
                 chainer.Variable(rpn_cls_scores), rois, roi_indices, anchor)
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py b/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py
index 445920cfd4..571347d2d8 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/test_faster_rcnn_vgg.py
@@ -41,7 +41,7 @@ def check_call(self):
         x = chainer.Variable(
             xp.random.uniform(
                 low=-1., high=1.,
-                size=(self.B, 3, feat_size[1] * 16, feat_size[0] * 16)
+                size=(self.B, 3, feat_size[0] * 16, feat_size[1] * 16)
             ).astype(np.float32), volatile=chainer.flag.ON)
         roi_cls_locs, roi_scores, rois, roi_indices = self.link(
             x, test=not self.train)
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py b/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py
index b800c5b264..d4b60238e8 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/test_region_proposal_network.py
@@ -33,7 +33,7 @@ def setUp(self):
             proposal_creator_params=self.proposal_creator_params
         )
         self.x = np.random.uniform(size=(self.B, C, H, W)).astype(np.float32)
-        self.img_size = (W * feat_stride, H * feat_stride)
+        self.img_size = (H * feat_stride, W * feat_stride)

     def _check_call(self, x, img_size, test):
         _, _, H, W = x.shape
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py
index 84bab3352c..97ec20d8e8 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_bbox2loc_loc2bbox.py
@@ -60,7 +60,7 @@ def test_loc2bbox_gpu(self):
 class TestDeltaEncodeDecodeConsistency(unittest.TestCase):

     def setUp(self):
-        self.src_bbox = generate_random_bbox(8, (32, 64), 4, 16)
+        self.src_bbox = generate_random_bbox(8, (64, 32), 4, 16)
         self.dst_bbox = self.src_bbox + 1

     def check_bbox_loc_conversions_consistency(
diff --git a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py
index adaca1c220..4003ef4ad9 100644
--- a/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py
+++ b/tests/links_tests/model_tests/faster_rcnn_tests/utils_tests/test_generate_anchor_base.py
@@ -13,15 +13,15 @@ class TestGenerateAnchorBase(unittest.TestCase):

     def test_generaete_anchor_base(self):
         gt = np.array(
-            [[-120., -24., 136., 40.],
-             [-248., -56., 264., 72.],
-             [-504., -120., 520., 136.],
+            [[-24., -120., 40., 136.],
+             [-56., -248., 72., 264.],
+             [-120., -504., 136., 520.],
              [-56., -56., 72., 72.],
              [-120., -120., 136., 136.],
              [-248., -248., 264., 264.],
-             [-24., -120., 40., 136.],
-             [-56., -248., 72., 264.],
-             [-120., -504., 136., 520.]])
+             [-120., -24., 136., 40.],
+             [-248., -56., 264., 72.],
+             [-504., -120., 520., 136.]])

         base_size = 16
         anchor_scales = [8, 16, 32]
diff --git a/tests/transforms_tests/bbox_tests/test_flip_bbox.py b/tests/transforms_tests/bbox_tests/test_flip_bbox.py
index 39285cdc1c..3cdfc5ee9a 100644
--- a/tests/transforms_tests/bbox_tests/test_flip_bbox.py
+++ b/tests/transforms_tests/bbox_tests/test_flip_bbox.py
@@ -12,16 +12,16 @@ def test_flip_bbox(self):
         bbox = np.random.uniform(
             low=0., high=32., size=(10, 4))

-        out = flip_bbox(bbox, size=(32, 34), x_flip=True)
+        out = flip_bbox(bbox, size=(34, 32), y_flip=True)
         bbox_expected = bbox.copy()
-        bbox_expected[:, 0] = 31 - bbox[:, 2]
-        bbox_expected[:, 2] = 31 - bbox[:, 0]
+        bbox_expected[:, 0] = 33 - bbox[:, 2]
+        bbox_expected[:, 2] = 33 - bbox[:, 0]
         np.testing.assert_equal(out, bbox_expected)

-        out = flip_bbox(bbox, size=(32, 34), y_flip=True)
+        out = flip_bbox(bbox, size=(34, 32), x_flip=True)
         bbox_expected = bbox.copy()
-        bbox_expected[:, 1] = 33 - bbox[:, 3]
-        bbox_expected[:, 3] = 33 - bbox[:, 1]
+        bbox_expected[:, 1] = 31 - bbox[:, 3]
+        bbox_expected[:, 3] = 31 - bbox[:, 1]
         np.testing.assert_equal(out, bbox_expected)
diff --git a/tests/transforms_tests/bbox_tests/test_resize_bbox.py b/tests/transforms_tests/bbox_tests/test_resize_bbox.py
index c9dfe5b085..effd847adc 100644
--- a/tests/transforms_tests/bbox_tests/test_resize_bbox.py
+++ b/tests/transforms_tests/bbox_tests/test_resize_bbox.py
@@ -12,12 +12,12 @@ def test_resize_bbox(self):
         bbox = np.random.uniform(
             low=0., high=32., size=(10, 4))

-        out = resize_bbox(bbox, in_size=(32, 32), out_size=(128, 64))
+        out = resize_bbox(bbox, in_size=(32, 32), out_size=(64, 128))
         bbox_expected = bbox.copy()
-        bbox_expected[:, 0] = bbox[:, 0] * 4
-        bbox_expected[:, 1] = bbox[:, 1] * 2
-        bbox_expected[:, 2] = bbox[:, 2] * 4
-        bbox_expected[:, 3] = bbox[:, 3] * 2
+        bbox_expected[:, 0] = bbox[:, 0] * 2
+        bbox_expected[:, 1] = bbox[:, 1] * 4
+        bbox_expected[:, 2] = bbox[:, 2] * 2
+        bbox_expected[:, 3] = bbox[:, 3] * 4
         np.testing.assert_equal(out, bbox_expected)
diff --git a/tests/transforms_tests/bbox_tests/test_translate_bbox.py b/tests/transforms_tests/bbox_tests/test_translate_bbox.py
index e021ee4e6a..ffca0982a8 100644
--- a/tests/transforms_tests/bbox_tests/test_translate_bbox.py
+++ b/tests/transforms_tests/bbox_tests/test_translate_bbox.py
@@ -12,12 +12,12 @@ def test_translate_bbox(self):
         bbox = np.random.uniform(
             low=0., high=32., size=(10, 4))

-        out = translate_bbox(bbox, x_offset=3, y_offset=5)
+        out = translate_bbox(bbox, y_offset=5, x_offset=3)
         bbox_expected = np.empty_like(bbox)
-        bbox_expected[:, 0] = bbox[:, 0] + 3
-        bbox_expected[:, 1] = bbox[:, 1] + 5
-        bbox_expected[:, 2] = bbox[:, 2] + 3
-        bbox_expected[:, 3] = bbox[:, 3] + 5
+        bbox_expected[:, 0] = bbox[:, 0] + 5
+        bbox_expected[:, 1] = bbox[:, 1] + 3
+        bbox_expected[:, 2] = bbox[:, 2] + 5
+        bbox_expected[:, 3] = bbox[:, 3] + 3

         np.testing.assert_equal(out, bbox_expected)
diff --git a/tests/transforms_tests/image_tests/test_center_crop.py b/tests/transforms_tests/image_tests/test_center_crop.py
index 059f8f242b..b3505fcf7b 100644
--- a/tests/transforms_tests/image_tests/test_center_crop.py
+++ b/tests/transforms_tests/image_tests/test_center_crop.py
@@ -11,13 +11,13 @@ class TestCenterCrop(unittest.TestCase):
     def test_center_crop(self):
         img = np.random.uniform(size=(3, 48, 32))

-        out, param = center_crop(img, (16, 24), return_param=True)
-        x_slice = param['x_slice']
+        out, param = center_crop(img, (24, 16), return_param=True)
         y_slice = param['y_slice']
+        x_slice = param['x_slice']

         np.testing.assert_equal(out, img[:, y_slice, x_slice])
-        self.assertEqual(x_slice, slice(8, 24))
         self.assertEqual(y_slice, slice(12, 36))
+        self.assertEqual(x_slice, slice(8, 24))


 testing.run_module(__name__, __file__)
diff --git a/tests/transforms_tests/image_tests/test_flip_transform.py b/tests/transforms_tests/image_tests/test_flip_transform.py
index 1efa8d77ee..e2e3161fd5 100644
--- a/tests/transforms_tests/image_tests/test_flip_transform.py
+++ b/tests/transforms_tests/image_tests/test_flip_transform.py
@@ -11,7 +11,7 @@ class TestRandomFlip(unittest.TestCase):
     def test_random_flip(self):
         img = np.random.uniform(size=(3, 24, 24))

-        out = flip(img, x_flip=True, y_flip=True)
+        out = flip(img, y_flip=True, x_flip=True)

         expected = img
         expected = expected[:, :, ::-1]
@@ -21,7 +21,7 @@ def test_random_flip(self):
     def test_random_flip_vertical(self):
         img = np.random.uniform(size=(3, 24, 24))

-        out = flip(img, x_flip=False, y_flip=True)
+        out = flip(img, y_flip=True, x_flip=False)

         expected = img
         expected = expected[:, ::-1, :]
diff --git a/tests/transforms_tests/image_tests/test_random_crop.py b/tests/transforms_tests/image_tests/test_random_crop.py
index bf814ece37..a4081b3a5d 100644
--- a/tests/transforms_tests/image_tests/test_random_crop.py
+++ b/tests/transforms_tests/image_tests/test_random_crop.py
@@ -11,14 +11,14 @@ class TestRandomCrop(unittest.TestCase):
     def test_random_crop(self):
         img = np.random.uniform(size=(3, 48, 32))

-        out, param = random_crop(img, (32, 48), return_param=True)
-        x_slice = param['x_slice']
+        out, param = random_crop(img, (48, 32), return_param=True)
         y_slice = param['y_slice']
+        x_slice = param['x_slice']

         np.testing.assert_equal(out, img)
-        self.assertEqual(x_slice, slice(0, 32))
         self.assertEqual(y_slice, slice(0, 48))
+        self.assertEqual(x_slice, slice(0, 32))

-        out = random_crop(img, (12, 24))
+        out = random_crop(img, (24, 12))
         self.assertEqual(out.shape[1:], (24, 12))


 testing.run_module(__name__, __file__)
diff --git a/tests/transforms_tests/image_tests/test_random_expand.py b/tests/transforms_tests/image_tests/test_random_expand.py
index b52ab01e40..77e2d9778b 100644
--- a/tests/transforms_tests/image_tests/test_random_expand.py
+++ b/tests/transforms_tests/image_tests/test_random_expand.py
@@ -23,8 +23,8 @@ def test_random_expand(self):
         out, param = random_expand(
             img, max_ratio=self.max_ratio, return_param=True)
         ratio = param['ratio']
-        x_offset = param['x_offset']
         y_offset = param['y_offset']
+        x_offset = param['x_offset']
         np.testing.assert_equal(
             out[:, y_offset:y_offset + 64, x_offset:x_offset + 32], img)
         self.assertGreaterEqual(ratio, 1)
@@ -47,9 +47,9 @@ def test_random_expand_fill(self):
         while True:
             out, param = random_expand(img, fill=self.fill, return_param=True)
-            x_offset = param['x_offset']
             y_offset = param['y_offset']
-            if x_offset > 0 or y_offset > 0:
+            x_offset = param['x_offset']
+            if y_offset > 0 or x_offset > 0:
                 break

         if isinstance(self.fill, int):
diff --git a/tests/transforms_tests/image_tests/test_random_flip_transform.py b/tests/transforms_tests/image_tests/test_random_flip.py
similarity index 90%
rename from tests/transforms_tests/image_tests/test_random_flip_transform.py
rename to tests/transforms_tests/image_tests/test_random_flip.py
index 712195249a..2574c41c29 100644
--- a/tests/transforms_tests/image_tests/test_random_flip_transform.py
+++ b/tests/transforms_tests/image_tests/test_random_flip.py
@@ -12,15 +12,15 @@ def test_random_flip(self):
         img = np.random.uniform(size=(3, 24, 24))

         out, param = random_flip(
-            img, x_random=True, y_random=True, return_param=True)
-        x_flip = param['x_flip']
+            img, y_random=True, x_random=True, return_param=True)
         y_flip = param['y_flip']
+        x_flip = param['x_flip']

         expected = img
-        if x_flip:
-            expected = expected[:, :, ::-1]
         if y_flip:
             expected = expected[:, ::-1, :]
+        if x_flip:
+            expected = expected[:, :, ::-1]

         np.testing.assert_equal(out, expected)
diff --git a/tests/transforms_tests/image_tests/test_resize.py b/tests/transforms_tests/image_tests/test_resize.py
index 853a28066a..805f796198 100644
--- a/tests/transforms_tests/image_tests/test_resize.py
+++ b/tests/transforms_tests/image_tests/test_resize.py
@@ -17,12 +17,12 @@ class TestResize(unittest.TestCase):
     def test_resize_color(self):
         img = np.random.uniform(size=(3, 24, 32))

-        out = resize(img, size=(64, 32), interpolation=self.interpolation)
+        out = resize(img, size=(32, 64), interpolation=self.interpolation)
         self.assertEqual(out.shape, (3, 32, 64))

     def test_resize_grayscale(self):
         img = np.random.uniform(size=(1, 24, 32))

-        out = resize(img, size=(64, 32), interpolation=self.interpolation)
+        out = resize(img, size=(32, 64), interpolation=self.interpolation)
         self.assertEqual(out.shape, (1, 32, 64))
diff --git a/tests/transforms_tests/image_tests/test_resize_contain.py b/tests/transforms_tests/image_tests/test_resize_contain.py
index a7462d4db9..17442c2b66 100644
--- a/tests/transforms_tests/image_tests/test_resize_contain.py
+++ b/tests/transforms_tests/image_tests/test_resize_contain.py
@@ -17,31 +17,31 @@ def test_resize_contain(self):
         img = np.random.uniform(size=(3, 32, 64))

         out, param = resize_contain(
-            img, (96, 48), fill=self.fill, return_param=True)
+            img, (48, 96), fill=self.fill, return_param=True)

         np.testing.assert_array_equal(img, out[:, 8:40, 16:80])
         np.testing.assert_array_equal(self.fill, out[:, 0, 0])
-        self.assertEqual(param['scaled_size'], (64, 32))
-        self.assertEqual(param['x_offset'], 16)
+        self.assertEqual(param['scaled_size'], (32, 64))
         self.assertEqual(param['y_offset'], 8)
+        self.assertEqual(param['x_offset'], 16)

     def test_resize_contain_canvas_small_x(self):
         img = np.random.uniform(size=(3, 32, 64))

         out, param = resize_contain(
-            img, (68, 16), fill=self.fill, return_param=True)
-        self.assertEqual(param['scaled_size'], (32, 16))
-        self.assertEqual(param['x_offset'], 18)
+            img, (16, 68), fill=self.fill, return_param=True)
+        self.assertEqual(param['scaled_size'], (16, 32))
         self.assertEqual(param['y_offset'], 0)
+        self.assertEqual(param['x_offset'], 18)

     def test_resize_contain_canvas_small_y(self):
         img = np.random.uniform(size=(3, 32, 64))

         out, param = resize_contain(
-            img, (16, 24), fill=self.fill, return_param=True)
-        self.assertEqual(param['scaled_size'], (16, 8))
-        self.assertEqual(param['x_offset'], 0)
+            img, (24, 16), fill=self.fill, return_param=True)
+        self.assertEqual(param['scaled_size'], (8, 16))
         self.assertEqual(param['y_offset'], 8)
+        self.assertEqual(param['x_offset'], 0)


 testing.run_module(__name__, __file__)
diff --git a/tests/transforms_tests/image_tests/test_ten_crop.py b/tests/transforms_tests/image_tests/test_ten_crop.py
index a220470dee..e09c4a2d20 100644
--- a/tests/transforms_tests/image_tests/test_ten_crop.py
+++ b/tests/transforms_tests/image_tests/test_ten_crop.py
@@ -11,14 +11,14 @@ class TestTenCrop(unittest.TestCase):
     def test_ten_crop(self):
         img = np.random.uniform(size=(3, 48, 32))

-        out = ten_crop(img, (32, 48))
+        out = ten_crop(img, (48, 32))
         self.assertEqual(out.shape, (10, 3, 48, 32))
         for crop in out[:5]:
             np.testing.assert_equal(crop, img)
         for crop in out[5:]:
             np.testing.assert_equal(crop[:, :, ::-1], img)

-        out = ten_crop(img, (12, 24))
+        out = ten_crop(img, (24, 12))
         self.assertEqual(out.shape, (10, 3, 24, 12))
diff --git a/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py b/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py
index d60d46ac2c..f059b47567 100644
--- a/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py
+++ b/tests/transforms_tests/keypoint_tests/test_flip_keypoint.py
@@ -12,14 +12,14 @@ def test_flip_keypoint(self):
         keypoint = np.random.uniform(
             low=0., high=32., size=(12, 2))

-        out = flip_keypoint(keypoint, size=(32, 34), x_flip=True)
+        out = flip_keypoint(keypoint, size=(34, 32), y_flip=True)
         keypoint_expected = keypoint.copy()
-        keypoint_expected[:, 0] = 31 - keypoint[:, 0]
+        keypoint_expected[:, 0] = 33 - keypoint[:, 0]
         np.testing.assert_equal(out, keypoint_expected)

-        out = flip_keypoint(keypoint, size=(32, 34), y_flip=True)
+        out = flip_keypoint(keypoint, size=(34, 32), x_flip=True)
         keypoint_expected = keypoint.copy()
-        keypoint_expected[:, 1] = 33 - keypoint[:, 1]
+        keypoint_expected[:, 1] = 31 - keypoint[:, 1]
         np.testing.assert_equal(out, keypoint_expected)
diff --git a/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py b/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py
index 8f70a98d8e..5ed73951bc 100644
--- a/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py
+++ b/tests/transforms_tests/keypoint_tests/test_resize_keypoint.py
@@ -12,8 +12,9 @@ def test_resize_keypoint(self):
         keypoint = np.random.uniform(
             low=0., high=32., size=(12, 2))

-        out = resize_keypoint(keypoint, in_size=(32, 32), out_size=(64, 64))
-        keypoint[:, :2] *= 2
+        out = resize_keypoint(keypoint, in_size=(16, 32), out_size=(8, 64))
+        keypoint[:, 0] *= 0.5
+        keypoint[:, 1] *= 2
         np.testing.assert_equal(out, keypoint)
diff --git a/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py b/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py
index 5e95fe5783..4978dd22cd 100644
--- a/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py
+++ b/tests/transforms_tests/keypoint_tests/test_translate_keypoint.py
@@ -12,7 +12,7 @@ def test_translate_keypoint(self):
         keypoint = np.random.uniform(
             low=0., high=32., size=(10, 2))

-        out = translate_keypoint(keypoint, x_offset=3, y_offset=5)
+        out = translate_keypoint(keypoint, y_offset=3, x_offset=5)
         expected = np.empty_like(keypoint)
         expected[:, 0] = keypoint[:, 0] + 3
         expected[:, 1] = keypoint[:, 1] + 5
diff --git a/tests/utils_tests/testing_tests/test_geenerate_random_bbox.py b/tests/utils_tests/testing_tests/test_generate_random_bbox.py
similarity index 92%
rename from tests/utils_tests/testing_tests/test_geenerate_random_bbox.py
rename to tests/utils_tests/testing_tests/test_generate_random_bbox.py
index d28c956991..c59cfdd788 100644
--- a/tests/utils_tests/testing_tests/test_geenerate_random_bbox.py
+++ b/tests/utils_tests/testing_tests/test_generate_random_bbox.py
@@ -23,12 +23,12 @@ def test_generate_random_bbox(self):
         self.assertTrue(np.all(bbox[:, [1, 3]] < img_size[1]))
         self.assertTrue(np.all(bbox[:, [1, 3]] >= 0))

-        w = bbox[:, 2] - bbox[:, 0]
-        h = bbox[:, 3] - bbox[:, 1]
-        self.assertTrue(np.all(w < max_length))
-        self.assertTrue(np.all(w >= min_length))
+        h = bbox[:, 2] - bbox[:, 0]
+        w = bbox[:, 3] - bbox[:, 1]
         self.assertTrue(np.all(h < max_length))
         self.assertTrue(np.all(h >= min_length))
+        self.assertTrue(np.all(w < max_length))
+        self.assertTrue(np.all(w >= min_length))


 testing.run_module(__name__, __file__)
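Note: boxes loaded from sources outside ChainerCV will typically still be in `(x_min, y_min, x_max, y_max)` order. The following is a minimal sketch of the one-time conversion such callers need; `xy_to_yx` is a hypothetical helper, not part of this patch, but it applies the same `[1, 0, 3, 2]` column permutation that `convert_xy_conv` above applies to the SSD localization weights.

    import numpy as np

    # Hypothetical helper: swap (x_min, y_min, x_max, y_max) columns into
    # the (y_min, x_min, y_max, x_max) order used throughout ChainerCV.
    def xy_to_yx(bbox):
        return bbox[:, [1, 0, 3, 2]]

    bbox_xy = np.array([[10., 20., 50., 60.]], dtype=np.float32)
    print(xy_to_yx(bbox_xy))  # [[20. 10. 60. 50.]]

The permutation is its own inverse, so the same helper also converts yx boxes back to xy order.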