[Feature] Support single depth to 3D hand keypoints baseline, NYU Hand Pose Dataset and AWR #1487

Open · wants to merge 1 commit into base: dev-0.29
92 changes: 92 additions & 0 deletions configs/_base_/datasets/nyu.py
@@ -0,0 +1,92 @@
dataset_info = dict(
dataset_name='nyu',
paper_info=dict(
author='Jonathan Tompson and Murphy Stein and Yann Lecun and '
'Ken Perlin',
title='Real-Time Continuous Pose Recovery of Human Hands '
'Using Convolutional Networks',
container='ACM Transactions on Graphics',
year='2014',
homepage='https://jonathantompson.github.io/NYU_Hand_Pose_Dataset.htm',
),
keypoint_info={
0: dict(name='F1_KNU3_A', id=0, color=[255, 128, 0], type='', swap=''),
1: dict(name='F1_KNU3_B', id=1, color=[255, 128, 0], type='', swap=''),
2: dict(name='F1_KNU2_A', id=2, color=[255, 128, 0], type='', swap=''),
3: dict(name='F1_KNU2_B', id=3, color=[255, 128, 0], type='', swap=''),
4: dict(name='F1_KNU1_A', id=4, color=[255, 153, 255], type='', swap=''),
5: dict(name='F1_KNU1_B', id=5, color=[255, 153, 255], type='', swap=''),
6: dict(name='F2_KNU3_A', id=6, color=[255, 153, 255], type='', swap=''),
7: dict(name='F2_KNU3_B', id=7, color=[255, 153, 255], type='', swap=''),
8: dict(name='F2_KNU2_A', id=8, color=[102, 178, 255], type='', swap=''),
9: dict(name='F2_KNU2_B', id=9, color=[102, 178, 255], type='', swap=''),
10: dict(name='F2_KNU1_A', id=10, color=[102, 178, 255], type='', swap=''),
11: dict(name='F2_KNU1_B', id=11, color=[102, 178, 255], type='', swap=''),
12: dict(name='F3_KNU3_A', id=12, color=[255, 51, 51], type='', swap=''),
13: dict(name='F3_KNU3_B', id=13, color=[255, 51, 51], type='', swap=''),
14: dict(name='F3_KNU2_A', id=14, color=[255, 51, 51], type='', swap=''),
15: dict(name='F3_KNU2_B', id=15, color=[255, 51, 51], type='', swap=''),
16: dict(name='F3_KNU1_A', id=16, color=[0, 255, 0], type='', swap=''),
17: dict(name='F3_KNU1_B', id=17, color=[0, 255, 0], type='', swap=''),
18: dict(name='F4_KNU3_A', id=18, color=[0, 255, 0], type='', swap=''),
19: dict(name='F4_KNU3_B', id=19, color=[0, 255, 0], type='', swap=''),
20: dict(name='F4_KNU2_A', id=20, color=[255, 255, 255], type='', swap=''),
21: dict(name='F4_KNU2_B', id=21, color=[255, 128, 0], type='', swap=''),
22: dict(name='F4_KNU1_A', id=22, color=[255, 128, 0], type='', swap=''),
23: dict(name='F4_KNU1_B', id=23, color=[255, 128, 0], type='', swap=''),
24: dict(name='TH_KNU3_A', id=24, color=[255, 128, 0], type='', swap=''),
25: dict(name='TH_KNU3_B', id=25, color=[255, 153, 255], type='', swap=''),
26: dict(name='TH_KNU2_A', id=26, color=[255, 153, 255], type='', swap=''),
27: dict(name='TH_KNU2_B', id=27, color=[255, 153, 255], type='', swap=''),
28: dict(name='TH_KNU1_A', id=28, color=[255, 153, 255], type='', swap=''),
29: dict(name='TH_KNU1_B', id=29, color=[102, 178, 255], type='', swap=''),
30: dict(name='PALM_1', id=30, color=[102, 178, 255], type='', swap=''),
31: dict(name='PALM_2', id=31, color=[102, 178, 255], type='', swap=''),
32: dict(name='PALM_3', id=32, color=[102, 178, 255], type='', swap=''),
33: dict(name='PALM_4', id=33, color=[255, 51, 51], type='', swap=''),
34: dict(name='PALM_5', id=34, color=[255, 51, 51], type='', swap=''),
35: dict(name='PALM_6', id=35, color=[255, 51, 51], type='', swap=''),
},
skeleton_info={
0: dict(link=('PALM_3', 'F1_KNU2_B'), id=0, color=[255, 128, 0]),
1: dict(link=('F1_KNU2_B', 'F1_KNU3_A'), id=1, color=[255, 128, 0]),
2: dict(link=('PALM_3', 'F2_KNU2_B'), id=2, color=[255, 128, 0]),
3: dict(link=('F2_KNU2_B', 'F2_KNU3_A'), id=3, color=[255, 128, 0]),
4: dict(link=('PALM_3', 'F3_KNU2_B'), id=4, color=[255, 153, 255]),
5: dict(link=('F3_KNU2_B', 'F3_KNU3_A'), id=5, color=[255, 153, 255]),
6: dict(link=('PALM_3', 'F4_KNU2_B'), id=6, color=[255, 153, 255]),
7: dict(link=('F4_KNU2_B', 'F4_KNU3_A'), id=7, color=[255, 153, 255]),
8: dict(link=('PALM_3', 'TH_KNU2_B'), id=8, color=[102, 178, 255]),
9: dict(link=('TH_KNU2_B', 'TH_KNU3_B'), id=9, color=[102, 178, 255]),
10: dict(link=('TH_KNU3_B', 'TH_KNU3_A'), id=10, color=[102, 178, 255]),
11: dict(link=('PALM_3', 'PALM_1'), id=11, color=[102, 178, 255]),
12: dict(link=('PALM_3', 'PALM_2'), id=12, color=[255, 51, 51]),
},
joint_weights=[1.] * 36,
sigmas=[])
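
For context, `_base_` dataset files such as this one are consumed through mmcv's config loader. A minimal sanity-check sketch (the `Config.fromfile` call is standard mmcv and not part of this diff; the assertions just restate the file's contents):

from mmcv import Config

# Load the new base dataset file and check its basic invariants.
cfg = Config.fromfile('configs/_base_/datasets/nyu.py')
info = cfg.dataset_info

assert info['dataset_name'] == 'nyu'
assert len(info['keypoint_info']) == 36   # full NYU joint set
assert len(info['joint_weights']) == 36   # one weight per joint
assert len(info['skeleton_info']) == 13   # links drawn for visualization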
177 changes: 177 additions & 0 deletions configs/hand/3d_kpt_sview_depth_img/awr/nyu/res50_nyu_all_128x128.py
@@ -0,0 +1,177 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/nyu.py'
]
checkpoint_config = dict(interval=1)
# TODO: metric
evaluation = dict(
interval=1,
metric=['MRRPE', 'MPJPE', 'Handedness_acc'],
save_best='MPJPE_all')

optimizer = dict(
type='Adam',
lr=2e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(policy='step', step=[15, 17])
total_epochs = 20
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])

load_from = '/root/mmpose/data/ckpt/new_res50.pth'
used_keypoints_index = [0, 3, 6, 9, 12, 15, 18, 21, 24, 25, 27, 30, 31, 32]

channel_cfg = dict(
num_output_channels=14,
dataset_joints=36,
dataset_channel=used_keypoints_index,
inference_channel=used_keypoints_index)

# model settings
model = dict(
type='Depthhand3D', # pretrained=None
backbone=dict(
type='AWRResNet',
depth=50,
frozen_stages=-1,
zero_init_residual=False,
in_channels=1),
keypoint_head=dict(
type='AdaptiveWeightingRegression3DHead',
offset_head_cfg=dict(
in_channels=256,
out_channels_vector=42,
out_channels_scalar=14,
heatmap_kernel_size=1.0,
),
deconv_head_cfg=dict(
in_channels=2048,
out_channels=256,
depth_size=64,
num_deconv_layers=3,
num_deconv_filters=(256, 256, 256),
num_deconv_kernels=(4, 4, 4),
extra=dict(final_conv_kernel=0, )),
loss_offset=dict(type='AWRSmoothL1Loss', use_target_weight=False),
loss_keypoint=dict(type='AWRSmoothL1Loss', use_target_weight=True),
),
train_cfg=dict(use_img_for_head=True),
test_cfg=dict(use_img_for_head=True, flip_test=False))

data_cfg = dict(
image_size=[128, 128],
heatmap_size=[64, 64, 56],
cube_size=[300, 300, 300],
heatmap_size_root=64,
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])

train_pipeline = [
dict(type='LoadImageFromFile', color_type='unchanged'),
dict(type='TopDownGetBboxCenterScale', padding=1.0),
dict(type='TopDownAffine'),
dict(type='DepthToTensor'),
dict(
type='MultitaskGatherTarget',
pipeline_list=[
[
dict(
type='TopDownGenerateTargetRegression',
use_zero_mean=True,
joint_indices=used_keypoints_index,
is_3d=True,
normalize_depth=True,
),
dict(
type='HandGenerateJointToOffset',
heatmap_kernel_size=1.0,
)
],
[
dict(
type='TopDownGenerateTargetRegression',
use_zero_mean=True,
joint_indices=used_keypoints_index,
is_3d=True,
normalize_depth=True,
)
],
],
pipeline_indices=[0, 1],
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'cube_size', 'center_depth', 'focal',
'princpt', 'image_size', 'joints_cam', 'dataset_channel',
'joints_uvd'
]),
]

val_pipeline = [
dict(type='LoadImageFromFile', color_type='unchanged'),
dict(type='TopDownGetBboxCenterScale', padding=1.0),
dict(type='TopDownAffine'),
dict(type='DepthToTensor'),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'cube_size', 'center_depth', 'focal',
'princpt', 'image_size', 'joints_cam', 'dataset_channel',
'joints_uvd'
])
]

test_pipeline = val_pipeline

data_root = 'data/nyu'
data = dict(
samples_per_gpu=4,
workers_per_gpu=0,
shuffle=False,
train=dict(
type='NYUHandDataset',
ann_file=f'{data_root}/annotations/nyu_test_data.json',
camera_file=f'{data_root}/annotations/nyu_test_camera.json',
joint_file=f'{data_root}/annotations/nyu_test_joint_3d.json',
img_prefix=f'{data_root}/images/test/',
data_cfg=data_cfg,
use_refined_center=False,
align_uvd_xyz_direction=True,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='NYUHandDataset',
ann_file=f'{data_root}/annotations/nyu_test_data.json',
camera_file=f'{data_root}/annotations/nyu_test_camera.json',
joint_file=f'{data_root}/annotations/nyu_test_joint_3d.json',
img_prefix=f'{data_root}/images/test/',
data_cfg=data_cfg,
use_refined_center=False,
align_uvd_xyz_direction=True,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='NYUHandDataset',
ann_file=f'{data_root}/annotations/nyu_test_data.json',
camera_file=f'{data_root}/annotations/nyu_test_camera.json',
joint_file=f'{data_root}/annotations/nyu_test_joint_3d.json',
img_prefix=f'{data_root}/images/test/',
data_cfg=data_cfg,
use_refined_center=False,
align_uvd_xyz_direction=True,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
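
The config above supervises only 14 of NYU's 36 annotated joints via `channel_cfg`. A hedged sketch of what that channel selection amounts to (the array names here are illustrative, not from this diff; only `used_keypoints_index` is taken from the config):

import numpy as np

# Illustrative only: pick the 14 supervised joints out of NYU's 36.
used_keypoints_index = [0, 3, 6, 9, 12, 15, 18, 21, 24, 25, 27, 30, 31, 32]
joints_3d = np.random.rand(36, 3).astype(np.float32)  # full annotation
supervised = joints_3d[used_keypoints_index]          # shape (14, 3)
assert supervised.shape == (14, 3)

Training with this file would then presumably go through the usual entry point, e.g. `python tools/train.py configs/hand/3d_kpt_sview_depth_img/awr/nyu/res50_nyu_all_128x128.py`.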
35 changes: 35 additions & 0 deletions mmpose/core/evaluation/top_down_eval.py
@@ -655,6 +655,41 @@ def keypoints_from_heatmaps3d(heatmaps, center, scale):
return preds, maxvals


def keypoints_from_joint_uvd(joint_uvd, center, scale, image_size):
"""Get final keypoint predictions from 3d heatmaps and transform them back
to the image.

Note:
- batch size: N
- num keypoints: K
- heatmap depth size: D
- heatmap height: H
- heatmap width: W

Args:
heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps.
center (np.ndarray[N, 2]): Center of the bounding box (x, y).
scale (np.ndarray[N, 2]): Scale of the bounding box
wrt height/width.

Returns:
tuple: A tuple containing keypoint predictions and scores.

- preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location \
in images.
- maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
"""
N, K, D = joint_uvd.shape
preds = joint_uvd
maxvals = np.ones((N, K, 1), dtype=np.float32)
# Transform back to the image
for i in range(N):
preds[i, :, :2] = transform_preds(
(preds[i, :, :2] + 1) * image_size[i] / 2, center[i], scale[i],
[image_size[i, 1], image_size[i, 0]])
return preds, maxvals


def multilabel_classification_accuracy(pred, gt, mask, thr=0.5):
"""Get multi-label classification accuracy.

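A hedged usage sketch for `keypoints_from_joint_uvd` (all values are made up; `transform_preds` is the existing mmpose post-processing helper used inside the function):

import numpy as np

joint_uvd = np.zeros((1, 14, 3), dtype=np.float32)   # u, v, d in [-1, 1]
center = np.array([[320., 240.]], dtype=np.float32)  # bbox center (px)
scale = np.array([[1.5, 1.5]], dtype=np.float32)     # bbox scale
image_size = np.array([[128, 128]])                  # network input size

preds, maxvals = keypoints_from_joint_uvd(joint_uvd, center, scale,
                                          image_size)
# u = v = 0 is the crop center: (u + 1) * 128 / 2 = (64, 64), which
# transform_preds maps back to the bbox center in the original image;
# the depth channel is returned unchanged.
assert preds.shape == (1, 14, 3)
assert np.all(maxvals == 1)
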
12 changes: 9 additions & 3 deletions mmpose/datasets/datasets/base/__init__.py
@@ -6,12 +6,18 @@
from .kpt_2d_sview_rgb_vid_top_down_dataset import \
Kpt2dSviewRgbVidTopDownDataset
from .kpt_3d_mview_rgb_img_direct_dataset import Kpt3dMviewRgbImgDirectDataset
from .kpt_3d_sview_depth_img_top_down_dataset import \
Kpt3dSviewDepthImgTopDownDataset
from .kpt_3d_sview_kpt_2d_dataset import Kpt3dSviewKpt2dDataset
from .kpt_3d_sview_rgb_img_top_down_dataset import \
Kpt3dSviewRgbImgTopDownDataset

__all__ = [
'Kpt3dMviewRgbImgDirectDataset',
'Kpt2dSviewRgbImgTopDownDataset',
'Kpt3dSviewRgbImgTopDownDataset',
'Kpt2dSviewRgbImgBottomUpDataset',
'Kpt3dSviewKpt2dDataset',
'Kpt2dSviewRgbVidTopDownDataset',
'Kpt3dSviewDepthImgTopDownDataset',
]
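
With the export above, downstream dataset implementations can import the new base class directly; a trivial sketch (the `NYUHandDataset` added elsewhere in this PR is expected to be its subclass):

# Now re-exported alongside the other top-down base classes.
from mmpose.datasets.datasets.base import Kpt3dSviewDepthImgTopDownDataset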