diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 17f15a03f..11732c819 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,11 +32,8 @@ jobs:
strategy:
matrix:
python-version: [3.7]
- torch: [1.5.0, 1.6.0, 1.7.0, 1.8.0, 1.9.0]
+ torch: [1.6.0, 1.7.0, 1.8.0, 1.9.0]
include:
- - torch: 1.5.0
- torch_version: 1.5
- torchvision: 0.6.0
- torch: 1.6.0
torch_version: 1.6
torchvision: 0.7.0
@@ -88,7 +85,7 @@ jobs:
run: |
coverage run --branch --source mmselfsup -m pytest tests/
coverage xml
- coverage report -m --omit="mmselfsup/apis/*"
+ coverage report -m
# Only upload coverage report for python3.8 && pytorch1.9.0
- name: Upload coverage to Codecov
if: ${{matrix.torch == '1.9.0' && matrix.python-version == '3.8'}}
diff --git a/.gitignore b/.gitignore
index 0c7c355e1..3c319c23b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,6 +121,7 @@ tensorboard.sh
replace.sh
benchmarks/detection/datasets
benchmarks/detection/output
+INFO
# Pytorch
*.pth
diff --git a/README.md b/README.md
index 7797fc08a..e2f493d8b 100644
--- a/README.md
+++ b/README.md
@@ -66,13 +66,12 @@ This project is released under the [Apache 2.0 license](LICENSE).
## ChangeLog
-MMSelfSup **v0.8.0** was released in 31/03/2022.
+MMSelfSup **v0.9.0** was released on 29/04/2022.
Highlights of the new version:
-* Support **SimMIM**
-* Add **KNN** benchmark, support KNN test with checkpoint and extracted backbone weights
-* Support ImageNet-21k dataset
+* Support **CAE**
+* Support **Barlow Twins**
Please refer to [changelog.md](docs/en/changelog.md) for details and release history.
@@ -97,9 +96,11 @@ Supported algorithms:
- [x] [SwAV (NeurIPS'2020)](https://arxiv.org/abs/2006.09882)
- [x] [DenseCL (CVPR'2021)](https://arxiv.org/abs/2011.09157)
- [x] [SimSiam (CVPR'2021)](https://arxiv.org/abs/2011.10566)
+- [x] [Barlow Twins (ICML'2021)](https://arxiv.org/abs/2103.03230)
- [x] [MoCo v3 (ICCV'2021)](https://arxiv.org/abs/2104.02057)
- [x] [MAE](https://arxiv.org/abs/2111.06377)
- [x] [SimMIM](https://arxiv.org/abs/2111.09886)
+- [x] [CAE](https://arxiv.org/abs/2202.03026)
More algorithms are in our plan.
@@ -121,7 +122,7 @@ More algorithms are in our plan.
## Installation
-MMSelfSup depends on [PyTorch](https://pytorch.org/)], [MMCV](https://github.com/open-mmlab/mmcv) and [MMClassification](https://github.com/open-mmlab/mmclassification).
+MMSelfSup depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv) and [MMClassification](https://github.com/open-mmlab/mmclassification).
Please refer to [install.md](docs/en/install.md) for more detailed instruction.
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 4a47441c5..35631f512 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -64,13 +64,12 @@ MMSelfSup 是一个基于 PyTorch 实现的开源自监督表征学习工具箱
## 更新日志
-最新的 **v0.8.0** 版本已经在 2022.03.31 发布。
+最新的 **v0.9.0** 版本已经在 2022.04.29 发布。
新版本亮点:
-* 支持 **SimMIM**
-* 增加 **KNN** 基准测试,支持中间 checkpoint 和提取的 backbone 权重进行评估
-* 支持 ImageNet-21k 数据集
+* 支持 **CAE**
+* 支持 **Barlow Twins**
请参考 [更新日志](docs/zh_cn/changelog.md) 获取更多细节和历史版本信息。
@@ -96,9 +95,11 @@ MMSelfSup 和 OpenSelfSup 的不同点写在 [对比文档](docs/en/compatibilit
- [x] [SwAV (NeurIPS'2020)](https://arxiv.org/abs/2006.09882)
- [x] [DenseCL (CVPR'2021)](https://arxiv.org/abs/2011.09157)
- [x] [SimSiam (CVPR'2021)](https://arxiv.org/abs/2011.10566)
+- [x] [Barlow Twins (ICML'2021)](https://arxiv.org/abs/2103.03230)
- [x] [MoCo v3 (ICCV'2021)](https://arxiv.org/abs/2104.02057)
- [x] [MAE](https://arxiv.org/abs/2111.06377)
- [x] [SimMIM](https://arxiv.org/abs/2111.09886)
+- [x] [CAE](https://arxiv.org/abs/2202.03026)
更多的算法实现已经在我们的计划中。
@@ -120,7 +121,7 @@ MMSelfSup 和 OpenSelfSup 的不同点写在 [对比文档](docs/en/compatibilit
## 安装
-MMSelfSup 依赖 [PyTorch](https://pytorch.org/)], [MMCV](https://github.com/open-mmlab/mmcv) 和 [MMClassification](https://github.com/open-mmlab/mmclassification).
+MMSelfSup 依赖 [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv) 和 [MMClassification](https://github.com/open-mmlab/mmclassification).
请参考 [安装文档](docs/zh_cn/install.md) 获取更详细的安装指南。
diff --git a/configs/benchmarks/classification/_base_/datasets/imagenet.py b/configs/benchmarks/classification/_base_/datasets/imagenet.py
index a4400aa4c..410397593 100644
--- a/configs/benchmarks/classification/_base_/datasets/imagenet.py
+++ b/configs/benchmarks/classification/_base_/datasets/imagenet.py
@@ -30,8 +30,7 @@
data_source=dict(
type=data_source,
data_prefix='data/imagenet/train',
- ann_file='data/imagenet/meta/train.txt',
- ),
+ ann_file='data/imagenet/meta/train.txt'),
pipeline=train_pipeline,
prefetch=prefetch),
val=dict(
@@ -39,8 +38,7 @@
data_source=dict(
type=data_source,
data_prefix='data/imagenet/val',
- ann_file='data/imagenet/meta/val.txt',
- ),
+ ann_file='data/imagenet/meta/val.txt'),
pipeline=test_pipeline,
prefetch=prefetch))
evaluation = dict(interval=10, topk=(1, 5))
diff --git a/configs/benchmarks/classification/imagenet/resnet50-sobel_8xb32-steplr-100e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50-sobel_linear-8xb32-steplr-100e_in1k.py
similarity index 64%
rename from configs/benchmarks/classification/imagenet/resnet50-sobel_8xb32-steplr-100e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50-sobel_linear-8xb32-steplr-100e_in1k.py
index d0f759291..f8fb3b2df 100644
--- a/configs/benchmarks/classification/imagenet/resnet50-sobel_8xb32-steplr-100e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/resnet50-sobel_linear-8xb32-steplr-100e_in1k.py
@@ -1,4 +1,4 @@
-_base_ = 'resnet50_8xb32-steplr-100e_in1k.py'
+_base_ = 'resnet50_linear-8xb32-steplr-100e_in1k.py'
# model settings
model = dict(with_sobel=True, backbone=dict(in_channels=2, frozen_stages=4))
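For context on `with_sobel=True` with `in_channels=2`: the backbone consumes a two-channel Sobel gradient map instead of RGB. A minimal sketch of that transform in plain PyTorch (an illustration, not the actual MMSelfSup Sobel layer):

```python
import torch
import torch.nn.functional as F

def sobel(gray):
    """Convolve a (N, 1, H, W) grayscale batch with the Sobel x/y
    kernels and return the 2-channel gradient map the backbone sees."""
    gx = torch.tensor([[1., 0., -1.], [2., 0., -2.], [1., 0., -1.]])
    kernel = torch.stack([gx, gx.t()]).unsqueeze(1)  # (2, 1, 3, 3)
    return F.conv2d(gray, kernel, padding=1)         # (N, 2, H, W)
```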
diff --git a/configs/benchmarks/classification/imagenet/resnet50-sobel_mhead_8xb32-steplr-90e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50-sobel_mhead_linear-8xb32-steplr-90e_in1k.py
similarity index 62%
rename from configs/benchmarks/classification/imagenet/resnet50-sobel_mhead_8xb32-steplr-90e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50-sobel_mhead_linear-8xb32-steplr-90e_in1k.py
index 5047ac10c..37a434185 100644
--- a/configs/benchmarks/classification/imagenet/resnet50-sobel_mhead_8xb32-steplr-90e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/resnet50-sobel_mhead_linear-8xb32-steplr-90e_in1k.py
@@ -1,4 +1,4 @@
-_base_ = 'resnet50_mhead_8xb32-steplr-90e_in1k.py'
+_base_ = 'resnet50_mhead_linear-8xb32-steplr-90e_in1k.py'
# model settings
model = dict(with_sobel=True, backbone=dict(in_channels=2, frozen_stages=4))
diff --git a/configs/benchmarks/classification/imagenet/resnet50-nofrz_8xb32-steplr-90e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-90e_in1k.py
similarity index 100%
rename from configs/benchmarks/classification/imagenet/resnet50-nofrz_8xb32-steplr-90e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-90e_in1k.py
diff --git a/configs/benchmarks/classification/imagenet/resnet50_8xb32-coslr-100e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-coslr-100e_in1k.py
similarity index 93%
rename from configs/benchmarks/classification/imagenet/resnet50_8xb32-coslr-100e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-coslr-100e_in1k.py
index c9d4d546a..e63c2bbd2 100644
--- a/configs/benchmarks/classification/imagenet/resnet50_8xb32-coslr-100e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-coslr-100e_in1k.py
@@ -4,10 +4,10 @@
'../_base_/schedules/sgd_coslr-100e.py',
'../_base_/default_runtime.py',
]
+# SwAV linear evaluation setting
model = dict(backbone=dict(frozen_stages=4))
-# swav setting
# runtime settings
# the max_keep_ckpts controls the max number of ckpt file in your work_dirs
# if it is 3, when CheckpointHook (in mmcv) saves the 4th ckpt
diff --git a/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py
similarity index 94%
rename from configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py
index a0bb07ec2..3a7795a05 100644
--- a/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py
@@ -4,12 +4,12 @@
'../_base_/schedules/sgd_steplr-100e.py',
'../_base_/default_runtime.py',
]
+# MoCo v1/v2 linear evaluation setting
model = dict(backbone=dict(frozen_stages=4))
evaluation = dict(interval=1, topk=(1, 5))
-# moco setting
# optimizer
optimizer = dict(type='SGD', lr=30., momentum=0.9, weight_decay=0.)
diff --git a/configs/benchmarks/classification/imagenet/resnet50_8xb512-coslr-90e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50_linear-8xb512-coslr-90e_in1k.py
similarity index 67%
rename from configs/benchmarks/classification/imagenet/resnet50_8xb512-coslr-90e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50_linear-8xb512-coslr-90e_in1k.py
index ea9d8ceea..2742bf133 100644
--- a/configs/benchmarks/classification/imagenet/resnet50_8xb512-coslr-90e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/resnet50_linear-8xb512-coslr-90e_in1k.py
@@ -4,13 +4,17 @@
'../_base_/schedules/lars_coslr-90e.py',
'../_base_/default_runtime.py',
]
+# SimSiam linear evaluation setting
+# According to the SimSiam paper, this setting can also be used to evaluate
+# other methods like SimCLR, MoCo, BYOL and SwAV
model = dict(backbone=dict(frozen_stages=4))
# dataset summary
-data = dict(samples_per_gpu=512) # total 512*8=4096, 8GPU linear cls
+data = dict(
+ samples_per_gpu=512,
+ workers_per_gpu=8) # total 512*8=4096, 8GPU linear cls
-# simsiam setting
# runtime settings
# the max_keep_ckpts controls the max number of ckpt file in your work_dirs
# if it is 3, when CheckpointHook (in mmcv) saves the 4th ckpt
diff --git a/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py b/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py
similarity index 79%
rename from configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py
rename to configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py
index 16cc862d3..c569c7fd8 100644
--- a/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py
+++ b/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py
@@ -4,6 +4,7 @@
'../_base_/schedules/sgd_steplr-100e.py',
'../_base_/default_runtime.py',
]
+# Multi-head linear evaluation setting
model = dict(backbone=dict(frozen_stages=4))
@@ -45,4 +46,8 @@
# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=90)
-checkpoint_config = dict(interval=10)
+
+# the max_keep_ckpts controls the max number of ckpt file in your work_dirs
+# if it is 3, when CheckpointHook (in mmcv) saves the 4th ckpt
+# it will remove the oldest one to keep the number of total ckpts as 3
+checkpoint_config = dict(interval=10, max_keep_ckpts=3)
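For reference, the `max_keep_ckpts` rotation described in the comment above amounts to the following (a minimal sketch of the behavior, not mmcv's actual `CheckpointHook` code):

```python
import os

def rotate_checkpoints(work_dir, epoch, max_keep_ckpts=3):
    """Save epoch_{N}.pth, then drop the oldest checkpoints so that
    at most `max_keep_ckpts` files remain in work_dir."""
    open(os.path.join(work_dir, f'epoch_{epoch}.pth'), 'w').close()  # stand-in for torch.save
    ckpts = sorted(
        (f for f in os.listdir(work_dir)
         if f.startswith('epoch_') and f.endswith('.pth')),
        key=lambda f: int(f[6:-4]))
    for old in ckpts[:-max_keep_ckpts]:
        os.remove(os.path.join(work_dir, old))
```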
diff --git a/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py b/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py
new file mode 100644
index 000000000..d69ea5d0e
--- /dev/null
+++ b/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py
@@ -0,0 +1,34 @@
+_base_ = 'swin-base_ft-8xb256-coslr-100e_in1k.py'
+
+# model
+model = dict(
+ backbone=dict(
+ img_size=224, stage_cfgs=dict(block_cfgs=dict(window_size=7))))
+
+# dataset
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+ dict(
+ type='RandomAug',
+ input_size=224,
+ color_jitter=0.4,
+ auto_augment='rand-m9-mstd0.5-inc1',
+ interpolation='bicubic',
+ re_prob=0.25,
+ re_mode='pixel',
+ re_count=1,
+ mean=(0.485, 0.456, 0.406),
+ std=(0.229, 0.224, 0.225))
+]
+test_pipeline = [
+ dict(type='Resize', size=256, interpolation=3),
+ dict(type='CenterCrop', size=224),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg)
+]
+data = dict(
+ samples_per_gpu=256,
+ drop_last=False,
+ workers_per_gpu=32,
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline))
diff --git a/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py b/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py
new file mode 100644
index 000000000..eab6e459e
--- /dev/null
+++ b/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py
@@ -0,0 +1,38 @@
+_base_ = 'vit-base-p16_ft-8xb128-coslr-100e_in1k.py'
+
+# model
+model = dict(backbone=dict(use_window=True, init_values=0.1))
+
+# optimizer
+optimizer = dict(lr=8e-3)
+
+# learning policy
+lr_config = dict(warmup_iters=5)
+
+# dataset
+img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+train_pipeline = [
+ dict(
+ type='RandomAug',
+ input_size=224,
+ color_jitter=0.4,
+ auto_augment='rand-m9-mstd0.5-inc1',
+ interpolation='bicubic',
+ re_prob=0.25,
+ re_mode='pixel',
+ re_count=1,
+ mean=(0.5, 0.5, 0.5),
+ std=(0.5, 0.5, 0.5))
+]
+test_pipeline = [
+ dict(type='Resize', size=256, interpolation=3),
+ dict(type='CenterCrop', size=224),
+ dict(type='ToTensor'),
+ dict(type='Normalize', **img_norm_cfg)
+]
+data = dict(
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline),
+ samples_per_gpu=128)
+
+find_unused_parameters = True
diff --git a/configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py b/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py
similarity index 100%
rename from configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py
rename to configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py
diff --git a/configs/benchmarks/classification/imagenet/vit-small-p16_8xb128-coslr-90e_in1k.py b/configs/benchmarks/classification/imagenet/vit-small-p16_linear-8xb128-coslr-90e_in1k.py
similarity index 100%
rename from configs/benchmarks/classification/imagenet/vit-small-p16_8xb128-coslr-90e_in1k.py
rename to configs/benchmarks/classification/imagenet/vit-small-p16_linear-8xb128-coslr-90e_in1k.py
diff --git a/configs/selfsup/_base_/datasets/imagenet_cae.py b/configs/selfsup/_base_/datasets/imagenet_cae.py
new file mode 100644
index 000000000..944696c2f
--- /dev/null
+++ b/configs/selfsup/_base_/datasets/imagenet_cae.py
@@ -0,0 +1,40 @@
+# dataset settings
+data_source = 'ImageNet'
+dataset_type = 'SingleViewDataset'
+img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+train_pipeline = [
+ dict(type='RandomHorizontalFlip', p=0.5),
+ dict(
+ type='RandomResizedCropAndInterpolationWithTwoPic',
+ size=224,
+ second_size=112,
+ interpolation='bicubic',
+ second_interpolation='lanczos',
+ scale=(0.08, 1.0)),
+]
+
+# prefetch
+prefetch = False
+if not prefetch:
+ train_pipeline.extend([dict(type='ToTensor')])
+
+train_pipeline.append(
+ dict(
+ type='BEiTMaskGenerator',
+ input_size=(14, 14),
+ num_masking_patches=75,
+ max_num_patches=None,
+ min_num_patches=16))
+
+# dataset summary
+data = dict(
+ samples_per_gpu=256,
+ workers_per_gpu=8,
+ train=dict(
+ type=dataset_type,
+ data_source=dict(
+ type=data_source,
+ data_prefix='data/imagenet/train',
+ ann_file='data/imagenet/meta/train.txt'),
+ pipeline=train_pipeline,
+ prefetch=prefetch))
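For intuition on the `BEiTMaskGenerator` settings above: it masks 75 of the 14x14 = 196 patch positions. A simplified sketch of the idea (uniform random masking; the real generator samples rectangular blocks of min_num_patches to max_num_patches patches instead):

```python
import numpy as np

def random_patch_mask(input_size=(14, 14), num_masking_patches=75, seed=None):
    """Simplified stand-in for BEiTMaskGenerator: mask a fixed number
    of patches on the grid, returning a 0/1 array of shape input_size."""
    rng = np.random.default_rng(seed)
    h, w = input_size
    mask = np.zeros(h * w, dtype=np.int64)
    mask[rng.choice(h * w, size=num_masking_patches, replace=False)] = 1
    return mask.reshape(h, w)
```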
diff --git a/configs/selfsup/_base_/datasets/imagenet_mae.py b/configs/selfsup/_base_/datasets/imagenet_mae.py
index 939fc1039..cd833b5ab 100644
--- a/configs/selfsup/_base_/datasets/imagenet_mae.py
+++ b/configs/selfsup/_base_/datasets/imagenet_mae.py
@@ -17,7 +17,7 @@
# dataset summary
data = dict(
- imgs_per_gpu=128,
+ samples_per_gpu=128,
workers_per_gpu=8,
train=dict(
type=dataset_type,
diff --git a/configs/selfsup/_base_/models/barlowtwins.py b/configs/selfsup/_base_/models/barlowtwins.py
new file mode 100644
index 000000000..577284db1
--- /dev/null
+++ b/configs/selfsup/_base_/models/barlowtwins.py
@@ -0,0 +1,22 @@
+# model settings
+model = dict(
+ type='BarlowTwins',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ in_channels=3,
+ out_indices=[4], # 0: conv-1, x: stage-x
+ norm_cfg=dict(type='SyncBN'),
+ zero_init_residual=True),
+ neck=dict(
+ type='NonLinearNeck',
+ in_channels=2048,
+ hid_channels=8192,
+ out_channels=8192,
+ num_layers=3,
+ with_last_bn=False,
+ with_last_bn_affine=False,
+ with_avg_pool=True,
+ init_cfg=dict(
+ type='Kaiming', distribution='uniform', layer=['Linear'])),
+ head=dict(type='LatentCrossCorrelationHead', in_channels=8192))
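As background for `LatentCrossCorrelationHead`: the Barlow Twins objective pushes the cross-correlation matrix of the two branch embeddings toward the identity. A minimal sketch of that loss (the off-diagonal weight `lambd` is the value from the paper's reference code, not read from this config):

```python
import torch

def barlow_twins_loss(z1, z2, lambd=0.0051):
    """z1, z2: (N, D) projector outputs of the two augmented views.
    Normalize per dimension, build the D x D cross-correlation matrix,
    pull its diagonal to 1 (invariance) and its off-diagonals to 0
    (redundancy reduction)."""
    n = z1.size(0)
    z1 = (z1 - z1.mean(0)) / z1.std(0)
    z2 = (z2 - z2.mean(0)) / z2.std(0)
    c = z1.T @ z2 / n
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
    return on_diag + lambd * off_diag
```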
diff --git a/configs/selfsup/_base_/models/cae.py b/configs/selfsup/_base_/models/cae.py
new file mode 100644
index 000000000..941a56505
--- /dev/null
+++ b/configs/selfsup/_base_/models/cae.py
@@ -0,0 +1,17 @@
+# model settings
+model = dict(
+ type='CAE',
+ backbone=dict(type='CAEViT', arch='b', patch_size=16, init_values=0.1),
+ neck=dict(
+ type='CAENeck',
+ patch_size=16,
+ embed_dims=768,
+ num_heads=12,
+ regressor_depth=4,
+ decoder_depth=4,
+ mlp_ratio=4,
+ init_values=0.1,
+ ),
+ head=dict(
+ type='CAEHead', tokenizer_path='cae_ckpt/dalle_encoder.pth', lambd=2),
+ base_momentum=0.0)
diff --git a/configs/selfsup/_base_/models/simclr.py b/configs/selfsup/_base_/models/simclr.py
index e9f8a9dd9..150f74b5e 100644
--- a/configs/selfsup/_base_/models/simclr.py
+++ b/configs/selfsup/_base_/models/simclr.py
@@ -6,7 +6,8 @@
depth=50,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
- norm_cfg=dict(type='SyncBN')),
+ norm_cfg=dict(type='SyncBN'),
+ zero_init_residual=True),
neck=dict(
type='NonLinearNeck', # SimCLR non-linear neck
in_channels=2048,
diff --git a/configs/selfsup/barlowtwins/README.md b/configs/selfsup/barlowtwins/README.md
new file mode 100644
index 000000000..06e25543c
--- /dev/null
+++ b/configs/selfsup/barlowtwins/README.md
@@ -0,0 +1,52 @@
+# BarlowTwins
+
+> [Barlow Twins: Self-Supervised Learning via Redundancy Reduction](https://arxiv.org/abs/2103.03230)
+
+
+
+## Abstract
+
+Self-supervised learning (SSL) is rapidly closing the gap with supervised methods on large computer vision benchmarks. A successful approach to SSL is to learn embeddings which are invariant to distortions of the input sample. However, a recurring issue with this approach is the existence of trivial constant solutions. Most current methods avoid such solutions by careful implementation details. We propose an objective function that naturally avoids collapse by measuring the cross-correlation matrix between the outputs of two identical networks fed with distorted versions of a sample, and making it as close to the identity matrix as possible. This causes the embedding vectors of distorted versions of a sample to be similar, while minimizing the redundancy between the components of these vectors. The method is called Barlow Twins, owing to neuroscientist H. Barlow's redundancy-reduction principle applied to a pair of identical networks. Barlow Twins does not require large batches nor asymmetry between the network twins such as a predictor network, gradient stopping, or a moving average on the weight updates. Intriguingly it benefits from very high-dimensional output vectors. Barlow Twins outperforms previous methods on ImageNet for semi-supervised classification in the low-data regime, and is on par with current state of the art for ImageNet classification with a linear classifier head, and for transfer tasks of classification and object detection.
+
+
+
+
+
+## Results and Models
+
+**Back to [model_zoo.md](https://github.com/open-mmlab/mmselfsup/blob/master/docs/en/model_zoo.md) to download models.**
+
+On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. If not mentioned, all models are pre-trained on the ImageNet-1k dataset.
+
+### Classification
+
+The classification benchmark includes 1 downstream task dataset, **ImageNet**. If not specified, the results are Top-1 (%).
+
+#### ImageNet Linear Evaluation
+
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
+
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
+
+| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
+| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 15.51 | 33.98 | 45.96 | 61.90 | 71.01 | 71.66 |
+
+#### ImageNet Nearest-Neighbor Classification
+
+The results are obtained from the features after GlobalAveragePooling. Here, k=10 to 200 indicates different numbers of nearest neighbors.
+
+| Self-Supervised Config | k=10 | k=20 | k=100 | k=200 |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---- | ----- | ----- |
+| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 63.6 | 63.8 | 62.7 | 61.9 |
+
+## Citation
+
+```bibtex
+@inproceedings{zbontar2021barlow,
+ title={Barlow twins: Self-supervised learning via redundancy reduction},
+ author={Zbontar, Jure and Jing, Li and Misra, Ishan and LeCun, Yann and Deny, St{\'e}phane},
+ booktitle={International Conference on Machine Learning},
+ year={2021},
+}
+```
diff --git a/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-1000e_in1k.py b/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-1000e_in1k.py
new file mode 100644
index 000000000..5e522a41d
--- /dev/null
+++ b/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-1000e_in1k.py
@@ -0,0 +1,4 @@
+_base_ = 'barlowtwins_resnet50_8xb256-coslr-300e_in1k.py'
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=1000)
diff --git a/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py b/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py
new file mode 100644
index 000000000..e58821d4d
--- /dev/null
+++ b/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py
@@ -0,0 +1,42 @@
+_base_ = [
+ '../_base_/models/barlowtwins.py',
+ '../_base_/datasets/imagenet_byol.py',
+ '../_base_/schedules/lars_coslr-200e_in1k.py',
+ '../_base_/default_runtime.py',
+]
+
+data = dict(samples_per_gpu=256)
+
+# optimizer
+optimizer = dict(
+ type='LARS',
+ lr=1.6,
+ momentum=0.9,
+ weight_decay=1e-6,
+ paramwise_options={
+ '(bn|gn)(\\d+)?.(weight|bias)':
+ dict(weight_decay=0, lr_mult=0.024, lars_exclude=True),
+ 'bias':
+ dict(weight_decay=0, lr_mult=0.024, lars_exclude=True),
+ # bn layer in ResNet block downsample module
+ 'downsample.1':
+ dict(weight_decay=0, lr_mult=0.024, lars_exclude=True),
+ })
+
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ by_epoch=False,
+ min_lr=0.0016,
+ warmup='linear',
+ warmup_iters=10,
+ warmup_ratio=1.6e-4, # cannot be 0
+ warmup_by_epoch=True)
+
+# runtime settings
+# the max_keep_ckpts controls the max number of ckpt file in your work_dirs
+# if it is 3, when CheckpointHook (in mmcv) saves the 4th ckpt
+# it will remove the oldest one to keep the number of total ckpts as 3
+checkpoint_config = dict(interval=10, max_keep_ckpts=3)
+
+runner = dict(type='EpochBasedRunner', max_epochs=300)
diff --git a/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb32-accum8-coslr-100e_in1k.py b/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb32-accum8-coslr-100e_in1k.py
new file mode 100644
index 000000000..172eca7ab
--- /dev/null
+++ b/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb32-accum8-coslr-100e_in1k.py
@@ -0,0 +1,13 @@
+_base_ = 'barlowtwins_resnet50_8xb256-coslr-300e_in1k.py'
+
+data = dict(samples_per_gpu=32)
+
+# additional hooks
+# interval for gradient accumulation, total 8*32*8(interval)=2048
+update_interval = 8
+
+# optimizer
+optimizer_config = dict(update_interval=update_interval)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=100)
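The `update_interval=8` above means the optimizer steps once every 8 iterations, giving the effective batch of 8 GPUs x 32 samples x 8 = 2048. In plain PyTorch the pattern looks roughly like this (a sketch, not the mmcv hook implementation):

```python
def accumulate_and_step(model, loss_fn, optimizer, loader, update_interval=8):
    """Sketch of gradient accumulation: scale each loss by 1/interval
    so gradients average over `update_interval` mini-batches, stepping
    the optimizer only on every interval-th iteration."""
    optimizer.zero_grad()
    for i, (x, y) in enumerate(loader, start=1):
        loss = loss_fn(model(x), y) / update_interval
        loss.backward()  # gradients add up across iterations
        if i % update_interval == 0:
            optimizer.step()
            optimizer.zero_grad()
```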
diff --git a/configs/selfsup/barlowtwins/metafile.yml b/configs/selfsup/barlowtwins/metafile.yml
new file mode 100644
index 000000000..ddb44060d
--- /dev/null
+++ b/configs/selfsup/barlowtwins/metafile.yml
@@ -0,0 +1,28 @@
+Collections:
+ - Name: BarlowTwins
+ Metadata:
+ Training Data: ImageNet-1k
+ Training Techniques:
+ - LARS
+ Training Resources: 8x A100 GPUs
+ Architecture:
+ - ResNet
+ - BarlowTwins
+ Paper:
+ URL: https://arxiv.org/abs/2103.03230
+ Title: "Barlow Twins: Self-Supervised Learning via Redundancy Reduction"
+ README: configs/selfsup/barlowtwins/README.md
+
+Models:
+ - Name: barlowtwins_resnet50_8xb256-coslr-300e_in1k
+ In Collection: BarlowTwins
+ Metadata:
+ Epochs: 300
+ Batch Size: 2048
+ Results:
+ - Task: Self-Supervised Image Classification
+ Dataset: ImageNet-1k
+ Metrics:
+ Top 1 Accuracy: 71.66
+ Config: configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py
+ Weights: https://download.openmmlab.com/mmselfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220419-5ae15f89.pth
diff --git a/configs/selfsup/byol/README.md b/configs/selfsup/byol/README.md
index 988972b2d..9d50bb8d9 100644
--- a/configs/selfsup/byol/README.md
+++ b/configs/selfsup/byol/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/cae/README.md b/configs/selfsup/cae/README.md
new file mode 100644
index 000000000..a79750ece
--- /dev/null
+++ b/configs/selfsup/cae/README.md
@@ -0,0 +1,42 @@
+# CAE
+
+> [Context Autoencoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026)
+
+
+
+## Abstract
+
+We present a novel masked image modeling (MIM) approach, context autoencoder (CAE), for self-supervised learning. We randomly partition the image into two sets: visible patches and masked patches. The CAE architecture consists of: (i) an encoder that takes visible patches as input and outputs their latent representations, (ii) a latent context regressor that predicts the masked patch representations from the visible patch representations that are not updated in this regressor, (iii) a decoder that takes the estimated masked patch representations as input and makes predictions for the masked patches, and (iv) an alignment module that aligns the masked patch representation estimation with the masked patch representations computed from the encoder. In comparison to previous MIM methods that couple the encoding and decoding roles, e.g., using a single module in BEiT, our approach attempts to separate the encoding role (content understanding) from the decoding role (making predictions for masked patches) using different modules, improving the content understanding capability. In addition, our approach makes predictions from the visible patches to the masked patches in the latent representation space that is expected to take on semantics. In addition, we present the explanations about why contrastive pretraining and supervised pretraining perform similarly and why MIM potentially performs better. We demonstrate the effectiveness of our CAE through superior transfer performance in downstream tasks: semantic segmentation, and object detection and instance segmentation.
+
+
+
+
+
+
+## Prerequisite
+
+Create a new folder ``cae_ckpt`` under the root directory and download the
+[weights](https://download.openmmlab.com/mmselfsup/cae/dalle_encoder.pth) for the ``dalle`` encoder to that folder.
+
+## Models and Benchmarks
+
+Here, we report the results of the model, which is pre-trained on ImageNet-1k
+for 300 epochs. The details are below:
+
+
+
+| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
+| :------: | :-------------: | :---------------: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ViT-B/16 | 300 | 83.2 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth) \| [log](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.log.json) |
+
+
+## Citation
+
+```bibtex
+@article{CAE,
+ title={Context Autoencoder for Self-Supervised Representation Learning},
+  author={Xiaokang Chen and Mingyu Ding and Xiaodi Wang and Ying Xin and Shentong Mo and
+  Yunhao Wang and Shumin Han and Ping Luo and Gang Zeng and Jingdong Wang},
+ journal={ArXiv},
+ year={2022}
+}
+```
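To connect the abstract's four modules to the config above, here is a rough sketch of the CAE loss given precomputed pieces (function and argument names are illustrative, not the MMSelfSup API; `lambd=2.0` mirrors the head config):

```python
import torch.nn.functional as F

def cae_loss(z_pred, z_masked_target, logits, tokens, lambd=2.0):
    """z_pred: latents the regressor predicts for masked patches,
    z_masked_target: the encoder's own latents for those patches,
    logits/tokens: decoder predictions vs. DALL-E tokenizer targets.
    Loss = masked-token reconstruction + lambd * latent alignment."""
    reconstruction = F.cross_entropy(logits, tokens)
    alignment = F.mse_loss(z_pred, z_masked_target.detach())
    return reconstruction + lambd * alignment
```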
diff --git a/configs/selfsup/cae/cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k.py b/configs/selfsup/cae/cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k.py
new file mode 100644
index 000000000..e460c0eef
--- /dev/null
+++ b/configs/selfsup/cae/cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k.py
@@ -0,0 +1,4 @@
+_base_ = 'cae_vit-base-p16_32xb64-fp16-coslr-300e_in1k.py'
+
+# dataset
+data = dict(samples_per_gpu=128, workers_per_gpu=8)
diff --git a/configs/selfsup/cae/cae_vit-base-p16_32xb64-fp16-coslr-300e_in1k.py b/configs/selfsup/cae/cae_vit-base-p16_32xb64-fp16-coslr-300e_in1k.py
new file mode 100644
index 000000000..ab8b35c37
--- /dev/null
+++ b/configs/selfsup/cae/cae_vit-base-p16_32xb64-fp16-coslr-300e_in1k.py
@@ -0,0 +1,48 @@
+_base_ = [
+ '../_base_/models/cae.py',
+ '../_base_/datasets/imagenet_cae.py',
+ '../_base_/schedules/adamw_coslr-200e_in1k.py',
+ '../_base_/default_runtime.py',
+]
+
+# dataset
+data = dict(samples_per_gpu=64, workers_per_gpu=8)
+
+# optimizer
+optimizer = dict(
+ lr=1.5e-3,
+ paramwise_options={
+ 'norm': dict(weight_decay=0.),
+ 'bias': dict(weight_decay=0.),
+ 'gamma': dict(weight_decay=0.)
+ },
+ betas=(0.9, 0.999))
+
+# learning policy
+lr_config = dict(
+ policy='StepFixCosineAnnealing',
+ min_lr=1e-5,
+ warmup='linear',
+ warmup_iters=10,
+ warmup_ratio=1e-4,
+ warmup_by_epoch=True,
+ by_epoch=False)
+
+# schedule
+runner = dict(max_epochs=300)
+
+# clip gradient
+optimizer_config = dict(grad_clip=dict(max_norm=3.0))
+
+# mixed precision
+fp16 = dict(loss_scale='dynamic')
+
+# runtime
+checkpoint_config = dict(interval=1, max_keep_ckpts=2, out_dir='')
+persistent_workers = True
+log_config = dict(
+ interval=100, hooks=[
+ dict(type='TextLoggerHook'),
+ ])
+
+find_unused_parameters = True
diff --git a/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py b/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py
new file mode 100644
index 000000000..f98b546a6
--- /dev/null
+++ b/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py
@@ -0,0 +1,4 @@
+_base_ = 'cae_vit-base-p16_16xb128-fp16-coslr-300e_in1k.py'
+
+# dataset
+data = dict(samples_per_gpu=256, workers_per_gpu=8)
diff --git a/configs/selfsup/cae/metafile.yml b/configs/selfsup/cae/metafile.yml
new file mode 100644
index 000000000..627153ef3
--- /dev/null
+++ b/configs/selfsup/cae/metafile.yml
@@ -0,0 +1,27 @@
+Collections:
+ - Name: CAE
+ Metadata:
+ Training Data: ImageNet-1k
+ Training Techniques:
+ - AdamW
+ Training Resources: 8x A100-80G GPUs
+ Architecture:
+ - ViT
+ Paper:
+ URL: https://arxiv.org/abs/2202.03026
+ Title: "Context Autoencoder for Self-Supervised Representation Learning"
+ README: configs/selfsup/cae/README.md
+
+Models:
+ - Name: cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k
+ In Collection: CAE
+ Metadata:
+ Epochs: 300
+ Batch Size: 2048
+ Results:
+ - Task: Self-Supervised Image Classification
+ Dataset: ImageNet-1k
+ Metrics:
+ Top 1 Accuracy: 83.2
+ Config: configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py
+ Weights: https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth
diff --git a/configs/selfsup/deepcluster/README.md b/configs/selfsup/deepcluster/README.md
index f11f32a54..a6a6f907e 100644
--- a/configs/selfsup/deepcluster/README.md
+++ b/configs/selfsup/deepcluster/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/densecl/README.md b/configs/selfsup/densecl/README.md
index ed12476d5..0ff5f4558 100644
--- a/configs/selfsup/densecl/README.md
+++ b/configs/selfsup/densecl/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| -------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/mae/README.md b/configs/selfsup/mae/README.md
index 3831bee9b..230ee30b1 100644
--- a/configs/selfsup/mae/README.md
+++ b/configs/selfsup/mae/README.md
@@ -35,9 +35,9 @@ for 400 epochs, the details are below:
-| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
-| :------: | :-------------: | :---------------: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| ViT-B/16 | 400 | 83.1 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-b-p16_8xb512-coslr-400e_in1k.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-b-p16_ft-8xb128-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k-224_20220223-85be947b.pth) | [log](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-300e_in1k-224_20220210_140925.log.json) |
+| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
+| :------: | :-------------: | :---------------: | :-----------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ViT-B/16 | 400 | 83.1 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-b-p16_8xb512-coslr-400e_in1k.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k-224_20220223-85be947b.pth) \| [log](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-300e_in1k-224_20220210_140925.log.json) |
## Citation
diff --git a/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e-fp16_in1k.py b/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e-fp16_in1k.py
new file mode 100644
index 000000000..cd63e9bcb
--- /dev/null
+++ b/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e-fp16_in1k.py
@@ -0,0 +1,4 @@
+_base_ = 'mae_vit-base-p16_8xb512-coslr-400e_in1k.py'
+
+# mixed precision
+fp16 = dict(loss_scale='dynamic')
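`fp16 = dict(loss_scale='dynamic')` turns on mixed precision with dynamic loss scaling; the equivalent pattern in plain PyTorch (a sketch using `torch.cuda.amp`, not mmcv's fp16 hook internals) is:

```python
import torch

scaler = torch.cuda.amp.GradScaler()  # grows/shrinks the loss scale dynamically

def fp16_step(model, loss_fn, optimizer, x, y):
    """One mixed-precision step: forward in float16 under autocast,
    scale the loss before backward, unscale and step via the scaler."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = loss_fn(model(x), y)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```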
diff --git a/configs/selfsup/mocov2/README.md b/configs/selfsup/mocov2/README.md
index 7014b5a2f..96ce39d75 100644
--- a/configs/selfsup/mocov2/README.md
+++ b/configs/selfsup/mocov2/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| ------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/mocov3/README.md b/configs/selfsup/mocov3/README.md
index e80646d57..ef9043dee 100644
--- a/configs/selfsup/mocov3/README.md
+++ b/configs/selfsup/mocov3/README.md
@@ -26,9 +26,9 @@ The classification benchmarks includes 4 downstream task datasets, **VOC**, **Im
-The **Linear Evaluation** result is obtained by training a linear head upon the pre-trained backbone. Please refer to [vit-small-p16_8xb128-coslr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-small-p16_8xb128-coslr-90e_in1k.py) for details of config.
+The **Linear Evaluation** result is obtained by training a linear head upon the pre-trained backbone. Please refer to [vit-small-p16_linear-8xb128-coslr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-small-p16_linear-8xb128-coslr-90e_in1k.py) for details of config.
-| Self-Supervised Config | Linear Evaluation |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------- |
-| [vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | 73.19 |
+| Self-Supervised Config | Linear Evaluation |
+| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------- |
+| [vit-small-p16_linear-32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_linear-32xb128-fp16-coslr-300e_in1k-224.py) | 73.19 |
## Citation
diff --git a/configs/selfsup/npid/README.md b/configs/selfsup/npid/README.md
index 3e9462b56..d7f561054 100644
--- a/configs/selfsup/npid/README.md
+++ b/configs/selfsup/npid/README.md
@@ -38,9 +38,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| ---------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/odc/README.md b/configs/selfsup/odc/README.md
index dc1d9b597..ba29f8444 100644
--- a/configs/selfsup/odc/README.md
+++ b/configs/selfsup/odc/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| -------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/relative_loc/README.md b/configs/selfsup/relative_loc/README.md
index 9529647ad..5f8dc1c56 100644
--- a/configs/selfsup/relative_loc/README.md
+++ b/configs/selfsup/relative_loc/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/rotation_pred/README.md b/configs/selfsup/rotation_pred/README.md
index 97e9d5a97..ffade3016 100644
--- a/configs/selfsup/rotation_pred/README.md
+++ b/configs/selfsup/rotation_pred/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| -------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/simclr/README.md b/configs/selfsup/simclr/README.md
index 93be60128..90b857ca5 100644
--- a/configs/selfsup/simclr/README.md
+++ b/configs/selfsup/simclr/README.md
@@ -34,13 +34,14 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** don't use GlobalAveragePooling; the feature map is pooled to the specific dimensions and then fed to a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-steplr-100e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb512-coslr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb512-coslr-90e_in1k.py) for details of config.
-| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
-| ------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | -------- | -------- | -------- | -------- | ------- |
-| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | 14.43 | 30.97 | 41.02 | 53.92 | 61.24 | 57.28 |
+| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
+| [resnet50_8xb32-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | 16.29 | 31.11 | 39.99 | 55.06 | 62.91 | 62.56 |
+| [resnet50_16xb256-coslr-200e](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py) | 15.44 | 31.47 | 41.83 | 59.44 | 66.41 | 66.66 |
#### Places205 Linear Evaluation
diff --git a/configs/selfsup/simclr/metafile.yml b/configs/selfsup/simclr/metafile.yml
index 914b692d5..e13397137 100644
--- a/configs/selfsup/simclr/metafile.yml
+++ b/configs/selfsup/simclr/metafile.yml
@@ -4,7 +4,7 @@ Collections:
Training Data: ImageNet-1k
Training Techniques:
- LARS
- Training Resources: 8x V100 GPUs
+ Training Resources: 8x V100 GPUs (b256), 16x A100-80G GPUs (b4096)
Architecture:
- ResNet
- SimCLR
@@ -23,6 +23,18 @@ Models:
- Task: Self-Supervised Image Classification
Dataset: ImageNet-1k
Metrics:
- Top 1 Accuracy: 57.28
+ Top 1 Accuracy: 62.56
Config: configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py
- Weights: https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k_20220225-97d2abef.pth
+ Weights: https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k_20220428-46ef6bb9.pth
+ - Name: simclr_resnet50_16xb256-coslr-200e_in1k
+ In Collection: SimCLR
+ Metadata:
+ Epochs: 200
+ Batch Size: 4096
+ Results:
+ - Task: Self-Supervised Image Classification
+ Dataset: ImageNet-1k
+ Metrics:
+ Top 1 Accuracy: 66.66
+ Config: configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py
+ Weights: https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k_20220428-8c24b063.pth
diff --git a/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py b/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py
new file mode 100644
index 000000000..476fe85a0
--- /dev/null
+++ b/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py
@@ -0,0 +1,7 @@
+_base_ = 'simclr_resnet50_8xb32-coslr-200e_in1k.py'
+
+# optimizer
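+# lr follows the linear scaling rule (lr = 0.3 * total_batch / 256): 0.3 * 4096 / 256 = 4.8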
+optimizer = dict(lr=4.8)
+
+# dataset summary
+data = dict(samples_per_gpu=256, workers_per_gpu=8) # total 256*16
diff --git a/configs/selfsup/simclr/simclr_resnet50_8xb64-coslr-200e_in1k.py b/configs/selfsup/simclr/simclr_resnet50_8xb64-coslr-200e_in1k.py
index f6dff5572..054c62a11 100644
--- a/configs/selfsup/simclr/simclr_resnet50_8xb64-coslr-200e_in1k.py
+++ b/configs/selfsup/simclr/simclr_resnet50_8xb64-coslr-200e_in1k.py
@@ -1,4 +1,7 @@
_base_ = 'simclr_resnet50_8xb32-coslr-200e_in1k.py'
+# optimizer
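+# lr follows the linear scaling rule (lr = 0.3 * total_batch / 256): 0.3 * 512 / 256 = 0.6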
+optimizer = dict(lr=0.6)
+
# dataset summary
data = dict(samples_per_gpu=64) # total 64*8
diff --git a/configs/selfsup/simmim/README.md b/configs/selfsup/simmim/README.md
index d1f3f0924..2ed7aa563 100644
--- a/configs/selfsup/simmim/README.md
+++ b/configs/selfsup/simmim/README.md
@@ -19,9 +19,10 @@ Here, we report the results of the model, and more results will be coming soon.
-| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
-| :------: | :-------------: | :---------------: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| Swin-Base | 100 | 82.9 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) | [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) |
+| Backbone | Pre-train epoch | Pre-train resolution | Fine-tuning resolution | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
+| :------: | :-------------: | :------------------: | :--------------------: | :---------------: | :--------------: | :----------------: | :------: |
+| Swin-Base | 100 | 192 | 192 | 82.9 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) | [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) |
+| Swin-Base | 100 | 192 | 224 | 83.5 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) | [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) |
## Citation
diff --git a/configs/selfsup/simsiam/README.md b/configs/selfsup/simsiam/README.md
index 2d4f9356e..a710a0721 100644
--- a/configs/selfsup/simsiam/README.md
+++ b/configs/selfsup/simsiam/README.md
@@ -35,9 +35,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** results don't use GlobalAveragePooling; each feature map is pooled to a fixed dimension and then followed by a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb512-coslr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb512-coslr-90e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb512-coslr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb512-coslr-90e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| -------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/configs/selfsup/swav/README.md b/configs/selfsup/swav/README.md
index bf289d99d..51afc1074 100644
--- a/configs/selfsup/swav/README.md
+++ b/configs/selfsup/swav/README.md
@@ -34,9 +34,9 @@ Besides, k=1 to 96 indicates the hyper-parameter of Low-shot SVM.
#### ImageNet Linear Evaluation
-The **Feature1 - Feature5** don't have the GlobalAveragePooling, the feature map is pooled to the specific dimensions and then follows a Linear layer to do the classification. Please refer to [resnet50_mhead_8xb32-steplr-90e.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_8xb32-steplr-90e_in1k.py) for details of config.
+The **Feature1 - Feature5** results don't use GlobalAveragePooling; each feature map is pooled to a fixed dimension and then followed by a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
-The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_8xb32-coslr-100e_in1k.py) for details of config.
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-coslr-100e_in1k.py) for details of config.
| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
diff --git a/demo/mmselfsup_colab_tutorial.ipynb b/demo/mmselfsup_colab_tutorial.ipynb
index 0fcb6fa2f..75af94202 100644
--- a/demo/mmselfsup_colab_tutorial.ipynb
+++ b/demo/mmselfsup_colab_tutorial.ipynb
@@ -1547,7 +1547,7 @@
"source": [
"# Load the basic config file\n",
"from mmcv import Config\n",
- "benchmark_cfg = Config.fromfile('configs/benchmarks/classification/imagenet/resnet50_8xb32-steplr-100e_in1k.py')\n",
+ "benchmark_cfg = Config.fromfile('configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py')\n",
"\n",
"# Modify the model\n",
"checkpoint_file = 'work_dirs/selfsup/relative-loc_resnet50_8xb64-steplr-70e_in1k_colab/relative-loc_backbone-weights.pth'\n",
diff --git a/docs/en/algorithms/barlowtwins.md b/docs/en/algorithms/barlowtwins.md
new file mode 100644
index 000000000..06e25543c
--- /dev/null
+++ b/docs/en/algorithms/barlowtwins.md
@@ -0,0 +1,52 @@
+# BarlowTwins
+
+> [Barlow Twins: Self-Supervised Learning via Redundancy Reduction](https://arxiv.org/abs/2103.03230)
+
+
+
+## Abstract
+
+Self-supervised learning (SSL) is rapidly closing the gap with supervised methods on large computer vision benchmarks. A successful approach to SSL is to learn embeddings which are invariant to distortions of the input sample. However, a recurring issue with this approach is the existence of trivial constant solutions. Most current methods avoid such solutions by careful implementation details. We propose an objective function that naturally avoids collapse by measuring the cross-correlation matrix between the outputs of two identical networks fed with distorted versions of a sample, and making it as close to the identity matrix as possible. This causes the embedding vectors of distorted versions of a sample to be similar, while minimizing the redundancy between the components of these vectors. The method is called Barlow Twins, owing to neuroscientist H. Barlow's redundancy-reduction principle applied to a pair of identical networks. Barlow Twins does not require large batches nor asymmetry between the network twins such as a predictor network, gradient stopping, or a moving average on the weight updates. Intriguingly it benefits from very high-dimensional output vectors. Barlow Twins outperforms previous methods on ImageNet for semi-supervised classification in the low-data regime, and is on par with current state of the art for ImageNet classification with a linear classifier head, and for transfer tasks of classification and object detection.
+
+
+
+
+
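+The objective described above can be sketched in a few lines of PyTorch. This is a minimal illustration rather than the MMSelfSup implementation; `lambd` follows the paper's λ ≈ 5e-3:
+
+```python
+import torch
+
+def barlow_twins_loss(z1, z2, lambd=5e-3):
+    """z1, z2: (N, D) projector outputs for two distorted views of a batch."""
+    n, d = z1.shape
+    z1 = (z1 - z1.mean(0)) / z1.std(0)  # standardize each dimension over the batch
+    z2 = (z2 - z2.mean(0)) / z2.std(0)
+    c = z1.T @ z2 / n                   # (D, D) cross-correlation matrix
+    on_diag = (torch.diagonal(c) - 1).pow(2).sum()               # pull diagonal to 1
+    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # push the rest to 0
+    return on_diag + lambd * off_diag
+```
+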
+## Results and Models
+
+**Back to [model_zoo.md](https://github.com/open-mmlab/mmselfsup/blob/master/docs/en/model_zoo.md) to download models.**
+
+On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. If not mentioned, all models are pre-trained on the ImageNet-1k dataset.
+
+### Classification
+
+The classification benchmark includes 1 downstream dataset, **ImageNet**. If not specified, the results are Top-1 (%).
+
+#### ImageNet Linear Evaluation
+
+The **Feature1 - Feature5** results don't use GlobalAveragePooling; each feature map is pooled to a fixed dimension and then followed by a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
+
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
+
+| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
+| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 15.51 | 33.98 | 45.96 | 61.90 | 71.01 | 71.66 |
+
+#### ImageNet Nearest-Neighbor Classification
+
+The results are obtained from the features after GlobalAveragePooling. Here, k=10 to 200 indicates different numbers of nearest neighbors.
+
+| Self-Supervised Config | k=10 | k=20 | k=100 | k=200 |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---- | ----- | ----- |
+| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 63.6 | 63.8 | 62.7 | 61.9 |
+
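+As a toy illustration of this protocol (a plain majority vote over cosine similarity; the benchmark's own implementation may weight neighbors differently):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def knn_predict(train_feats, train_labels, test_feats, k=20):
+    train = F.normalize(train_feats, dim=1)      # GAP features, L2-normalized
+    test = F.normalize(test_feats, dim=1)
+    sim = test @ train.T                         # (n_test, n_train) cosine similarity
+    idx = sim.topk(k, dim=1).indices             # the k nearest training samples
+    return train_labels[idx].mode(dim=1).values  # majority vote per test sample
+```
+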
+## Citation
+
+```bibtex
+@inproceedings{zbontar2021barlow,
+ title={Barlow twins: Self-supervised learning via redundancy reduction},
+ author={Zbontar, Jure and Jing, Li and Misra, Ishan and LeCun, Yann and Deny, St{\'e}phane},
+ booktitle={International Conference on Machine Learning},
+ year={2021},
+}
+```
diff --git a/docs/en/algorithms/cae.md b/docs/en/algorithms/cae.md
new file mode 100644
index 000000000..a79750ece
--- /dev/null
+++ b/docs/en/algorithms/cae.md
@@ -0,0 +1,42 @@
+# CAE
+
+> [Context Autoencoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026)
+
+
+
+## Abstract
+
+We present a novel masked image modeling (MIM) approach, context autoencoder (CAE), for self-supervised learning. We randomly partition the image into two sets: visible patches and masked patches. The CAE architecture consists of: (i) an encoder that takes visible patches as input and outputs their latent representations, (ii) a latent context regressor that predicts the masked patch representations from the visible patch representations that are not updated in this regressor, (iii) a decoder that takes the estimated masked patch representations as input and makes predictions for the masked patches, and (iv) an alignment module that aligns the masked patch representation estimation with the masked patch representations computed from the encoder. In comparison to previous MIM methods that couple the encoding and decoding roles, e.g., using a single module in BEiT, our approach attempts to separate the encoding role (content understanding) from the decoding role (making predictions for masked patches) using different modules, improving the content understanding capability. In addition, our approach makes predictions from the visible patches to the masked patches in the latent representation space that is expected to take on semantics. In addition, we present the explanations about why contrastive pretraining and supervised pretraining perform similarly and why MIM potentially performs better. We demonstrate the effectiveness of our CAE through superior transfer performance in downstream tasks: semantic segmentation, and object detection and instance segmentation.
+
+
+
+
+
+
+## Prerequisite
+
+Create a new folder ``cae_ckpt`` under the root directory and download the
+[weights](https://download.openmmlab.com/mmselfsup/cae/dalle_encoder.pth) for the ``dalle`` encoder to that folder.
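+
+A minimal sketch of these steps using `torch.hub`'s file downloader (any download method works; this assumes the file keeps the name from the URL):
+
+```python
+import os
+import torch
+
+# create cae_ckpt/ under the repository root and fetch the dalle encoder weights
+os.makedirs('cae_ckpt', exist_ok=True)
+torch.hub.download_url_to_file(
+    'https://download.openmmlab.com/mmselfsup/cae/dalle_encoder.pth',
+    'cae_ckpt/dalle_encoder.pth')
+```
+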
+## Models and Benchmarks
+
+Here, we report the results of the model, which is pre-trained on ImageNet-1k
+for 300 epochs. The details are below:
+
+
+
+| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
+| :------: | :-------------: | :---------------: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ViT-B/16 | 300 | 83.2 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth) | [log](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.log.json) |
+
+
+## Citation
+
+```bibtex
+@article{CAE,
+ title={Context Autoencoder for Self-Supervised Representation Learning},
+  author={Xiaokang Chen and Mingyu Ding and Xiaodi Wang and Ying Xin and Shentong Mo and Yunhao Wang and Shumin Han and Ping Luo and Gang Zeng and Jingdong Wang},
+ journal={ArXiv},
+ year={2022}
+}
+```
diff --git a/docs/en/changelog.md b/docs/en/changelog.md
index 531ad06e8..748424e8a 100644
--- a/docs/en/changelog.md
+++ b/docs/en/changelog.md
@@ -2,6 +2,34 @@
## MMSelfSup
+### v0.9.0 (29/04/2022)
+
+#### Highlight
+* Support **CAE** ([#284](https://github.com/open-mmlab/mmselfsup/pull/284))
+* Support **Barlow Twins** ([#207](https://github.com/open-mmlab/mmselfsup/pull/207))
+
+#### New Features
+* Support CAE ([#284](https://github.com/open-mmlab/mmselfsup/pull/284))
+* Support Barlow Twins ([#207](https://github.com/open-mmlab/mmselfsup/pull/207))
+* Add SimMIM 192 pretrain and 224 fine-tuning results ([#280](https://github.com/open-mmlab/mmselfsup/pull/280))
+* Add MAE pretrain with fp16 ([#271](https://github.com/open-mmlab/mmselfsup/pull/271))
+
+#### Bug Fixes
+* Fix args error ([#290](https://github.com/open-mmlab/mmselfsup/pull/290))
+* Change imgs_per_gpu to samples_per_gpu in MAE config ([#278](https://github.com/open-mmlab/mmselfsup/pull/278))
+* Avoid GPU memory leak with prefetch dataloader ([#277](https://github.com/open-mmlab/mmselfsup/pull/277))
+* Fix key error bug when registering custom hooks ([#273](https://github.com/open-mmlab/mmselfsup/pull/273))
+
+#### Improvements
+* Update SimCLR models and results ([#295](https://github.com/open-mmlab/mmselfsup/pull/295))
+* Reduce memory usage while running unit test ([#291](https://github.com/open-mmlab/mmselfsup/pull/291))
+* Remove pytorch1.5 test ([#288](https://github.com/open-mmlab/mmselfsup/pull/288))
+* Rename linear probing config file names ([#281](https://github.com/open-mmlab/mmselfsup/pull/281))
+* Add unit test for apis ([#276](https://github.com/open-mmlab/mmselfsup/pull/276))
+
+#### Docs
+* Fix SimMIM config link, and add SimMIM to model_zoo ([#272](https://github.com/open-mmlab/mmselfsup/pull/272))
+
### v0.8.0 (31/03/2022)
#### Highlight
diff --git a/docs/en/index.rst b/docs/en/index.rst
index 394b98cce..7bd861a7b 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -43,6 +43,8 @@ Welcome to MMSelfSup's documentation!
algorithms/mocov3.md
algorithms/mae.md
algorithms/simmim.md
+ algorithms/barlowtwins.md
+ algorithms/cae.md
.. toctree::
diff --git a/docs/en/install.md b/docs/en/install.md
index 0fa627aa9..49807ce4e 100644
--- a/docs/en/install.md
+++ b/docs/en/install.md
@@ -7,8 +7,8 @@
- PyTorch 1.5+
- CUDA 9.2+
- GCC 5+
-- [mmcv](https://github.com/open-mmlab/mmcv) 1.3.16+
-- [mmcls](https://mmclassification.readthedocs.io/en/latest/install.html) 0.19.0+
+- [mmcv](https://github.com/open-mmlab/mmcv) 1.4.2+
+- [mmcls](https://mmclassification.readthedocs.io/en/latest/install.html) 0.21.0+
- [mmdet](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 2.16.0+
- [mmseg](https://mmsegmentation.readthedocs.io/en/latest/get_started.html#installation) 0.20.2+
@@ -16,7 +16,8 @@ Compatible MMCV, MMClassification, MMDetection and MMSegmentation versions are s
| MMSelfSup version | MMCV version | MMClassification version | MMSegmentation version | MMDetection version |
| :---------------: | :-----------------: | :------------------------: | :--------------------: | :-----------------: |
-| 0.8.0 (master) | mmcv-full >= 1.3.16 | mmcls >= 0.21.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
+| 0.9.0 (master) | mmcv-full >= 1.4.2 | mmcls >= 0.21.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
+| 0.8.0 | mmcv-full >= 1.4.2 | mmcls >= 0.21.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
| 0.7.1 | mmcv-full >= 1.3.16 | mmcls >= 0.19.0, <= 0.20.1 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
| 0.6.0 | mmcv-full >= 1.3.16 | mmcls >= 0.19.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
| 0.5.0 | mmcv-full >= 1.3.16 | / | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md
index 96047bdb1..85a1cf36c 100644
--- a/docs/en/model_zoo.md
+++ b/docs/en/model_zoo.md
@@ -6,21 +6,25 @@ All models and part of benchmark results are recorded below.
| Algorithm | Config | Download |
| ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [BYOL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/README.md) | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220225-5c8b2c2e.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220214_115709.log.json) |
-| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220225-a0daa54a.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220210_095852.log.json) |
+| [Relative Location](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/README.md) | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220225-84784688.pth) | [log](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220211_124808.log.json) |
+| [Rotation Prediction](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/README.md) | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220225-5b9f06a0.pth) | [log](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220215_185303.log.json) |
| [DeepCluster](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/README.md) | [deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k-bb8681e2.pth) |
-| [DenseCL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/README.md) | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220225-8c7808fe.pth) | [log](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220215_041207.log.json) |
-| [MoCo v2](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/README.md) | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220225-89e03af4.pth) | [log](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220210_110905.log.json) |
| [NPID](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/README.md) | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k_20220225-5fbbda2a.pth) | [log](https://download.openmmlab.com/mmselfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k_20220215_185513.log.json) |
| [ODC](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/README.md) | [odc_resnet50_8xb64-steplr-440e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k_20220225-a755d9c0.pth) | [log](https://download.openmmlab.com/mmselfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k_20220215_235245.log.json) |
-| [Relative Location](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/README.md) | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220225-84784688.pth) | [log](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220211_124808.log.json) |
-| [Rotation Prediction](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/README.md) | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220225-5b9f06a0.pth) | [log](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220215_185303.log.json) |
-| [SimCLR](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/README.md) | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | [model](simclr_resnet50_8xb32-coslr-200e_in1k_20220225-97d2abef.pth) | [log](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb64-coslr-200e_in1k_20220210_191629.log.json) |
+| [SimCLR](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/README.md) | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k_20220428-46ef6bb9.pth) | [log](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k_20220411_182427.log.json) |
+| | [simclr_resnet50_16xb256-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k_20220428-8c24b063.pth) | [log](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k_20220423_205520.log.json) |
+| [MoCo v2](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/README.md) | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220225-89e03af4.pth) | [log](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220210_110905.log.json) |
+| [BYOL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/README.md) | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220225-5c8b2c2e.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220214_115709.log.json) |
+| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220225-a0daa54a.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220210_095852.log.json) |
+| [SwAV](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/README.md) | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | [model](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220225-0497dd5d.pth) | [log](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220211_061131.log.json) |
+| [DenseCL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/README.md) | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220225-8c7808fe.pth) | [log](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220215_041207.log.json) |
| [SimSiam](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/README.md) | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k_20220225-68a88ad8.pth) | [log](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k_20220210_195405.log.json) |
| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k_20220225-2f488143.pth) | [log](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k_20220210_195402.log.json) |
-| [SwAV](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/README.md) | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | [model](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220225-0497dd5d.pth) | [log](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220211_061131.log.json) |
+| [BarlowTwins](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/README.md) | [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220419-5ae15f89.pth) | [log](https://download.openmmlab.com/mmselfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220413_111555.log.json) |
| [MoCo v3](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/README.md) | [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | [model](https://download.openmmlab.com/mmselfsup/moco/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224_20220225-e31238dd.pth) | [log](https://download.openmmlab.com/mmselfsup/moco/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224_20220222_160222.log.json) |
| [MAE](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/README.md) | [mae_vit-base-p16_8xb512-coslr-400e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k-224_20220223-85be947b.pth) | [log](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-300e_in1k-224_20220210_140925.log.json) |
+| [SimMIM](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/README.md) | [simmim_swin-base_16xb128-coslr-100e_in1k-192](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) | [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) |
+| [CAE](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/README.md) | [cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth) | [log](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.log.json) |
Remarks:
@@ -36,27 +40,31 @@ In the following tables, we only display ImageNet linear evaluation, ImageNet fi
If not specified, we use linear evaluation setting from [MoCo](http://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf) as default. Other settings are mentioned in Remarks.
-| Algorithm | Config | Remarks | Top-1 (%) |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------- | --------- |
-| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | | 67.55 |
-| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | | 68.55 |
-| DeepCluster | [deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py) | | 46.92 |
-| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | 63.62 |
-| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | | 67.58 |
-| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | | 58.97 |
-| ODC | [odc_resnet50_8xb64-steplr-440e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py) | | 53.43 |
-| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | | 38.78 |
-| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | | 48.12 |
-| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | | 57.28 |
-| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | SimSiam paper setting | 68.28 |
-| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | SimSiam paper setting | 69.84 |
-| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | SwAV paper setting | 70.47 |
-| MoCo v3 | [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | MoCo v3 paper setting | 73.19 |
+| Algorithm | Config | Remarks | Top-1 (%) |
+| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------- | --------- |
+| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | | 38.78 |
+| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | | 48.12 |
+| DeepCluster | [deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py) | | 46.92 |
+| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | | 58.97 |
+| ODC | [odc_resnet50_8xb64-steplr-440e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py) | | 53.43 |
+| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | SimSiam paper setting | 62.56 |
+| | [simclr_resnet50_16xb256-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py) | SimSiam paper setting | 66.66 |
+| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | | 67.58 |
+| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | | 67.55 |
+| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | | 68.55 |
+| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | SwAV paper setting | 70.47 |
+| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | 63.62 |
+| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | SimSiam paper setting | 68.28 |
+| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | SimSiam paper setting | 69.84 |
+| Barlow Twins | [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | Barlow Twins paper setting | 71.66 |
+| MoCo v3 | [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | MoCo v3 paper setting | 73.19 |
### ImageNet Fine-tuning
-| Algorithm | Config | Remarks | Top-1 (%) |
-| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | --------- |
-| MAE | [mae_vit-base-p16_8xb512-coslr-400e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k.py) | | 83.1 |
+| Algorithm | Config | Remarks | Top-1 (%) |
+| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | --------- |
+| MAE | [mae_vit-base-p16_8xb512-coslr-400e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k.py) | | 83.1 |
+| SimMIM | [simmim_swin-base_16xb128-coslr-100e_in1k-192](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | | 82.9 |
+| CAE | [cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | | 83.2 |
### COCO17 Object Detection and Instance Segmentation
@@ -64,16 +72,15 @@ In COCO17 object detection and instance segmentation task, we choose the evaluat
| Algorithm | Config | mAP (Box) | mAP (Mask) |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ---------- |
-| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 40.9 | 36.8 |
-| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | |
-| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 40.2 | 36.1 |
-| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 38.5 | 34.6 |
| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | 37.5 | 33.7 |
| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | 37.9 | 34.2 |
+| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 38.5 | 34.6 |
| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | 38.7 | 34.9 |
+| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 40.2 | 36.1 |
+| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 40.9 | 36.8 |
+| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 40.2 | 36.3 |
| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | 38.6 | 34.6 |
| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | 38.8 | 34.9 |
-| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 40.2 | 36.3 |
### Pascal VOC12 Aug Semantic Segmentation
@@ -81,13 +88,13 @@ In Pascal VOC12 Aug semantic segmentation task, we choose the evaluation protoco
| Algorithm | Config | mIOU |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
-| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 67.16 |
-| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 69.47 |
-| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 67.55 |
-| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 65.45 |
| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | 63.49 |
| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | 64.31 |
+| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 65.45 |
| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | 64.03 |
+| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 67.55 |
+| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 67.16 |
+| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 63.73 |
+| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 69.47 |
| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | 48.35 |
| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | 46.27 |
-| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 63.73 |
diff --git a/docs/zh_cn/algorithms/barlowtwins.md b/docs/zh_cn/algorithms/barlowtwins.md
new file mode 100644
index 000000000..06e25543c
--- /dev/null
+++ b/docs/zh_cn/algorithms/barlowtwins.md
@@ -0,0 +1,52 @@
+# BarlowTwins
+
+> [Barlow Twins: Self-Supervised Learning via Redundancy Reduction](https://arxiv.org/abs/2103.03230)
+
+
+
+## Abstract
+
+Self-supervised learning (SSL) is rapidly closing the gap with supervised methods on large computer vision benchmarks. A successful approach to SSL is to learn embeddings which are invariant to distortions of the input sample. However, a recurring issue with this approach is the existence of trivial constant solutions. Most current methods avoid such solutions by careful implementation details. We propose an objective function that naturally avoids collapse by measuring the cross-correlation matrix between the outputs of two identical networks fed with distorted versions of a sample, and making it as close to the identity matrix as possible. This causes the embedding vectors of distorted versions of a sample to be similar, while minimizing the redundancy between the components of these vectors. The method is called Barlow Twins, owing to neuroscientist H. Barlow's redundancy-reduction principle applied to a pair of identical networks. Barlow Twins does not require large batches nor asymmetry between the network twins such as a predictor network, gradient stopping, or a moving average on the weight updates. Intriguingly it benefits from very high-dimensional output vectors. Barlow Twins outperforms previous methods on ImageNet for semi-supervised classification in the low-data regime, and is on par with current state of the art for ImageNet classification with a linear classifier head, and for transfer tasks of classification and object detection.
+
+
+
+
+
+## Results and Models
+
+**Back to [model_zoo.md](https://github.com/open-mmlab/mmselfsup/blob/master/docs/en/model_zoo.md) to download models.**
+
+On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. If not mentioned, all models are pre-trained on the ImageNet-1k dataset.
+
+### Classification
+
+The classification benchmark includes 1 downstream dataset, **ImageNet**. If not specified, the results are Top-1 (%).
+
+#### ImageNet Linear Evaluation
+
+The **Feature1 - Feature5** results don't use GlobalAveragePooling; each feature map is pooled to a fixed dimension and then followed by a Linear layer for classification. Please refer to [resnet50_mhead_linear-8xb32-steplr-90e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_mhead_linear-8xb32-steplr-90e_in1k.py) for details of config.
+
+The **AvgPool** result is obtained from Linear Evaluation with GlobalAveragePooling. Please refer to [resnet50_linear-8xb32-steplr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/resnet50_linear-8xb32-steplr-100e_in1k.py) for details of config.
+
+| Self-Supervised Config | Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | AvgPool |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | -------- | ------- |
+| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 15.51 | 33.98 | 45.96 | 61.90 | 71.01 | 71.66 |
+
+#### ImageNet Nearest-Neighbor Classification
+
+The results are obtained from the features after GlobalAveragePooling. Here, k=10 to 200 indicates different numbers of nearest neighbors.
+
+| Self-Supervised Config | k=10 | k=20 | k=100 | k=200 |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---- | ----- | ----- |
+| [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | 63.6 | 63.8 | 62.7 | 61.9 |
+
+## Citation
+
+```bibtex
+@inproceedings{zbontar2021barlow,
+ title={Barlow twins: Self-supervised learning via redundancy reduction},
+ author={Zbontar, Jure and Jing, Li and Misra, Ishan and LeCun, Yann and Deny, St{\'e}phane},
+ booktitle={International Conference on Machine Learning},
+ year={2021},
+}
+```
diff --git a/docs/zh_cn/algorithms/cae.md b/docs/zh_cn/algorithms/cae.md
new file mode 100644
index 000000000..a79750ece
--- /dev/null
+++ b/docs/zh_cn/algorithms/cae.md
@@ -0,0 +1,42 @@
+# CAE
+
+> [Context Autoencoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026)
+
+
+
+## Abstract
+
+We present a novel masked image modeling (MIM) approach, context autoencoder (CAE), for self-supervised learning. We randomly partition the image into two sets: visible patches and masked patches. The CAE architecture consists of: (i) an encoder that takes visible patches as input and outputs their latent representations, (ii) a latent context regressor that predicts the masked patch representations from the visible patch representations that are not updated in this regressor, (iii) a decoder that takes the estimated masked patch representations as input and makes predictions for the masked patches, and (iv) an alignment module that aligns the masked patch representation estimation with the masked patch representations computed from the encoder. In comparison to previous MIM methods that couple the encoding and decoding roles, e.g., using a single module in BEiT, our approach attempts to separate the encoding role (content understanding) from the decoding role (making predictions for masked patches) using different modules, improving the content understanding capability. In addition, our approach makes predictions from the visible patches to the masked patches in the latent representation space that is expected to take on semantics. In addition, we present the explanations about why contrastive pretraining and supervised pretraining perform similarly and why MIM potentially performs better. We demonstrate the effectiveness of our CAE through superior transfer performance in downstream tasks: semantic segmentation, and object detection and instance segmentation.
+
+
+
+
+
+
+## Prerequisite
+
+Create a new folder ``cae_ckpt`` under the root directory and download the
+[weights](https://download.openmmlab.com/mmselfsup/cae/dalle_encoder.pth) for the ``dalle`` encoder to that folder.
+
+## Models and Benchmarks
+
+Here, we report the results of the model, which is pre-trained on ImageNet-1k
+for 300 epochs. The details are below:
+
+
+
+| Backbone | Pre-train epoch | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download |
+| :------: | :-------------: | :---------------: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ViT-B/16 | 300 | 83.2 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/vit-base-p16_ft-8xb128-coslr-100e-rpe_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth) | [log](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.log.json) |
+
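+To reuse the pre-trained weights elsewhere, note that OpenMMLab checkpoints keep the parameters under the ``state_dict`` key, with backbone parameters prefixed by ``backbone.``. A minimal sketch for extracting the backbone weights from the checkpoint above (the prefix convention is an assumption carried over from other MMSelfSup models):
+
+```python
+import torch
+
+ckpt = torch.load(
+    'cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth',
+    map_location='cpu')
+state_dict = ckpt['state_dict']
+# keep only backbone weights, stripping the 'backbone.' prefix
+backbone = {k[len('backbone.'):]: v
+            for k, v in state_dict.items() if k.startswith('backbone.')}
+print(len(backbone), 'backbone tensors')
+```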
+
+## Citation
+
+```bibtex
+@article{CAE,
+ title={Context Autoencoder for Self-Supervised Representation Learning},
+  author={Xiaokang Chen and Mingyu Ding and Xiaodi Wang and Ying Xin and
+          Shentong Mo and Yunhao Wang and Shumin Han and Ping Luo and
+          Gang Zeng and Jingdong Wang},
+ journal={ArXiv},
+ year={2022}
+}
+```
diff --git a/docs/zh_cn/changelog.md b/docs/zh_cn/changelog.md
index b9aa60ecd..9e8f90d58 100644
--- a/docs/zh_cn/changelog.md
+++ b/docs/zh_cn/changelog.md
@@ -2,6 +2,34 @@
## MMSelfSup
+### v0.9.0 (29/04/2022)
+
+#### 亮点
+* 支持 **CAE** ([#284](https://github.com/open-mmlab/mmselfsup/pull/284))
+* 支持 **Barlow Twins** ([#207](https://github.com/open-mmlab/mmselfsup/pull/207))
+
+#### 新特性
+* 支持 CAE ([#284](https://github.com/open-mmlab/mmselfsup/pull/284))
+* 支持 Barlow Twins ([#207](https://github.com/open-mmlab/mmselfsup/pull/207))
+* 增加 SimMIM 192 预训练及 224 微调的结果 ([#280](https://github.com/open-mmlab/mmselfsup/pull/280))
+* 增加 MAE fp16 预训练设置 ([#271](https://github.com/open-mmlab/mmselfsup/pull/271))
+
+#### Bug 修复
+* 修复参数问题 ([#290](https://github.com/open-mmlab/mmselfsup/pull/290))
+* 在 MAE 配置中修改 imgs_per_gpu 为 samples_per_gpu ([#278](https://github.com/open-mmlab/mmselfsup/pull/278))
+* 使用 prefetch dataloader 时避免 GPU 内存溢出 ([#277](https://github.com/open-mmlab/mmselfsup/pull/277))
+* 修复在注册自定义钩子时键值错误的问题 ([#273](https://github.com/open-mmlab/mmselfsup/pull/273))
+
+#### 改进
+* 更新 SimCLR 模型和结果 ([#295](https://github.com/open-mmlab/mmselfsup/pull/295))
+* 单元测试减少内存使用 ([#291](https://github.com/open-mmlab/mmselfsup/pull/291))
+* 去除 pytorch 1.5 测试 ([#288](https://github.com/open-mmlab/mmselfsup/pull/288))
+* 重命名线性评估配置文件 ([#281](https://github.com/open-mmlab/mmselfsup/pull/281))
+* 为 api 增加单元测试 ([#276](https://github.com/open-mmlab/mmselfsup/pull/276))
+
+#### 文档
+* 在模型库增加 SimMIM 并修复链接 ([#272](https://github.com/open-mmlab/mmselfsup/pull/272))
+
### v0.8.0 (31/03/2022)
#### 亮点
@@ -9,24 +37,24 @@
* 增加 **KNN** 基准测试,支持中间 checkpoint 和提取的 backbone 权重进行评估 ([#243](https://github.com/open-mmlab/mmselfsup/pull/243))
* 支持 ImageNet-21k 数据集 ([#225](https://github.com/open-mmlab/mmselfsup/pull/225))
-#### New Features
+#### 新特性
* 支持 SimMIM ([#239](https://github.com/open-mmlab/mmselfsup/pull/239))
* 增加 KNN 基准测试,支持中间 checkpoint 和提取的 backbone 权重进行评估 ([#243](https://github.com/open-mmlab/mmselfsup/pull/243))
* 支持 ImageNet-21k 数据集 ([#225](https://github.com/open-mmlab/mmselfsup/pull/225))
* 支持自动继续 checkpoint 文件的训练 ([#245](https://github.com/open-mmlab/mmselfsup/pull/245))
-#### Bug Fixes
+#### Bug 修复
* 在分布式 sampler 中增加种子 ([#250](https://github.com/open-mmlab/mmselfsup/pull/250))
* 修复 dist_test_svm_epoch.sh 中参数位置问题 ([#260](https://github.com/open-mmlab/mmselfsup/pull/260))
* 修复 prepare_voc07_cls.sh 中 mkdir 潜在错误 ([#261](https://github.com/open-mmlab/mmselfsup/pull/261))
-#### Improvements
+#### 改进
* 更新命令行参数模式 ([#253](https://github.com/open-mmlab/mmselfsup/pull/253))
-#### Docs
+#### 文档
* 修复 6_benchmarks.md 中命令文档([#263](https://github.com/open-mmlab/mmselfsup/pull/263))
* 翻译 6_benchmarks.md 到中文 ([#262](https://github.com/open-mmlab/mmselfsup/pull/262))
-*
+
### v0.7.0 (03/03/2022)
#### 亮点
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
index 22fbad5ee..b59f74a17 100644
--- a/docs/zh_cn/index.rst
+++ b/docs/zh_cn/index.rst
@@ -45,6 +45,8 @@ Welcome to MMSelfSup's documentation!
algorithms/mocov3.md
algorithms/mae.md
algorithms/simmim.md
+ algorithms/barlowtwins.md
+ algorithms/cae.md
.. toctree::
diff --git a/docs/zh_cn/install.md b/docs/zh_cn/install.md
index 927a843d7..e857db8ce 100644
--- a/docs/zh_cn/install.md
+++ b/docs/zh_cn/install.md
@@ -7,8 +7,8 @@
- PyTorch 1.5+
- CUDA 9.2+
- GCC 5+
-- [mmcv](https://github.com/open-mmlab/mmcv) 1.3.16+
-- [mmcls](https://mmclassification.readthedocs.io/en/latest/install.html) 0.19.0+
+- [mmcv](https://github.com/open-mmlab/mmcv) 1.4.2+
+- [mmcls](https://mmclassification.readthedocs.io/en/latest/install.html) 0.21.0+
- [mmdet](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 2.16.0+
- [mmseg](https://mmsegmentation.readthedocs.io/en/latest/get_started.html#installation) 0.20.2+
@@ -16,7 +16,8 @@
| MMSelfSup version | MMCV version | MMClassification version | MMSegmentation version | MMDetection version |
| :---------------: | :-----------------: | :------------------------: | :--------------------: | :-----------------: |
-| 0.8.0 (master) | mmcv-full >= 1.3.16 | mmcls >= 0.21.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
+| 0.9.0 (master) | mmcv-full >= 1.4.2 | mmcls >= 0.21.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
+| 0.8.0 | mmcv-full >= 1.4.2 | mmcls >= 0.21.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
| 0.7.1 | mmcv-full >= 1.3.16 | mmcls >= 0.19.0, <= 0.20.1 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
| 0.6.0 | mmcv-full >= 1.3.16 | mmcls >= 0.19.0 | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
| 0.5.0 | mmcv-full >= 1.3.16 | / | mmseg >= 0.20.2 | mmdet >= 2.16.0 |
diff --git a/docs/zh_cn/model_zoo.md b/docs/zh_cn/model_zoo.md
index d5b1f286c..c9143ef75 100644
--- a/docs/zh_cn/model_zoo.md
+++ b/docs/zh_cn/model_zoo.md
@@ -6,21 +6,25 @@
| 算法 | 配置文件 | 下载链接 |
| ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [BYOL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/README.md) | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220225-5c8b2c2e.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220214_115709.log.json) |
-| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220225-a0daa54a.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220210_095852.log.json) |
+| [Relative Location](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/README.md) | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220225-84784688.pth) | [log](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220211_124808.log.json) |
+| [Rotation Prediction](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/README.md) | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220225-5b9f06a0.pth) | [log](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220215_185303.log.json) |
| [DeepCluster](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/README.md) | [deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k-bb8681e2.pth) |
-| [DenseCL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/README.md) | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220225-8c7808fe.pth) | [log](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220215_041207.log.json) |
-| [MoCo v2](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/README.md) | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220225-89e03af4.pth) | [log](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220210_110905.log.json) |
| [NPID](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/README.md) | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k_20220225-5fbbda2a.pth) | [log](https://download.openmmlab.com/mmselfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k_20220215_185513.log.json) |
| [ODC](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/README.md) | [odc_resnet50_8xb64-steplr-440e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k_20220225-a755d9c0.pth) | [log](https://download.openmmlab.com/mmselfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k_20220215_235245.log.json) |
-| [Relative Location](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/README.md) | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220225-84784688.pth) | [log](https://download.openmmlab.com/mmselfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k_20220211_124808.log.json) |
-| [Rotation Prediction](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/README.md) | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220225-5b9f06a0.pth) | [log](https://download.openmmlab.com/mmselfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k_20220215_185303.log.json) |
-| [SimCLR](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/README.md) | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | [model](simclr_resnet50_8xb32-coslr-200e_in1k_20220225-97d2abef.pth) | [log](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb64-coslr-200e_in1k_20220210_191629.log.json) |
+| [SimCLR](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/README.md) | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k_20220428-46ef6bb9.pth) | [log](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k_20220411_182427.log.json) |
+| | [simclr_resnet50_16xb256-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k_20220428-8c24b063.pth) | [log](https://download.openmmlab.com/mmselfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k_20220423_205520.log.json) |
+| [MoCo v2](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/README.md) | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220225-89e03af4.pth) | [log](https://download.openmmlab.com/mmselfsup/moco/mocov2_resnet50_8xb32-coslr-200e_in1k_20220210_110905.log.json) |
+| [BYOL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/README.md) | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220225-5c8b2c2e.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k_20220214_115709.log.json) |
+| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220225-a0daa54a.pth) | [log](https://download.openmmlab.com/mmselfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k_20220210_095852.log.json) |
+| [SwAV](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/README.md) | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | [model](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220225-0497dd5d.pth) | [log](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220211_061131.log.json) |
+| [DenseCL](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/README.md) | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220225-8c7808fe.pth) | [log](https://download.openmmlab.com/mmselfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k_20220215_041207.log.json) |
| [SimSiam](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/README.md) | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k_20220225-68a88ad8.pth) | [log](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k_20220210_195405.log.json) |
| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k_20220225-2f488143.pth) | [log](https://download.openmmlab.com/mmselfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k_20220210_195402.log.json) |
-| [SwAV](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/README.md) | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | [model](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220225-0497dd5d.pth) | [log](https://download.openmmlab.com/mmselfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96_20220211_061131.log.json) |
+| [BarlowTwins](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/README.md) | [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220419-5ae15f89.pth) | [log](https://download.openmmlab.com/mmselfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k_20220413_111555.log.json) |
| [MoCo v3](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/README.md) | [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | [model](https://download.openmmlab.com/mmselfsup/moco/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224_20220225-e31238dd.pth) | [log](https://download.openmmlab.com/mmselfsup/moco/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224_20220222_160222.log.json) |
-| [MAE](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/README.md) | [mae_vit-base-p16_8xb512-coslr-400e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k-224_20220223-85be947b.pth) | [log](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-300e_in1k-224_20220210_140925.log.json) |
+| [MAE](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/README.md) | [mae_vit-base-p16_8xb512-coslr-400e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k-224_20220223-85be947b.pth) | [log](https://download.openmmlab.com/mmselfsup/mae/mae_vit-base-p16_8xb512-coslr-300e_in1k-224_20220210_140925.log.json) |
+| [SimMIM](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/README.md) | [simmim_swin-base_16xb128-coslr-100e_in1k-192](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) | [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) |
+| [CAE](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/README.md) | [cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.pth) | [log](https://download.openmmlab.com/mmselfsup/cae/cae_vit-base-p16_16xb256-coslr-300e_in1k-224_20220427-4c786349.log.json) |
备注:
@@ -36,27 +40,31 @@
如果没有特殊说明,下列实验采用 [MoCo](http://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf) 的设置,或者采用的训练设置写在备注中。
-| 算法 | 配置文件 | 备注 | Top-1 (%) |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------- | --------- |
-| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | | 67.55 |
-| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | | 68.55 |
-| DeepCluster | [deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py) | | 46.92 |
-| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | 63.62 |
-| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | | 67.58 |
-| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | | 57.97 |
-| ODC | [odc_resnet50_8xb64-steplr-440e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py) | | 53.43 |
-| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | | 38.78 |
-| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | | 48.12 |
-| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | | 57.28 |
-| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | SimSiam 论文设置 | 68.28 |
-| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | SimSiam 论文设置 | 69.84 |
-| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | SwAV 论文设置 | 70.47 |
-| MoCo v3 | [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | MoCo v3 论文设置 | 73.19 |
+| 算法 | 配置文件 | 备注 | Top-1 (%) |
+| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------- | --------- |
+| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | | 38.78 |
+| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | | 48.12 |
+| DeepCluster | [deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/deepcluster/deepcluster-sobel_resnet50_8xb64-steplr-200e_in1k.py) | | 46.92 |
+| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | | 58.97 |
+| ODC | [odc_resnet50_8xb64-steplr-440e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/odc/odc_resnet50_8xb64-steplr-440e_in1k.py) | | 53.43 |
+| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | SimSiam 论文设置 | 62.56 |
+| | [simclr_resnet50_16xb256-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_16xb256-coslr-200e_in1k.py) | SimSiam 论文设置 | 66.66 |
+| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | | 67.58 |
+| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | | 67.55 |
+| | [byol_resnet50_8xb32-accum16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-300e_in1k.py) | | 68.55 |
+| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | SwAV 论文设置 | 70.47 |
+| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | 63.62 |
+| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | SimSiam 论文设置 | 68.28 |
+| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | SimSiam 论文设置 | 69.84 |
+| Barlow Twins | [barlowtwins_resnet50_8xb256-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/barlowtwins/barlowtwins_resnet50_8xb256-coslr-300e_in1k.py) | Barlow Twins 论文设置 | 71.66 |
+| MoCo v3 | [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | MoCo v3 论文设置 | 73.19 |
### ImageNet 微调
| 算法 | 配置文件 | 备注 | Top-1 (%) |
| ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | --------- |
| MAE | [mae_vit-base-p16_8xb512-coslr-400e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mae/mae_vit-base-p16_8xb512-coslr-400e_in1k.py) | | 83.1 |
+| SimMIM | [simmim_swin-base_16xb128-coslr-100e_in1k-192](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | | 82.9 |
+| CAE | [cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/cae/cae_vit-base-p16_8xb256-fp16-coslr-300e_in1k.py) | | 83.2 |
### COCO17 目标检测和实例分割
@@ -64,16 +72,15 @@
| 算法 | 配置文件 | mAP (Box) | mAP (Mask) |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ---------- |
-| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 40.9 | 36.8 |
-| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | | |
-| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 40.2 | 36.1 |
-| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 38.5 | 34.6 |
| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | 37.5 | 33.7 |
| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | 37.9 | 34.2 |
+| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 38.5 | 34.6 |
| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | 38.7 | 34.9 |
+| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 40.2 | 36.1 |
+| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 40.9 | 36.8 |
+| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 40.2 | 36.3 |
| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | 38.6 | 34.6 |
| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | 38.8 | 34.9 |
-| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 40.2 | 36.3 |
### Pascal VOC12 Aug 语义分割
@@ -81,13 +88,13 @@
| 算法 | 配置文件 | mIOU |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
-| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 67.16 |
-| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 69.47 |
-| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 67.55 |
-| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 65.45 |
| Relative Location | [relative-loc_resnet50_8xb64-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py) | 63.49 |
| Rotation Prediction | [rotation-pred_resnet50_8xb16-steplr-70e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/rotation_pred/rotation-pred_resnet50_8xb16-steplr-70e_in1k.py) | 64.31 |
+| NPID | [npid_resnet50_8xb32-steplr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/npid/npid_resnet50_8xb32-steplr-200e_in1k.py) | 65.45 |
| SimCLR | [simclr_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simclr/simclr_resnet50_8xb32-coslr-200e_in1k.py) | 64.03 |
+| MoCo v2 | [mocov2_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/mocov2/mocov2_resnet50_8xb32-coslr-200e_in1k.py) | 67.55 |
+| BYOL | [byol_resnet50_8xb32-accum16-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/byol/byol_resnet50_8xb32-accum16-coslr-200e_in1k.py) | 67.16 |
+| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 63.73 |
+| DenseCL | [densecl_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/densecl/densecl_resnet50_8xb32-coslr-200e_in1k.py) | 69.47 |
| SimSiam | [simsiam_resnet50_8xb32-coslr-100e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-100e_in1k.py) | 48.35 |
| | [simsiam_resnet50_8xb32-coslr-200e_in1k](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simsiam/simsiam_resnet50_8xb32-coslr-200e_in1k.py) | 46.27 |
-| SwAV | [swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/swav/swav_resnet50_8xb32-mcrop-2-6-coslr-200e_in1k-224-96.py) | 63.73 |
diff --git a/mmselfsup/__init__.py b/mmselfsup/__init__.py
index 5c1b76e83..cd673dba5 100644
--- a/mmselfsup/__init__.py
+++ b/mmselfsup/__init__.py
@@ -1,6 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
+import mmcls
import mmcv
from packaging.version import parse
@@ -47,14 +48,21 @@ def digit_version(version_str: str, length: int = 4):
return tuple(release)
-mmcv_minimum_version = '1.3.16'
-mmcv_maximum_version = '1.5.0'
+mmcv_minimum_version = '1.4.2'
+mmcv_maximum_version = '1.6.0'
mmcv_version = digit_version(mmcv.__version__)
+mmcls_minimum_version = '0.21.0'
+mmcls_version = digit_version(mmcls.__version__)
+
assert (mmcv_version >= digit_version(mmcv_minimum_version)
and mmcv_version <= digit_version(mmcv_maximum_version)), \
f'MMCV=={mmcv.__version__} is used but incompatible. ' \
f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
+assert mmcls_version >= digit_version(mmcls_minimum_version), \
+ f'MMClassification=={mmcls.__version__} is used but incompatible. ' \
+ f'Please install mmcls>={mmcls_minimum_version}.'
+
__all__ = ['__version__', 'digit_version']
diff --git a/mmselfsup/apis/train.py b/mmselfsup/apis/train.py
index d4726b289..1d0b00b7c 100644
--- a/mmselfsup/apis/train.py
+++ b/mmselfsup/apis/train.py
@@ -90,21 +90,30 @@ def train_model(model,
f'{cfg.data.imgs_per_gpu} in this experiments')
cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
- data_loaders = [
- build_dataloader(
- ds,
- samples_per_gpu=cfg.data.samples_per_gpu,
- workers_per_gpu=cfg.data.workers_per_gpu,
- # `num_gpus` will be ignored if distributed
- num_gpus=len(cfg.gpu_ids),
- dist=distributed,
- replace=getattr(cfg.data, 'sampling_replace', False),
- seed=cfg.seed,
- drop_last=getattr(cfg.data, 'drop_last', False),
- prefetch=cfg.prefetch,
- persistent_workers=cfg.persistent_workers,
- img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
- ]
+ # The default loader config
+ loader_cfg = dict(
+ # cfg.gpus will be ignored if distributed
+ num_gpus=len(cfg.gpu_ids),
+ dist=distributed,
+ replace=getattr(cfg.data, 'sampling_replace', False),
+ drop_last=getattr(cfg.data, 'drop_last', False),
+ prefetch=getattr(cfg, 'prefetch', False),
+ seed=cfg.get('seed'),
+ persistent_workers=cfg.persistent_workers,
+ img_norm_cfg=cfg.img_norm_cfg)
+
+ # The overall dataloader settings
+ loader_cfg.update({
+ k: v
+ for k, v in cfg.data.items() if k not in [
+ 'train', 'val', 'test', 'train_dataloader', 'val_dataloader',
+ 'test_dataloader'
+ ]
+ })
+ # The specific train dataloader settings
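+    # later keys win in dict merging, so per-loader settings in
+    # `train_dataloader` override the shared loader defaults above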
+ train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})}
+
+ data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]
# put model on gpus
if distributed:
@@ -164,7 +173,7 @@ def train_model(model,
if hook_cfg.type == 'DeepClusterHook':
common_params = dict(dist_mode=True, data_loaders=data_loaders)
else:
- common_params = dict(dist_mode=True)
+ common_params = dict()
hook_cfg = hook_cfg.copy()
priority = hook_cfg.pop('priority', 'NORMAL')
hook = build_from_cfg(hook_cfg, HOOKS, common_params)
@@ -173,15 +182,16 @@ def train_model(model,
# register evaluation hook
if cfg.get('evaluation', None):
val_dataset = build_dataset(cfg.data.val)
- val_dataloader = build_dataloader(
- val_dataset,
- samples_per_gpu=cfg.data.samples_per_gpu,
- workers_per_gpu=cfg.data.workers_per_gpu,
- dist=distributed,
- shuffle=False,
- prefetch=cfg.data.val.prefetch,
- drop_last=getattr(cfg.data, 'drop_last', False),
- img_norm_cfg=cfg.get('img_norm_cfg', dict()))
+
+ # The specific validation dataloader settings
+ val_loader_cfg = {
+ **loader_cfg,
+        'shuffle': False,  # no shuffling by default
+ 'drop_last': False,
+ **cfg.data.get('val_dataloader', {}),
+ }
+ val_dataloader = build_dataloader(val_dataset, **val_loader_cfg)
+
eval_cfg = cfg.get('evaluation', {})
eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
eval_hook = DistEvalHook if distributed else EvalHook
diff --git a/mmselfsup/core/hooks/cosineAnnealing_hook.py b/mmselfsup/core/hooks/cosineAnnealing_hook.py
index e55866058..39ca8f53a 100644
--- a/mmselfsup/core/hooks/cosineAnnealing_hook.py
+++ b/mmselfsup/core/hooks/cosineAnnealing_hook.py
@@ -7,6 +7,27 @@
@HOOKS.register_module()
class StepFixCosineAnnealingLrUpdaterHook(CosineAnnealingLrUpdaterHook):
+ def get_warmup_lr(self, cur_iters):
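+        # Note: unlike the mmcv base hook, 'linear' warmup here ramps the lr
+        # from 0 up to the regular lr and does not use `warmup_ratio`.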
+
+ def _get_warmup_lr(cur_iters, regular_lr):
+ if self.warmup == 'constant':
+ warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr]
+ elif self.warmup == 'linear':
+ k = (1 - cur_iters / self.warmup_iters)
+ warmup_lr = [_lr * (1 - k) for _lr in regular_lr]
+ elif self.warmup == 'exp':
+ k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters)
+ warmup_lr = [_lr * k for _lr in regular_lr]
+ return warmup_lr
+
+ if isinstance(self.regular_lr, dict):
+ lr_groups = {}
+ for key, regular_lr in self.regular_lr.items():
+ lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr)
+ return lr_groups
+ else:
+ return _get_warmup_lr(cur_iters, self.regular_lr)
+
def get_lr(self, runner, base_lr):
if self.by_epoch:
progress = runner.epoch
diff --git a/mmselfsup/datasets/pipelines/__init__.py b/mmselfsup/datasets/pipelines/__init__.py
index 1628f9f3b..f4cf1e2a9 100644
--- a/mmselfsup/datasets/pipelines/__init__.py
+++ b/mmselfsup/datasets/pipelines/__init__.py
@@ -1,8 +1,9 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from .transforms import (BlockwiseMaskGenerator, GaussianBlur, Lighting,
- RandomAppliedTrans, RandomAug, Solarization)
+from .transforms import (BEiTMaskGenerator, GaussianBlur, Lighting,
+ RandomAppliedTrans, RandomAug, SimMIMMaskGenerator,
+ Solarization, ToTensor)
__all__ = [
'GaussianBlur', 'Lighting', 'RandomAppliedTrans', 'Solarization',
- 'RandomAug', 'BlockwiseMaskGenerator'
+ 'RandomAug', 'SimMIMMaskGenerator', 'ToTensor', 'BEiTMaskGenerator'
]
diff --git a/mmselfsup/datasets/pipelines/transforms.py b/mmselfsup/datasets/pipelines/transforms.py
index d7ff389b1..dd4411d2b 100644
--- a/mmselfsup/datasets/pipelines/transforms.py
+++ b/mmselfsup/datasets/pipelines/transforms.py
@@ -1,9 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
import inspect
-from typing import Tuple
+import math
+import random
+import warnings
+from typing import Optional, Sequence, Tuple, Union
import numpy as np
import torch
+import torchvision.transforms.functional as F
from mmcv.utils import build_from_cfg
from PIL import Image, ImageFilter
from timm.data import create_transform
@@ -18,10 +22,33 @@
PIPELINES.register_module(m[1])
+@PIPELINES.register_module(force=True)
+class ToTensor(object):
+ """Convert image or a sequence of images to tensor.
+
+    Unlike torchvision's ``ToTensor``, this module also accepts a sequence of
+    images and converts each one to a tensor.
+ """
+
+ def __init__(self) -> None:
+ self.transform = _transforms.ToTensor()
+
+ def __call__(self, imgs: Union[object, Sequence[object]]) -> torch.Tensor:
+ if isinstance(imgs, Sequence):
+ imgs = list(imgs)
+ for i, img in enumerate(imgs):
+ imgs[i] = self.transform(img)
+ else:
+ imgs = self.transform(imgs)
+ return imgs
+
+
@PIPELINES.register_module()
-class BlockwiseMaskGenerator(object):
+class SimMIMMaskGenerator(object):
"""Generate random block mask for each Image.
+ This module is used in SimMIM to generate masks.
+
Args:
input_size (int): Size of input image. Defaults to 192.
mask_patch_size (int): Size of each block mask. Defaults to 32.
@@ -61,6 +88,224 @@ def __call__(self, img: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
return img, mask
+@PIPELINES.register_module()
+class BEiTMaskGenerator(object):
+ """Generate mask for image.
+
+ This module is borrowed from
+ https://github.com/microsoft/unilm/tree/master/beit
+
+ Args:
+ input_size (int): The size of input image.
+ num_masking_patches (int): The number of patches to be masked.
+ min_num_patches (int): The minimum number of patches to be masked
+ in the process of generating mask. Defaults to 4.
+ max_num_patches (int, optional): The maximum number of patches to be
+ masked in the process of generating mask. Defaults to None.
+ min_aspect (float): The minimum aspect ratio of mask blocks. Defaults
+ to 0.3.
+        max_aspect (float, optional): The maximum aspect ratio of mask blocks.
+            Defaults to None.
+ """
+
+ def __init__(self,
+ input_size: int,
+ num_masking_patches: int,
+ min_num_patches: int = 4,
+ max_num_patches: Optional[int] = None,
+ min_aspect: float = 0.3,
+ max_aspect: Optional[float] = None) -> None:
+ if not isinstance(input_size, tuple):
+ input_size = (input_size, ) * 2
+ self.height, self.width = input_size
+
+ self.num_patches = self.height * self.width
+ self.num_masking_patches = num_masking_patches
+
+ self.min_num_patches = min_num_patches
+ self.max_num_patches = num_masking_patches if max_num_patches is None \
+ else max_num_patches
+
+ max_aspect = max_aspect or 1 / min_aspect
+ self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))
+
+    def __repr__(self) -> str:
+ repr_str = 'Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)' % (
+ self.height, self.width, self.min_num_patches,
+ self.max_num_patches, self.num_masking_patches,
+ self.log_aspect_ratio[0], self.log_aspect_ratio[1])
+ return repr_str
+
+ def get_shape(self) -> Tuple[int, int]:
+ return self.height, self.width
+
+ def _mask(self, mask: np.ndarray, max_mask_patches: int) -> int:
+ delta = 0
+ for _ in range(10):
+ target_area = random.uniform(self.min_num_patches,
+ max_mask_patches)
+ aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+ h = int(round(math.sqrt(target_area * aspect_ratio)))
+ w = int(round(math.sqrt(target_area / aspect_ratio)))
+ if w < self.width and h < self.height:
+ top = random.randint(0, self.height - h)
+ left = random.randint(0, self.width - w)
+
+ num_masked = mask[top:top + h, left:left + w].sum()
+            # accept this block only if it newly masks between 1 and
+            # max_mask_patches cells (h * w - num_masked is that count)
+ if 0 < h * w - num_masked <= max_mask_patches:
+ for i in range(top, top + h):
+ for j in range(left, left + w):
+ if mask[i, j] == 0:
+ mask[i, j] = 1
+ delta += 1
+ if delta > 0:
+ break
+ return delta
+
+ def __call__(
+ self, img: Tuple[torch.Tensor, torch.Tensor]
+ ) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray]:
+        mask = np.zeros(shape=self.get_shape(), dtype=int)
+ mask_count = 0
+ while mask_count != self.num_masking_patches:
+ max_mask_patches = self.num_masking_patches - mask_count
+ max_mask_patches = min(max_mask_patches, self.max_num_patches)
+
+ delta = self._mask(mask, max_mask_patches)
+ mask_count += delta
+
+ return img[0], img[1], mask
+
+
+@PIPELINES.register_module()
+class RandomResizedCropAndInterpolationWithTwoPic(object):
+ """Crop the given PIL Image to random size and aspect ratio with random
+ interpolation.
+
+ This module is borrowed from
+ https://github.com/microsoft/unilm/tree/master/beit.
+
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a
+ random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio
+    is made. This crop is finally resized to the given size. This is popularly used
+ to train the Inception networks. This module first crops the image and
+ resizes the crop to two different sizes.
+
+ Args:
+ size (Union[tuple, int]): Expected output size of each edge of the
+ first image.
+ second_size (Union[tuple, int], optional): Expected output size of each
+ edge of the second image.
+ scale (tuple[float, float]): Range of size of the origin size cropped.
+ Defaults to (0.08, 1.0).
+ ratio (tuple[float, float]): Range of aspect ratio of the origin aspect
+ ratio cropped. Defaults to (3./4., 4./3.).
+ interpolation (str): The interpolation for the first image. Defaults
+ to ``bilinear``.
+ second_interpolation (str): The interpolation for the second image.
+ Defaults to ``lanczos``.
+ """
+
+ interpolation_dict = {
+ 'bicubic': Image.BICUBIC,
+ 'lanczos': Image.LANCZOS,
+ 'hamming': Image.HAMMING
+ }
+
+ def __init__(self,
+ size: Union[tuple, int],
+ second_size=None,
+ scale=(0.08, 1.0),
+ ratio=(3. / 4., 4. / 3.),
+ interpolation='bilinear',
+ second_interpolation='lanczos') -> None:
+ if isinstance(size, tuple):
+ self.size = size
+ else:
+ self.size = (size, size)
+ if second_size is not None:
+ if isinstance(second_size, tuple):
+ self.second_size = second_size
+ else:
+ self.second_size = (second_size, second_size)
+ else:
+ self.second_size = None
+ if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+ warnings.warn('range should be of kind (min, max)')
+
+ if interpolation == 'random':
+ self.interpolation = (Image.BILINEAR, Image.BICUBIC)
+ else:
+ self.interpolation = self.interpolation_dict.get(
+ interpolation, Image.BILINEAR)
+ self.second_interpolation = self.interpolation_dict.get(
+ second_interpolation, Image.BILINEAR)
+ self.scale = scale
+ self.ratio = ratio
+
+ @staticmethod
+ def get_params(img: np.ndarray, scale: tuple,
+ ratio: tuple) -> Sequence[int]:
+ """Get parameters for ``crop`` for a random sized crop.
+
+ Args:
+ img (np.ndarray): Image to be cropped.
+ scale (tuple): range of size of the origin size cropped
+ ratio (tuple): range of aspect ratio of the origin aspect
+ ratio cropped
+
+ Returns:
+ tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+ sized crop.
+ """
+ area = img.size[0] * img.size[1]
+
+ for _ in range(10):
+ target_area = random.uniform(*scale) * area
+ log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
+ aspect_ratio = math.exp(random.uniform(*log_ratio))
+
+ w = int(round(math.sqrt(target_area * aspect_ratio)))
+ h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+ if w <= img.size[0] and h <= img.size[1]:
+ i = random.randint(0, img.size[1] - h)
+ j = random.randint(0, img.size[0] - w)
+ return i, j, h, w
+
+ # Fallback to central crop
+ in_ratio = img.size[0] / img.size[1]
+ if in_ratio < min(ratio):
+ w = img.size[0]
+ h = int(round(w / min(ratio)))
+ elif in_ratio > max(ratio):
+ h = img.size[1]
+ w = int(round(h * max(ratio)))
+ else: # whole image
+ w = img.size[0]
+ h = img.size[1]
+ i = (img.size[1] - h) // 2
+ j = (img.size[0] - w) // 2
+ return i, j, h, w
+
+ def __call__(
+ self, img: np.ndarray
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
+ i, j, h, w = self.get_params(img, self.scale, self.ratio)
+ if isinstance(self.interpolation, (tuple, list)):
+ interpolation = random.choice(self.interpolation)
+ else:
+ interpolation = self.interpolation
+ if self.second_size is None:
+ return F.resized_crop(img, i, j, h, w, self.size, interpolation)
+ else:
+ return F.resized_crop(img, i, j, h, w, self.size,
+ interpolation), F.resized_crop(
+ img, i, j, h, w, self.second_size,
+ self.second_interpolation)
+
+
@PIPELINES.register_module()
class RandomAug(object):
"""RandAugment data augmentation method based on
diff --git a/mmselfsup/datasets/utils.py b/mmselfsup/datasets/utils.py
index c6ec2db26..f2e9cee72 100644
--- a/mmselfsup/datasets/utils.py
+++ b/mmselfsup/datasets/utils.py
@@ -197,6 +197,8 @@ def __iter__(self):
torch.cuda.current_stream().wait_stream(stream)
input_dict = next_input_dict
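+            # drop the reference to the prefetched batch and free cached
+            # blocks so prefetching does not accumulate GPU memory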
+ next_input_dict = None
+ torch.cuda.empty_cache()
yield input_dict
def __len__(self):
diff --git a/mmselfsup/models/algorithms/__init__.py b/mmselfsup/models/algorithms/__init__.py
index 61b9cde2e..8de1b2539 100644
--- a/mmselfsup/models/algorithms/__init__.py
+++ b/mmselfsup/models/algorithms/__init__.py
@@ -1,6 +1,8 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .barlowtwins import BarlowTwins
from .base import BaseModel
from .byol import BYOL
+from .cae import CAE
from .classification import Classification
from .deepcluster import DeepCluster
from .densecl import DenseCL
@@ -18,7 +20,8 @@
from .swav import SwAV
__all__ = [
- 'BaseModel', 'BYOL', 'Classification', 'DeepCluster', 'DenseCL', 'MoCo',
- 'NPID', 'ODC', 'RelativeLoc', 'RotationPred', 'SimCLR', 'SimSiam', 'SwAV',
- 'MAE', 'MoCoV3', 'SimMIM', 'MMClsImageClassifierWrapper'
+ 'BaseModel', 'BarlowTwins', 'BYOL', 'Classification', 'DeepCluster',
+ 'DenseCL', 'MoCo', 'NPID', 'ODC', 'RelativeLoc', 'RotationPred', 'SimCLR',
+ 'SimSiam', 'SwAV', 'MAE', 'MoCoV3', 'SimMIM',
+ 'MMClsImageClassifierWrapper', 'CAE'
]
diff --git a/mmselfsup/models/algorithms/barlowtwins.py b/mmselfsup/models/algorithms/barlowtwins.py
new file mode 100644
index 000000000..cfb77a8c2
--- /dev/null
+++ b/mmselfsup/models/algorithms/barlowtwins.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch
+
+from ..builder import ALGORITHMS, build_backbone, build_head, build_neck
+from .base import BaseModel
+
+
+@ALGORITHMS.register_module()
+class BarlowTwins(BaseModel):
+ """BarlowTwins.
+
+    Implementation of `Barlow Twins: Self-Supervised Learning via Redundancy
+    Reduction <https://arxiv.org/abs/2103.03230>`_.
+    Part of the code is borrowed from:
+    `<https://github.com/facebookresearch/barlowtwins>`_.
+
+ Args:
+ backbone (dict): Config dict for module of backbone. Defaults to None.
+ neck (dict): Config dict for module of deep features to compact
+ feature vectors. Defaults to None.
+ head (dict): Config dict for module of loss functions.
+ Defaults to None.
+ init_cfg (dict): Config dict for weight initialization.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ backbone: dict = None,
+ neck: dict = None,
+ head: dict = None,
+ init_cfg: Optional[dict] = None,
+ **kwargs) -> None:
+ super(BarlowTwins, self).__init__(init_cfg)
+ assert backbone is not None
+ self.backbone = build_backbone(backbone)
+ assert neck is not None
+ self.neck = build_neck(neck)
+ assert head is not None
+ self.head = build_head(head)
+
+ def extract_feat(self, img: torch.Tensor) -> torch.Tensor:
+ """Function to extract features from backbone.
+
+ Args:
+ img (Tensor): Input images of shape (N, C, H, W).
+ Typically these should be mean centered and std scaled.
+
+ Returns:
+ tuple[Tensor]: backbone outputs.
+ """
+ x = self.backbone(img)
+ return x
+
+ def forward_train(self, img: List[torch.Tensor]) -> dict:
+ """Forward computation during training.
+
+ Args:
+ img (List[Tensor]): A list of input images with shape
+ (N, C, H, W). Typically these should be mean centered
+ and std scaled.
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components
+ """
+ assert isinstance(img, list)
+ img_v1 = img[0]
+ img_v2 = img[1]
+
+ z1 = self.neck(self.backbone(img_v1))[0] # NxC
+ z2 = self.neck(self.backbone(img_v2))[0] # NxC
+
+ losses = self.head(z1, z2)['loss']
+ return dict(loss=losses)
diff --git a/mmselfsup/models/algorithms/cae.py b/mmselfsup/models/algorithms/cae.py
new file mode 100644
index 000000000..0420a4047
--- /dev/null
+++ b/mmselfsup/models/algorithms/cae.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+import torch
+from torchvision.transforms import Normalize
+
+from ..builder import ALGORITHMS, build_backbone, build_head, build_neck
+from .base import BaseModel
+
+
+@ALGORITHMS.register_module()
+class CAE(BaseModel):
+ """CAE.
+
+ Implementation of `Context Autoencoder for Self-Supervised Representation
+    Learning <https://arxiv.org/abs/2202.03026>`_.
+
+ Args:
+ backbone (dict, optional): Config dict for module of backbone.
+ neck (dict, optional): Config dict for module of deep features to
+ compact feature vectors. Defaults to None.
+ head (dict, optional): Config dict for module of loss functions.
+ Defaults to None.
+ base_momentum (float): The base momentum coefficient for the target
+ network. Defaults to 0.0.
+        init_cfg (dict, optional): The config to control the initialization.
+ """
+
+ def __init__(self,
+ backbone: dict = None,
+ neck: dict = None,
+ head: dict = None,
+ base_momentum: float = 0.0,
+ init_cfg: dict = None,
+ **kwargs) -> None:
+ super(CAE, self).__init__(init_cfg)
+ assert backbone is not None
+ self.backbone = build_backbone(backbone)
+ self.teacher = build_backbone(backbone)
+ assert neck is not None
+ self.neck = build_neck(neck)
+ assert head is not None
+ self.head = build_head(head)
+
+ self.momentum = base_momentum
+
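+        # standard ImageNet normalization, applied to the encoder input in
+        # forward_train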
+ self.img_norm = Normalize(
+ mean=torch.tensor((0.485, 0.456, 0.406)),
+ std=torch.tensor((0.229, 0.224, 0.225)))
+
+ def init_weights(self) -> None:
+ super().init_weights()
+ self._init_teacher()
+
+ def _init_teacher(self) -> None:
+ # init the weights of teacher with those of backbone
+ for param_backbone, param_teacher in zip(self.backbone.parameters(),
+ self.teacher.parameters()):
+ param_teacher.detach()
+ param_teacher.data.copy_(param_backbone.data)
+ param_teacher.requires_grad = False
+
+ def momentum_update(self) -> None:
+ """Momentum update of the teacher network."""
+        for param_backbone, param_teacher in zip(self.backbone.parameters(),
+                                                 self.teacher.parameters()):
+            param_teacher.data = param_teacher.data * self.momentum + \
+                param_backbone.data * (1. - self.momentum)
+
+ def extract_feat(self, img: torch.Tensor,
+ mask: torch.Tensor) -> torch.Tensor:
+
+ x = self.backbone(img, mask)
+ return x
+
+ def forward_train(self, samples: Sequence, **kwargs) -> dict:
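+        """Forward computation during training.
+
+        Args:
+            samples (Sequence): A tuple of (img, img_target, mask).
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """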
+ img, img_target, mask = samples
+
+        # normalize the input images; rescale the target images from [0, 1]
+        # to [0.1, 0.9] (0.8 * x + 0.1), matching the dalle pixel mapping
+ img_list = [self.img_norm(x).unsqueeze(0) for x in img]
+ img = torch.cat(img_list)
+ img_target = 0.8 * img_target + 0.1
+
+ mask = mask.flatten(1).to(torch.bool)
+
+ unmasked = self.backbone(img, mask)
+
+ # get the latent prediction for the masked patches
+ with torch.no_grad():
+ latent_target = self.teacher(img, ~mask)
+ latent_target = latent_target[:, 1:, :]
+ self.momentum_update()
+
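+        # split the position embeddings (cls token excluded) into masked and
+        # unmasked parts for the latent regressor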
+ pos_embed = self.backbone.pos_embed.expand(img.shape[0], -1, -1)
+ pos_embed_masked = pos_embed[:,
+ 1:][mask].reshape(img.shape[0], -1,
+ pos_embed.shape[-1])
+ pos_embed_unmasked = pos_embed[:, 1:][~mask].reshape(
+ img.shape[0], -1, pos_embed.shape[-1])
+
+ # input the unmasked tokens and masked tokens to the decoder
+ logits, latent_pred = self.neck(unmasked[:, 1:], pos_embed_masked,
+ pos_embed_unmasked)
+
+ logits = logits.view(-1, logits.shape[-1])
+
+ losses = self.head(img_target, logits, latent_pred, latent_target,
+ mask)
+ return losses
diff --git a/mmselfsup/models/backbones/__init__.py b/mmselfsup/models/backbones/__init__.py
index 7f3a1c573..dbc78426c 100644
--- a/mmselfsup/models/backbones/__init__.py
+++ b/mmselfsup/models/backbones/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from .mae_pretrain_vit import MAEViT
+from .cae_vit import CAEViT
+from .mae_vit import MAEViT
from .mim_cls_vit import MIMVisionTransformer
from .resnet import ResNet, ResNetV1d
from .resnext import ResNeXt
@@ -8,5 +9,5 @@
__all__ = [
'ResNet', 'ResNetV1d', 'ResNeXt', 'MAEViT', 'MIMVisionTransformer',
- 'VisionTransformer', 'SimMIMSwinTransformer'
+ 'VisionTransformer', 'SimMIMSwinTransformer', 'CAEViT'
]
diff --git a/mmselfsup/models/backbones/cae_vit.py b/mmselfsup/models/backbones/cae_vit.py
new file mode 100644
index 000000000..c097d350d
--- /dev/null
+++ b/mmselfsup/models/backbones/cae_vit.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcls.models import VisionTransformer
+from mmcv.cnn.utils.weight_init import trunc_normal_
+from mmcv.runner.base_module import ModuleList
+from torch import nn
+
+from ..builder import BACKBONES
+from ..utils import TransformerEncoderLayer, build_2d_sincos_position_embedding
+
+
+@BACKBONES.register_module()
+class CAEViT(VisionTransformer):
+ """Vision Transformer for CAE pre-training.
+
+ Rewritten version of: `An Image is Worth 16x16 Words: Transformers
+ for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.
+
+ Args:
+ arch (str | dict): Vision Transformer architecture. Defaults to 'b'.
+ img_size (int | tuple): Input image size. Defaults to 224.
+ patch_size (int | tuple): The patch size. Defaults to 16.
+ out_indices (Sequence | int): Output from which stages.
+ Defaults to -1, which means the last stage.
+ drop_rate (float): Probability of an element to be zeroed.
+ Defaults to 0.
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
+ qkv_bias (bool): Whether to add a learnable bias to q, k, v.
+ Defaults to True.
+ norm_cfg (dict): Config dict for normalization layer.
+ Defaults to ``dict(type='LN', eps=1e-6)``.
+ final_norm (bool): Whether to add an additional layer to normalize
+ the final feature map. Defaults to True.
+ output_cls_token (bool): Whether to output the cls_token. If set to
+ True, ``with_cls_token`` must be True. Defaults to True.
+ interpolate_mode (str): The interpolation mode used when resizing the
+ position embedding. Defaults to "bicubic".
+ init_values (float, optional): The init value of gamma in
+ TransformerEncoderLayer. Defaults to None.
+ patch_cfg (dict): Configs of patch embedding. Defaults to an empty
+ dict.
+ layer_cfgs (Sequence | dict): Configs of each transformer layer in
+ encoder. Defaults to an empty dict.
+ init_cfg (dict, optional): Initialization config dict.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ arch: str = 'b',
+ img_size: int = 224,
+ patch_size: int = 16,
+ out_indices: int = -1,
+ drop_rate: float = 0,
+ drop_path_rate: float = 0,
+ qkv_bias: bool = True,
+ norm_cfg: dict = dict(type='LN', eps=1e-6),
+ final_norm: bool = True,
+ output_cls_token: bool = True,
+ interpolate_mode: str = 'bicubic',
+ init_values: float = None,
+ patch_cfg: dict = dict(),
+ layer_cfgs: dict = dict(),
+ init_cfg: dict = None) -> None:
+ super().__init__(
+ arch=arch,
+ img_size=img_size,
+ patch_size=patch_size,
+ out_indices=out_indices,
+ drop_rate=drop_rate,
+ drop_path_rate=drop_path_rate,
+ norm_cfg=norm_cfg,
+ final_norm=final_norm,
+ output_cls_token=output_cls_token,
+ interpolate_mode=interpolate_mode,
+ patch_cfg=patch_cfg,
+ layer_cfgs=layer_cfgs,
+ init_cfg=init_cfg)
+ self.pos_embed.requires_grad = False
+ self.num_patches = self.patch_resolution[0] * self.patch_resolution[1]
+ dpr = np.linspace(0, drop_path_rate, self.num_layers)
+
+ # Replace original TransformerEncoderLayer with customized
+ # TransformerEncoderLayer
+ self.layers = ModuleList()
+ if isinstance(layer_cfgs, dict):
+ layer_cfgs = [layer_cfgs] * self.num_layers
+ for i in range(self.num_layers):
+ _layer_cfg = dict(
+ embed_dims=self.embed_dims,
+ num_heads=self.arch_settings['num_heads'],
+ feedforward_channels=self.
+ arch_settings['feedforward_channels'],
+ drop_rate=drop_rate,
+ drop_path_rate=dpr[i],
+ qkv_bias=qkv_bias,
+ init_values=init_values,
+ norm_cfg=norm_cfg)
+ _layer_cfg.update(layer_cfgs[i])
+ self.layers.append(TransformerEncoderLayer(**_layer_cfg))
+
+ def init_weights(self) -> None:
+ super(CAEViT, self).init_weights()
+ if not (isinstance(self.init_cfg, dict)
+ and self.init_cfg['type'] == 'Pretrained'):
+ # initialize position embedding in backbone
+ pos_embed = build_2d_sincos_position_embedding(
+ int(self.num_patches**.5),
+ self.pos_embed.shape[-1],
+ cls_token=True)
+ self.pos_embed.data.copy_(pos_embed.float())
+
+ trunc_normal_(self.cls_token, std=.02)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m: nn.Module) -> None:
+ if isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+ elif isinstance(m, nn.Conv2d):
+ trunc_normal_(m.weight, std=0.02)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+ def forward(self, img: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+ x, _ = self.patch_embed(img)
+ batch_size, _, dim = x.size()
+
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+
+ # NOTE: unmasked embeddings
+ x_unmasked = x[~mask].reshape(batch_size, -1, dim)
+ x_unmasked = torch.cat((cls_tokens, x_unmasked), dim=1)
+
+ pos_embed = self.pos_embed.expand(batch_size, self.num_patches + 1,
+ dim)
+ pos_embed_unmasked = pos_embed[:,
+ 1:][~mask].reshape(batch_size, -1, dim)
+ pos_embed_unmasked = torch.cat((pos_embed[:, :1], pos_embed_unmasked),
+ dim=1)
+ x_unmasked = x_unmasked + pos_embed_unmasked
+
+ x_unmasked = self.drop_after_pos(x_unmasked)
+
+ for i, layer in enumerate(self.layers):
+ x_unmasked = layer(x_unmasked)
+
+ if i == len(self.layers) - 1 and self.final_norm:
+ x_unmasked = self.norm1(x_unmasked)
+
+ return x_unmasked
diff --git a/mmselfsup/models/backbones/mae_pretrain_vit.py b/mmselfsup/models/backbones/mae_vit.py
similarity index 100%
rename from mmselfsup/models/backbones/mae_pretrain_vit.py
rename to mmselfsup/models/backbones/mae_vit.py
diff --git a/mmselfsup/models/backbones/mim_cls_vit.py b/mmselfsup/models/backbones/mim_cls_vit.py
index e5c7b138b..09581f429 100644
--- a/mmselfsup/models/backbones/mim_cls_vit.py
+++ b/mmselfsup/models/backbones/mim_cls_vit.py
@@ -1,9 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
import torch
from mmcls.models import VisionTransformer
from mmcv.cnn import build_norm_layer
+from mmcv.runner.base_module import ModuleList
from ..builder import BACKBONES
+from ..utils import TransformerEncoderLayer
@BACKBONES.register_module()
@@ -45,12 +48,15 @@ def __init__(self,
img_size=224,
patch_size=16,
out_indices=-1,
+ use_window=False,
drop_rate=0,
drop_path_rate=0,
+ qkv_bias=True,
norm_cfg=dict(type='LN', eps=1e-6),
final_norm=True,
output_cls_token=True,
interpolate_mode='bicubic',
+ init_values=0.0,
patch_cfg=dict(),
layer_cfgs=dict(),
finetune=True,
@@ -69,6 +75,24 @@ def __init__(self,
patch_cfg=patch_cfg,
layer_cfgs=layer_cfgs,
init_cfg=init_cfg)
+ dpr = np.linspace(0, drop_path_rate, self.num_layers)
+ self.layers = ModuleList()
+ if isinstance(layer_cfgs, dict):
+ layer_cfgs = [layer_cfgs] * self.num_layers
+ for i in range(self.num_layers):
+ _layer_cfg = dict(
+ embed_dims=self.embed_dims,
+ num_heads=self.arch_settings['num_heads'],
+ feedforward_channels=self.
+ arch_settings['feedforward_channels'],
+ window_size=self.patch_resolution if use_window else None,
+ drop_rate=drop_rate,
+ drop_path_rate=dpr[i],
+ init_values=init_values,
+ qkv_bias=qkv_bias,
+ norm_cfg=norm_cfg)
+ _layer_cfg.update(layer_cfgs[i])
+ self.layers.append(TransformerEncoderLayer(**_layer_cfg))
self.embed_dims = self.arch_settings['embed_dims']
self.num_patches = self.patch_resolution[0] * self.patch_resolution[1]
diff --git a/mmselfsup/models/backbones/resnet.py b/mmselfsup/models/backbones/resnet.py
index 954c90bae..18dcddf07 100644
--- a/mmselfsup/models/backbones/resnet.py
+++ b/mmselfsup/models/backbones/resnet.py
@@ -43,7 +43,7 @@ class ResNet(_ResNet):
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. Defaults to False.
zero_init_residual (bool): Whether to use zero init for last norm layer
- in resblocks to let them behave as identity. Defaults to True.
+ in resblocks to let them behave as identity. Defaults to False.
Probability of the path to be zeroed. Defaults to 0.1
Example:
>>> from mmselfsup.models import ResNet
@@ -86,7 +86,7 @@ def __init__(self,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=False,
- zero_init_residual=True,
+ zero_init_residual=False,
init_cfg=[
dict(type='Kaiming', layer=['Conv2d']),
dict(
diff --git a/mmselfsup/models/backbones/resnext.py b/mmselfsup/models/backbones/resnext.py
index 6de89a6cf..10ac0ac6a 100644
--- a/mmselfsup/models/backbones/resnext.py
+++ b/mmselfsup/models/backbones/resnext.py
@@ -51,7 +51,7 @@ class ResNeXt(ResNet):
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. Defaults to False.
zero_init_residual (bool): Whether to use zero init for last norm layer
- in resblocks to let them behave as identity. Defaults to True.
+ in resblocks to let them behave as identity. Defaults to False.
Example:
>>> from mmselfsup.models import ResNeXt
diff --git a/mmselfsup/models/heads/__init__.py b/mmselfsup/models/heads/__init__.py
index 36b7e4382..360701b17 100644
--- a/mmselfsup/models/heads/__init__.py
+++ b/mmselfsup/models/heads/__init__.py
@@ -1,8 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .cae_head import CAEHead
from .cls_head import ClsHead
from .contrastive_head import ContrastiveHead
-from .latent_pred_head import LatentClsHead, LatentPredictHead
-from .mae_head import MAEFinetuneHead, MAEPretrainHead
+from .latent_pred_head import (LatentClsHead, LatentCrossCorrelationHead,
+ LatentPredictHead)
+from .mae_head import MAEFinetuneHead, MAELinprobeHead, MAEPretrainHead
from .mocov3_head import MoCoV3Head
from .multi_cls_head import MultiClsHead
from .simmim_head import SimMIMHead
@@ -10,6 +12,7 @@
__all__ = [
'ContrastiveHead', 'ClsHead', 'LatentPredictHead', 'LatentClsHead',
- 'MultiClsHead', 'SwAVHead', 'MAEFinetuneHead', 'MAEPretrainHead',
- 'MoCoV3Head', 'SimMIMHead'
+ 'LatentCrossCorrelationHead', 'MultiClsHead', 'SwAVHead',
+ 'MAEFinetuneHead', 'MAEPretrainHead', 'MoCoV3Head', 'SimMIMHead',
+ 'CAEHead', 'MAELinprobeHead'
]
diff --git a/mmselfsup/models/heads/cae_head.py b/mmselfsup/models/heads/cae_head.py
new file mode 100644
index 000000000..f118f0083
--- /dev/null
+++ b/mmselfsup/models/heads/cae_head.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import warnings
+
+import torch
+from mmcv.runner import BaseModule
+from torch import nn
+
+from ..builder import HEADS
+from ..utils import Encoder
+
+
+@HEADS.register_module()
+class CAEHead(BaseModule):
+ """Pretrain Head for CAE.
+
+ Compute the align loss and the main loss. In addition, this head
+ generates the prediction target with the DALL-E tokenizer.
+
+ Args:
+ tokenizer_path (str): The path of the tokenizer.
+ lambd (float): The weight for the align loss.
+ init_cfg (dict, optional): Initialization config dict.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ tokenizer_path: str,
+ lambd: float,
+ init_cfg: dict = None) -> None:
+ super(CAEHead, self).__init__(init_cfg=init_cfg)
+ self.tokenizer_path = tokenizer_path
+ self.lambd = lambd
+ self.encoder = self._load_encoder()
+ self.loss_cross_entropy = nn.CrossEntropyLoss()
+ self.loss_mse = nn.MSELoss()
+
+ def _load_encoder(self) -> nn.Module:
+ encoder = Encoder()
+ if os.path.exists(self.tokenizer_path):
+ state_dict = torch.load(self.tokenizer_path)
+ encoder.load_state_dict(state_dict)
+ else:
+ warnings.warn(
+ f'Cannot find {self.tokenizer_path}, please download it from https://download.openmmlab.com/mmselfsup/cae/dalle_encoder.pth' # noqa: E501
+ )
+ return encoder
+
+ @torch.no_grad()
+ def _generate_target(self, img_target: torch.Tensor) -> torch.Tensor:
+ logits = self.encoder(img_target)
+ target = torch.argmax(logits, dim=1)
+ return target.flatten(1)
+
+ def forward(self, img_target: torch.Tensor, outputs: torch.Tensor,
+ latent_pred: torch.Tensor, latent_target: torch.Tensor,
+ mask: torch.Tensor) -> dict:
+ losses = dict()
+ target = self._generate_target(img_target)
+ target = target[mask]
+ loss_main = self.loss_cross_entropy(outputs, target)
+ loss_align = self.loss_mse(latent_pred,
+ latent_target.detach()) * self.lambd
+
+ losses['loss'] = loss_main + loss_align
+ losses['main'] = loss_main
+ losses['align'] = loss_align
+
+ return losses
diff --git a/mmselfsup/models/heads/latent_pred_head.py b/mmselfsup/models/heads/latent_pred_head.py
index 03b2176d4..2497fad40 100644
--- a/mmselfsup/models/heads/latent_pred_head.py
+++ b/mmselfsup/models/heads/latent_pred_head.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
-from mmcv.runner import BaseModule
+from mmcv.runner import BaseModule, get_dist_info
from ..builder import HEADS, build_neck
@@ -15,14 +15,14 @@ class LatentPredictHead(BaseModule):
It also implements similarity loss between two forward features.
Args:
- predictor (dict): Config dict for module of predictor.
+ predictor (dict): Config dict for the predictor.
"""
- def __init__(self, predictor):
+ def __init__(self, predictor: dict) -> None:
super(LatentPredictHead, self).__init__()
self.predictor = build_neck(predictor)
- def forward(self, input, target):
+ def forward(self, input: torch.Tensor, target: torch.Tensor) -> dict:
"""Forward head.
Args:
@@ -51,15 +51,21 @@ class LatentClsHead(BaseModule):
init_cfg (dict or list[dict], optional): Initialization config dict.
"""
- def __init__(self,
- in_channels,
- num_classes,
- init_cfg=dict(type='Normal', std=0.01, layer='Linear')):
+ def __init__(
+ self,
+ in_channels: int,
+ num_classes: int,
+ init_cfg: dict = dict(
+ type='Normal',
+ std=0.01,
+ layer='Linear',
+ )
+ ) -> None:
super(LatentClsHead, self).__init__(init_cfg)
self.predictor = nn.Linear(in_channels, num_classes)
self.criterion = nn.CrossEntropyLoss()
- def forward(self, input, target):
+ def forward(self, input: torch.Tensor, target: torch.Tensor) -> dict:
"""Forward head.
Args:
@@ -74,3 +80,52 @@ def forward(self, input, target):
label = torch.argmax(self.predictor(target), dim=1).detach()
loss = self.criterion(pred, label)
return dict(loss=loss)
+
+
+@HEADS.register_module()
+class LatentCrossCorrelationHead(BaseModule):
+ """Head for latent feature cross correlation. Part of the code is borrowed
+ from:
+ `https://github.com/facebookresearch/barlowtwins/blob/main/main.py>`_.
+
+ Args:
+ in_channels (int): Number of input channels.
+ lambd (float): Weight on off-diagonal terms. Defaults to 0.0051.
+ """
+
+ def __init__(self, in_channels: int, lambd: float = 0.0051) -> None:
+ super(LatentCrossCorrelationHead, self).__init__()
+ self.lambd = lambd
+ _, self.world_size = get_dist_info()
+ self.bn = nn.BatchNorm1d(in_channels, affine=False)
+
+ def forward(self, input: torch.Tensor, target: torch.Tensor) -> dict:
+ """Forward head.
+
+ Args:
+ input (Tensor): NxC input features.
+ target (Tensor): NxC target features.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components.
+ """
+ # cross-correlation matrix
+ cross_correlation_matrix = self.bn(input).T @ self.bn(target)
+ cross_correlation_matrix.div_(input.size(0) * self.world_size)
+
+ if torch.distributed.is_initialized():
+ torch.distributed.all_reduce(cross_correlation_matrix)
+
+ # loss
+ on_diag = torch.diagonal(cross_correlation_matrix).add_(-1).pow_(
+ 2).sum()
+ off_diag = self.off_diagonal(cross_correlation_matrix).pow_(2).sum()
+ loss = on_diag + self.lambd * off_diag
+ return dict(loss=loss)
+
+ def off_diagonal(self, x: torch.Tensor) -> torch.Tensor:
+ """Rreturn a flattened view of the off-diagonal elements of a square
+ matrix."""
+ n, m = x.shape
+ assert n == m
+ return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()
diff --git a/mmselfsup/models/heads/mae_head.py b/mmselfsup/models/heads/mae_head.py
index cb8b566c1..cfaf2161d 100644
--- a/mmselfsup/models/heads/mae_head.py
+++ b/mmselfsup/models/heads/mae_head.py
@@ -80,3 +80,37 @@ def loss(self, outputs, labels):
losses['loss'] = self.criterion(outputs[0], labels)
return losses
+
+
+@HEADS.register_module()
+class MAELinprobeHead(BaseModule):
+ """Linear probing head for MAE.
+
+ Args:
+ embed_dim (int): The dim of the feature before the classifier head.
+ num_classes (int): The total number of classes. Defaults to 1000.
+ """
+
+ def __init__(self, embed_dim, num_classes=1000):
+ super(MAELinprobeHead, self).__init__()
+ self.head = nn.Linear(embed_dim, num_classes)
+ self.bn = nn.BatchNorm1d(embed_dim, affine=False, eps=1e-6)
+ self.criterion = nn.CrossEntropyLoss()
+
+ def init_weights(self):
+ nn.init.constant_(self.head.bias, 0)
+ trunc_normal_(self.head.weight, std=0.01)
+
+ def forward(self, x):
+ """"Get the logits."""
+ x = self.bn(x)
+ outputs = self.head(x)
+
+ return [outputs]
+
+ def loss(self, outputs, labels):
+ """Compute the loss."""
+ losses = dict()
+ losses['loss'] = self.criterion(outputs[0], labels)
+
+ return losses
diff --git a/mmselfsup/models/necks/__init__.py b/mmselfsup/models/necks/__init__.py
index 8a1362bca..9cbbcb975 100644
--- a/mmselfsup/models/necks/__init__.py
+++ b/mmselfsup/models/necks/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .avgpool2d_neck import AvgPool2dNeck
+from .cae_neck import CAENeck
from .densecl_neck import DenseCLNeck
from .linear_neck import LinearNeck
from .mae_neck import MAEPretrainDecoder
@@ -13,5 +14,5 @@
__all__ = [
'AvgPool2dNeck', 'DenseCLNeck', 'LinearNeck', 'MoCoV2Neck',
'NonLinearNeck', 'ODCNeck', 'RelativeLocNeck', 'SwAVNeck',
- 'MAEPretrainDecoder', 'SimMIMNeck'
+ 'MAEPretrainDecoder', 'SimMIMNeck', 'CAENeck'
]
diff --git a/mmselfsup/models/necks/cae_neck.py b/mmselfsup/models/necks/cae_neck.py
new file mode 100644
index 000000000..6f8e2b682
--- /dev/null
+++ b/mmselfsup/models/necks/cae_neck.py
@@ -0,0 +1,168 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.utils.weight_init import trunc_normal_
+from mmcv.runner import BaseModule
+
+from ..builder import NECKS
+from ..utils import CAETransformerRegressorLayer, TransformerEncoderLayer
+
+
+@NECKS.register_module()
+class CAENeck(BaseModule):
+ """Neck for CAE Pre-training.
+
+ This module constructs the latent-prediction regressor and the decoder
+ that produce the latent prediction and the final token prediction.
+
+ Args:
+ patch_size (int): The patch size of each token. Defaults to 16.
+ num_classes (int): The number of classes for final prediction. Defaults
+ to 8192.
+ embed_dims (int): The embed dims of latent feature in regressor and
+ decoder. Defaults to 768.
+ regressor_depth (int): The number of regressor blocks. Defaults to 6.
+ decoder_depth (int): The number of decoder blocks. Defaults to 8.
+ num_heads (int): The number of heads in multi-head attention. Defaults
+ to 12.
+ mlp_ratio (int): The expansion ratio of latent features in the MLP.
+ Defaults to 4.
+ qkv_bias (bool): Whether or not to use qkv bias. Defaults to True.
+ qk_scale (float, optional): The scale applied to the results of qk.
+ Defaults to None.
+ drop_rate (float): The dropout rate. Defaults to 0.
+ attn_drop_rate (float): The dropout rate in attention block. Defaults
+ to 0.
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
+ norm_cfg (dict): The config of normalization layer. Defaults to
+ dict(type='LN', eps=1e-6).
+ init_values (float, optional): The init value of gamma. Defaults to
+ None.
+ mask_tokens_num (int): The number of mask tokens. Defaults to 75.
+ init_cfg (dict, optional): Initialization config dict.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ patch_size: int = 16,
+ num_classes: int = 8192,
+ embed_dims: int = 768,
+ regressor_depth: int = 6,
+ decoder_depth: int = 8,
+ num_heads: int = 12,
+ mlp_ratio: int = 4,
+ qkv_bias: bool = True,
+ qk_scale: float = None,
+ drop_rate: float = 0.,
+ attn_drop_rate: float = 0.,
+ drop_path_rate: float = 0.,
+ norm_cfg: dict = dict(type='LN', eps=1e-6),
+ init_values: float = None,
+ mask_tokens_num: int = 75,
+ init_cfg: dict = None) -> None:
+ super().__init__(init_cfg=init_cfg)
+
+ self.num_features = self.embed_dim = embed_dims
+ self.patch_size = patch_size
+ self.mask_token_num = mask_tokens_num
+
+ # regressor
+ regressor_drop_path_rates = [
+ x.item()
+ for x in torch.linspace(0, drop_path_rate, regressor_depth)
+ ]
+ self.regressors = nn.ModuleList([
+ CAETransformerRegressorLayer(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ feedforward_channels=mlp_ratio * embed_dims,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop_rate=drop_rate,
+ attn_drop_rate=attn_drop_rate,
+ drop_path_rate=regressor_drop_path_rates[i],
+ norm_cfg=norm_cfg,
+ init_values=init_values) for i in range(regressor_depth)
+ ])
+
+ # decoder
+ decoder_drop_path_rates = [
+ x.item() for x in torch.linspace(0, drop_path_rate, decoder_depth)
+ ]
+
+ self.decoders = nn.ModuleList([
+ TransformerEncoderLayer(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ feedforward_channels=mlp_ratio * embed_dims,
+ qkv_bias=qkv_bias,
+ drop_rate=drop_rate,
+ attn_drop_rate=attn_drop_rate,
+ drop_path_rate=decoder_drop_path_rates[i],
+ norm_cfg=norm_cfg,
+ init_values=init_values) for i in range(decoder_depth)
+ ])
+
+ _, self.norm_regressor = build_norm_layer(
+ norm_cfg, embed_dims, postfix=2)
+ _, self.norm_decoder = build_norm_layer(
+ norm_cfg, embed_dims, postfix=2)
+
+ self.head = nn.Linear(
+ embed_dims, num_classes) if num_classes > 0 else nn.Identity()
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
+
+ def init_weights(self) -> None:
+ super(CAENeck, self).init_weights()
+ self.apply(self._init_weights)
+ trunc_normal_(self.mask_token, std=0.02)
+ trunc_normal_(self.head.weight, std=0.02)
+
+ def _init_weights(self, m: nn.Module) -> None:
+ if isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def forward(
+ self, x_unmasked: torch.Tensor, pos_embed_masked: torch.Tensor,
+ pos_embed_unmasked: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Get the latent prediction and final prediction.
+
+ Args:
+ x_unmasked (torch.Tensor): Features of unmasked tokens.
+ pos_embed_masked (torch.Tensor): Position embedding of masked
+ tokens.
+ pos_embed_unmasked (torch.Tensor): Position embedding of unmasked
+ tokens.
+
+ Returns:
+ Tuple[torch.Tensor, torch.Tensor]: Final prediction and latent
+ prediction.
+ """
+ x_masked = self.mask_token.expand(x_unmasked.shape[0],
+ self.mask_token_num, -1)
+ # regressor
+ for regressor in self.regressors:
+ x_masked = regressor(
+ x_masked, torch.cat([x_unmasked, x_masked], dim=1),
+ pos_embed_masked,
+ torch.cat([pos_embed_unmasked, pos_embed_masked], dim=1))
+ x_masked = self.norm_regressor(x_masked)
+ latent_pred = x_masked
+
+ # decoder
+ x_masked = x_masked + pos_embed_masked
+ for decoder in self.decoders:
+ x_masked = decoder(x_masked)
+ x_masked = self.norm_decoder(x_masked)
+
+ logits = self.head(x_masked)
+
+ return logits, latent_pred
diff --git a/mmselfsup/models/utils/__init__.py b/mmselfsup/models/utils/__init__.py
index 32868a4fb..55fb6982a 100644
--- a/mmselfsup/models/utils/__init__.py
+++ b/mmselfsup/models/utils/__init__.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .accuracy import Accuracy, accuracy
+from .dall_e import Encoder
from .extract_process import ExtractProcess, MultiExtractProcess
from .gather_layer import GatherLayer
from .knn_classifier import knn_classifier
@@ -7,9 +8,12 @@
from .multi_prototypes import MultiPrototypes
from .position_embedding import build_2d_sincos_position_embedding
from .sobel import Sobel
+from .transformer_blocks import (CAETransformerRegressorLayer,
+ MultiheadAttention, TransformerEncoderLayer)
__all__ = [
'Accuracy', 'accuracy', 'ExtractProcess', 'MultiExtractProcess',
'GatherLayer', 'knn_classifier', 'MultiPooling', 'MultiPrototypes',
- 'build_2d_sincos_position_embedding', 'Sobel'
+ 'build_2d_sincos_position_embedding', 'Sobel', 'MultiheadAttention',
+ 'TransformerEncoderLayer', 'CAETransformerRegressorLayer', 'Encoder'
]
diff --git a/mmselfsup/models/utils/dall_e.py b/mmselfsup/models/utils/dall_e.py
new file mode 100644
index 000000000..a026d8071
--- /dev/null
+++ b/mmselfsup/models/utils/dall_e.py
@@ -0,0 +1,174 @@
+# Copyright (c)
+# https://github.com/microsoft/unilm/blob/master/beit/dall_e/encoder.py
+# Copied from BEiT
+import math
+from collections import OrderedDict
+from functools import partial
+
+import attr
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+@attr.s(eq=False)
+class Conv2d(nn.Module):
+ n_in: int = attr.ib(validator=lambda i, a, x: x >= 1)
+ n_out: int = attr.ib(validator=lambda i, a, x: x >= 1)
+ kw: int = attr.ib(validator=lambda i, a, x: x >= 1 and x % 2 == 1)
+
+ use_float16: bool = attr.ib(default=True)
+ device: torch.device = attr.ib(default=torch.device('cpu'))
+ requires_grad: bool = attr.ib(default=False)
+
+ def __attrs_post_init__(self) -> None:
+ super().__init__()
+
+ w = torch.empty((self.n_out, self.n_in, self.kw, self.kw),
+ dtype=torch.float32,
+ device=self.device,
+ requires_grad=self.requires_grad)
+ w.normal_(std=1 / math.sqrt(self.n_in * self.kw**2))
+
+ b = torch.zeros((self.n_out, ),
+ dtype=torch.float32,
+ device=self.device,
+ requires_grad=self.requires_grad)
+ self.w, self.b = nn.Parameter(w), nn.Parameter(b)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if self.use_float16 and 'cuda' in self.w.device.type:
+ if x.dtype != torch.float16:
+ x = x.half()
+
+ w, b = self.w.half(), self.b.half()
+ else:
+ if x.dtype != torch.float32:
+ x = x.float()
+
+ w, b = self.w, self.b
+
+ return F.conv2d(x, w, b, padding=(self.kw - 1) // 2)
+
+
+@attr.s(eq=False, repr=False)
+class EncoderBlock(nn.Module):
+ n_in: int = attr.ib(validator=lambda i, a, x: x >= 1)
+ n_out: int = attr.ib(validator=lambda i, a, x: x >= 1 and x % 4 == 0)
+ n_layers: int = attr.ib(validator=lambda i, a, x: x >= 1)
+
+ device: torch.device = attr.ib(default=None)
+ requires_grad: bool = attr.ib(default=False)
+
+ def __attrs_post_init__(self) -> None:
+ super().__init__()
+ self.n_hid = self.n_out // 4
+ self.post_gain = 1 / (self.n_layers**2)
+
+ make_conv = partial(
+ Conv2d, device=self.device, requires_grad=self.requires_grad)
+ self.id_path = make_conv(
+ self.n_in, self.n_out,
+ 1) if self.n_in != self.n_out else nn.Identity()
+ self.res_path = nn.Sequential(
+ OrderedDict([
+ ('relu_1', nn.ReLU()),
+ ('conv_1', make_conv(self.n_in, self.n_hid, 3)),
+ ('relu_2', nn.ReLU()),
+ ('conv_2', make_conv(self.n_hid, self.n_hid, 3)),
+ ('relu_3', nn.ReLU()),
+ ('conv_3', make_conv(self.n_hid, self.n_hid, 3)),
+ ('relu_4', nn.ReLU()),
+ ('conv_4', make_conv(self.n_hid, self.n_out, 1)),
+ ]))
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.id_path(x) + self.post_gain * self.res_path(x)
+
+
+@attr.s(eq=False, repr=False)
+class Encoder(nn.Module):
+ group_count: int = 4
+ n_hid: int = attr.ib(default=256, validator=lambda i, a, x: x >= 64)
+ n_blk_per_group: int = attr.ib(default=2, validator=lambda i, a, x: x >= 1)
+ input_channels: int = attr.ib(default=3, validator=lambda i, a, x: x >= 1)
+ vocab_size: int = attr.ib(default=8192, validator=lambda i, a, x: x >= 512)
+
+ device: torch.device = attr.ib(default=torch.device('cpu'))
+ requires_grad: bool = attr.ib(default=False)
+ use_mixed_precision: bool = attr.ib(default=True)
+
+ def __attrs_post_init__(self) -> None:
+ super().__init__()
+
+ blk_range = range(self.n_blk_per_group)
+ n_layers = self.group_count * self.n_blk_per_group
+ make_conv = partial(
+ Conv2d, device=self.device, requires_grad=self.requires_grad)
+ make_blk = partial(
+ EncoderBlock,
+ n_layers=n_layers,
+ device=self.device,
+ requires_grad=self.requires_grad)
+
+ self.blocks = nn.Sequential(
+ OrderedDict([
+ ('input', make_conv(self.input_channels, 1 * self.n_hid, 7)),
+ ('group_1',
+ nn.Sequential(
+ OrderedDict([
+ *[(f'block_{i + 1}',
+ make_blk(1 * self.n_hid, 1 * self.n_hid))
+ for i in blk_range],
+ ('pool', nn.MaxPool2d(kernel_size=2)),
+ ]))),
+ ('group_2',
+ nn.Sequential(
+ OrderedDict([
+ *[(f'block_{i + 1}',
+ make_blk(
+ 1 * self.n_hid if i == 0 else 2 * self.n_hid,
+ 2 * self.n_hid)) for i in blk_range],
+ ('pool', nn.MaxPool2d(kernel_size=2)),
+ ]))),
+ ('group_3',
+ nn.Sequential(
+ OrderedDict([
+ *[(f'block_{i + 1}',
+ make_blk(
+ 2 * self.n_hid if i == 0 else 4 * self.n_hid,
+ 4 * self.n_hid)) for i in blk_range],
+ ('pool', nn.MaxPool2d(kernel_size=2)),
+ ]))),
+ ('group_4',
+ nn.Sequential(
+ OrderedDict([
+ *[(f'block_{i + 1}',
+ make_blk(
+ 4 * self.n_hid if i == 0 else 8 * self.n_hid,
+ 8 * self.n_hid)) for i in blk_range],
+ ]))),
+ ('output',
+ nn.Sequential(
+ OrderedDict([
+ ('relu', nn.ReLU()),
+ ('conv',
+ make_conv(
+ 8 * self.n_hid,
+ self.vocab_size,
+ 1,
+ use_float16=False)),
+ ]))),
+ ]))
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = x.float()
+ if len(x.shape) != 4:
+ raise ValueError(f'input shape {x.shape} is not 4d')
+ if x.shape[1] != self.input_channels:
+ raise ValueError(f'input has {x.shape[1]} channels but model '
+ f'built for {self.input_channels}')
+ if x.dtype != torch.float32:
+ raise ValueError('input must have dtype torch.float32')
+
+ return self.blocks(x)
diff --git a/mmselfsup/models/utils/transformer_blocks.py b/mmselfsup/models/utils/transformer_blocks.py
new file mode 100644
index 000000000..09142cd22
--- /dev/null
+++ b/mmselfsup/models/utils/transformer_blocks.py
@@ -0,0 +1,528 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+from mmcls.models.backbones.vision_transformer import \
+ TransformerEncoderLayer as _TransformerEncoderLayer
+from mmcls.models.utils import MultiheadAttention as _MultiheadAttention
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.drop import build_dropout
+from mmcv.cnn.bricks.transformer import FFN
+from mmcv.runner.base_module import BaseModule
+from torch.nn import functional as F
+
+
+class MultiheadAttention(_MultiheadAttention):
+ """Multi-head Attention Module.
+
+ This module rewrites MultiheadAttention by replacing the qkv bias with a
+ customized qkv bias and removing the drop path layer.
+
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ input_dims (int, optional): The input dimension, and if None,
+ use ``embed_dims``. Defaults to None.
+ attn_drop (float): Dropout rate of the dropout layer after the
+ attention calculation of query and key. Defaults to 0.
+ proj_drop (float): Dropout rate of the dropout layer after the
+ output projection. Defaults to 0.
+ qkv_bias (bool): If True, add a learnable bias to q, k, v.
+ Defaults to True.
+ qk_scale (float, optional): Override default qk scale of
+ ``head_dim ** -0.5`` if set. Defaults to None.
+ proj_bias (bool): If True, add a learnable bias to the output
+ projection. Defaults to True.
+ init_cfg (dict, optional): The Config for initialization.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ embed_dims: int,
+ num_heads: int,
+ input_dims: int = None,
+ attn_drop: float = 0.,
+ proj_drop: float = 0.,
+ qkv_bias: bool = True,
+ qk_scale: float = None,
+ proj_bias: bool = True,
+ init_cfg: dict = None) -> None:
+ super(MultiheadAttention, self).__init__(
+ embed_dims,
+ num_heads=num_heads,
+ input_dims=input_dims,
+ attn_drop=attn_drop,
+ proj_drop=proj_drop,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ proj_bias=proj_bias,
+ init_cfg=init_cfg)
+
+ del self.out_drop
+ self.qkv = nn.Linear(self.input_dims, embed_dims * 3, bias=False)
+ if qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(embed_dims))
+ self.v_bias = nn.Parameter(torch.zeros(embed_dims))
+ else:
+ self.q_bias = None
+ self.k_bias = None
+ self.v_bias = None
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # qkv bias is different from that in mmcls
+ qkv_bias = None
+ if self.q_bias is not None:
+ qkv_bias = torch.cat(
+ (self.q_bias,
+ torch.zeros_like(self.v_bias,
+ requires_grad=False), self.v_bias))
+ B, N, _ = x.shape
+ qkv = F.linear(
+ x, weight=self.qkv.weight,
+ bias=qkv_bias).reshape(B, N, 3, self.num_heads,
+ self.head_dims).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, self.embed_dims)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+
+ return x
+
+
+class MultiheadAttentionWithRPE(MultiheadAttention):
+ """Multi-head Attention Module.
+
+ This module rewrites the MultiheadAttention in MMSelfSup by adding the
+ relative position bias.
+
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ window_size (int): The window size of the relative position bias.
+ input_dims (int, optional): The input dimension, and if None,
+ use ``embed_dims``. Defaults to None.
+ attn_drop (float): Dropout rate of the dropout layer after the
+ attention calculation of query and key. Defaults to 0.
+ proj_drop (float): Dropout rate of the dropout layer after the
+ output projection. Defaults to 0.
+ qkv_bias (bool): If True, add a learnable bias to q, k, v.
+ Defaults to True.
+ qk_scale (float, optional): Override default qk scale of
+ ``head_dim ** -0.5`` if set. Defaults to None.
+ proj_bias (bool): If True, add a learnable bias to the output
+ projection. Defaults to True.
+ init_cfg (dict, optional): The Config for initialization.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ embed_dims: int,
+ num_heads: int,
+ window_size: Sequence,
+ input_dims: int = None,
+ attn_drop: float = 0,
+ proj_drop: float = 0,
+ qkv_bias: bool = True,
+ qk_scale: float = None,
+ proj_bias: bool = True,
+ init_cfg: dict = None) -> None:
+ super().__init__(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ input_dims=input_dims,
+ attn_drop=attn_drop,
+ proj_drop=proj_drop,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ proj_bias=proj_bias,
+ init_cfg=init_cfg)
+
+ self.qkv = nn.Linear(self.input_dims, embed_dims * 3, bias=False)
+ if qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(embed_dims))
+ self.v_bias = nn.Parameter(torch.zeros(embed_dims))
+ else:
+ self.q_bias = None
+ self.k_bias = None
+ self.v_bias = None
+
+ assert isinstance(window_size, Sequence)
+ self.window_size = window_size
+ self.num_relative_distance = (2 * window_size[0] -
+ 1) * (2 * window_size[1] - 1) + 3
+ # relative_position_bias_table shape is (2*Wh-1 * 2*Ww-1 + 3, nH)
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros(self.num_relative_distance, num_heads))
+
+ # get pair-wise relative position index for
+ # each token inside the window
+ coords_h = torch.arange(window_size[0])
+ coords_w = torch.arange(window_size[1])
+ # coords shape is (2, Wh, Ww)
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
+ # coords_flatten shape is (2, Wh*Ww)
+ coords_flatten = torch.flatten(coords, 1)
+ relative_coords = (
+ coords_flatten[:, :, None] - coords_flatten[:, None, :])
+ # relative_coords shape is (Wh*Ww, Wh*Ww, 2)
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+ # shift to start from 0
+ relative_coords[:, :, 0] += window_size[0] - 1
+ relative_coords[:, :, 1] += window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+ relative_position_index = torch.zeros(
+ size=(window_size[0] * window_size[1] + 1, ) * 2,
+ dtype=relative_coords.dtype)
+
+ # relative_position_index shape is (Wh*Ww, Wh*Ww)
+ relative_position_index[1:, 1:] = relative_coords.sum(-1)
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
+ relative_position_index[0, 0] = self.num_relative_distance - 1
+
+ self.register_buffer('relative_position_index',
+ relative_position_index)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ qkv_bias = None
+ if self.q_bias is not None:
+ qkv_bias = torch.cat(
+ (self.q_bias,
+ torch.zeros_like(self.v_bias,
+ requires_grad=False), self.v_bias))
+ B, N, _ = x.shape
+ qkv = F.linear(
+ x, weight=self.qkv.weight,
+ bias=qkv_bias).reshape(B, N, 3, self.num_heads,
+ self.head_dims).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1))
+
+ if self.relative_position_bias_table is not None:
+ relative_position_bias = \
+ self.relative_position_bias_table[
+ self.relative_position_index.view(-1)].view(
+ self.window_size[0] * self.window_size[1] + 1,
+ self.window_size[0] * self.window_size[1] + 1, -1)
+ relative_position_bias = relative_position_bias.permute(
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
+ attn = attn + relative_position_bias.unsqueeze(0)
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, self.embed_dims)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class TransformerEncoderLayer(_TransformerEncoderLayer):
+ """Implements one encoder layer in Vision Transformer.
+
+ This module is a rewritten version of the TransformerEncoderLayer in
+ MMClassification that adds layer scale (gamma) and an optional relative
+ position bias to the attention module.
+
+ Args:
+ embed_dims (int): The feature dimension.
+ num_heads (int): Parallel attention heads.
+ feedforward_channels (int): The hidden dimension for FFNs.
+ window_size (Sequence, optional): The height and width of the patch
+ grid for relative position bias. If None, plain attention without
+ relative position bias is used. Defaults to None.
+ drop_rate (float): Probability of an element to be zeroed
+ after the feed forward layer. Defaults to 0.
+ attn_drop_rate (float): The drop out rate for attention output weights.
+ Defaults to 0.
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
+ num_fcs (int): The number of fully-connected layers for FFNs.
+ Defaults to 2.
+ qkv_bias (bool): enable bias for qkv if True. Defaults to True.
+ act_cfg (dict): The activation config for FFNs.
+ Defaults to ``dict(type='GELU')``.
+ norm_cfg (dict): Config dict for normalization layer.
+ Defaults to ``dict(type='LN')``.
+ init_values (float): The init values of gamma. Defaults to 0.0.
+ init_cfg (dict, optional): Initialization config dict.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ embed_dims: int,
+ num_heads: int,
+ feedforward_channels: int,
+ window_size: Sequence = None,
+ drop_rate: float = 0.,
+ attn_drop_rate: float = 0.,
+ drop_path_rate: float = 0.,
+ num_fcs: int = 2,
+ qkv_bias: bool = True,
+ act_cfg: dict = dict(type='GELU'),
+ norm_cfg: dict = dict(type='LN'),
+ init_values: float = 0.0,
+ init_cfg: dict = None) -> None:
+ super(TransformerEncoderLayer, self).__init__(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ feedforward_channels=feedforward_channels,
+ drop_rate=drop_rate,
+ attn_drop_rate=attn_drop_rate,
+ drop_path_rate=drop_path_rate,
+ num_fcs=num_fcs,
+ qkv_bias=qkv_bias,
+ act_cfg=act_cfg,
+ norm_cfg=norm_cfg,
+ init_cfg=init_cfg)
+ self.embed_dims = embed_dims
+
+ self.norm1_name, norm1 = build_norm_layer(
+ norm_cfg, self.embed_dims, postfix=1)
+ self.add_module(self.norm1_name, norm1)
+ if window_size is None:
+ # attention without relative position bias
+ self.attn = MultiheadAttention(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ attn_drop=attn_drop_rate,
+ proj_drop=drop_rate,
+ qkv_bias=qkv_bias)
+ else:
+ # attention with relative position bias
+ self.attn = MultiheadAttentionWithRPE(
+ embed_dims=embed_dims,
+ num_heads=num_heads,
+ window_size=window_size,
+ attn_drop=attn_drop_rate,
+ proj_drop=drop_rate,
+ qkv_bias=qkv_bias)
+
+ self.norm2_name, norm2 = build_norm_layer(
+ norm_cfg, self.embed_dims, postfix=2)
+ self.add_module(self.norm2_name, norm2)
+
+ self.ffn = FFN(
+ embed_dims=embed_dims,
+ feedforward_channels=feedforward_channels,
+ num_fcs=num_fcs,
+ ffn_drop=drop_rate,
+ dropout_layer=None,
+ act_cfg=act_cfg,
+ add_identity=False)
+
+ self.drop_path = build_dropout(
+ dict(type='DropPath', drop_prob=drop_path_rate))
+
+ if init_values is not None and init_values > 0:
+ self.gamma_1 = nn.Parameter(
+ init_values * torch.ones((embed_dims)), requires_grad=True)
+ self.gamma_2 = nn.Parameter(
+ init_values * torch.ones((embed_dims)), requires_grad=True)
+ else:
+ self.gamma_1, self.gamma_2 = None, None
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if self.gamma_1 is not None:
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.gamma_2 * self.ffn(self.norm2(x)))
+ else:
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.ffn(self.norm2(x)))
+ return x
+
+
+class CAETransformerRegressorLayer(BaseModule):
+ """Transformer layer for the regressor of CAE.
+
+ Unlike a conventional transformer encoder layer, its queries are the
+ mask tokens, while its keys and values are the concatenation of the
+ mask tokens and the visible (unmasked) tokens.
+
+ Args:
+ embed_dims (int): The feature dimension.
+ num_heads (int): The number of heads in multi-head attention.
+ feedforward_channels (int): The hidden dimension of FFNs.
+ num_fcs (int, optional): The number of fully-connected layers in
+ FFNs. Defaults to 2.
+ qkv_bias (bool): If True, add a learnable bias to q, k, v.
+ Defaults to True.
+ qk_scale (float, optional): Override default qk scale of
+ ``head_dim ** -0.5`` if set. Defaults to None.
+ drop_rate (float): The dropout rate. Defaults to 0.0.
+ attn_drop_rate (float): The drop out rate for attention output weights.
+ Defaults to 0.
+ drop_path_rate (float): Stochastic depth rate. Defaults to 0.
+ init_values (float): The init values of gamma. Defaults to 0.0.
+ act_cfg (dict): The activation config for FFNs.
+ Defaults to ``dict(type='GELU')``.
+ norm_cfg (dict): Config dict for normalization layer.
+ Defaults to ``dict(type='LN')``.
+ """
+
+ def __init__(
+ self,
+ embed_dims: int,
+ num_heads: int,
+ feedforward_channels: int,
+ num_fcs: int = 2,
+ qkv_bias: bool = False,
+ qk_scale: float = None,
+ drop_rate: float = 0.,
+ attn_drop_rate: float = 0.,
+ drop_path_rate: float = 0.,
+ init_values: float = 0.0,
+ act_cfg: dict = dict(type='GELU'),
+ norm_cfg: dict = dict(type='LN', eps=1e-6)
+ ) -> None:
+ super().__init__()
+
+ # NOTE: cross attention
+ _, self.norm1_q_cross = build_norm_layer(
+ norm_cfg, embed_dims, postfix=2)
+ _, self.norm1_k_cross = build_norm_layer(
+ norm_cfg, embed_dims, postfix=2)
+ _, self.norm1_v_cross = build_norm_layer(
+ norm_cfg, embed_dims, postfix=2)
+ _, self.norm2_cross = build_norm_layer(norm_cfg, embed_dims, postfix=2)
+ self.cross_attn = CrossMultiheadAttention(
+ embed_dims,
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attn_drop=attn_drop_rate,
+ proj_drop=drop_rate)
+
+ self.ffn = FFN(
+ embed_dims=embed_dims,
+ feedforward_channels=feedforward_channels,
+ num_fcs=num_fcs,
+ ffn_drop=drop_rate,
+ dropout_layer=None,
+ act_cfg=act_cfg,
+ add_identity=False)
+
+ self.drop_path = build_dropout(
+ dict(type='DropPath', drop_prob=drop_path_rate))
+
+ if init_values is not None and init_values > 0:
+ self.gamma_1_cross = nn.Parameter(
+ init_values * torch.ones((embed_dims)), requires_grad=True)
+ self.gamma_2_cross = nn.Parameter(
+ init_values * torch.ones((embed_dims)), requires_grad=True)
+ else:
+ self.gamma_1_cross = nn.Parameter(
+ torch.ones((embed_dims)), requires_grad=False)
+ self.gamma_2_cross = nn.Parameter(
+ torch.ones((embed_dims)), requires_grad=False)
+
+ def forward(self, x_q: torch.Tensor, x_kv: torch.Tensor,
+ pos_q: torch.Tensor, pos_k: torch.Tensor) -> torch.Tensor:
+ x = x_q + self.drop_path(self.gamma_1_cross * self.cross_attn(
+ self.norm1_q_cross(x_q + pos_q),
+ k=self.norm1_k_cross(x_kv + pos_k),
+ v=self.norm1_v_cross(x_kv)))
+ x = self.norm2_cross(x)
+ x = x + self.drop_path(self.gamma_2_cross * self.ffn(x))
+
+ return x
+
+
+class CrossMultiheadAttention(BaseModule):
+ """Cross attention between queries and the union of keys and values.
+
+ Unlike ``MultiheadAttention``, the queries here come from one token set
+ (the mask tokens) while the keys and values come from another (the
+ concatenation of visible and mask tokens).
+
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ qkv_bias (bool): If True, add a learnable bias to q, k, v.
+ Defaults to True.
+ qk_scale (float, optional): Override default qk scale of
+ ``head_dim ** -0.5`` if set. Defaults to None.
+ attn_drop (float): Dropout rate of the dropout layer after the
+ attention calculation of query and key. Defaults to 0.
+ proj_drop (float): Dropout rate of the dropout layer after the
+ output projection. Defaults to 0.
+ """
+
+ def __init__(self,
+ embed_dims: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ qk_scale: float = None,
+ attn_drop: float = 0.,
+ proj_drop: float = 0.) -> None:
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = embed_dims // num_heads
+ self.scale = qk_scale or head_dim**-0.5
+
+ self.q = nn.Linear(embed_dims, embed_dims, bias=False)
+ self.k = nn.Linear(embed_dims, embed_dims, bias=False)
+ self.v = nn.Linear(embed_dims, embed_dims, bias=False)
+
+ if qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(embed_dims))
+ self.v_bias = nn.Parameter(torch.zeros(embed_dims))
+ else:
+ self.q_bias = None
+ self.k_bias = None
+ self.v_bias = None
+
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(embed_dims, embed_dims)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self,
+ x: torch.Tensor,
+ k: torch.Tensor = None,
+ v: torch.Tensor = None) -> torch.Tensor:
+ B, N, _ = x.shape
+
+ N_k = k.shape[1]
+ N_v = v.shape[1]
+
+ q_bias, k_bias, v_bias = None, None, None
+ if self.q_bias is not None:
+ q_bias = self.q_bias
+ k_bias = torch.zeros_like(self.v_bias, requires_grad=False)
+ v_bias = self.v_bias
+
+ q = F.linear(
+ input=x, weight=self.q.weight, bias=q_bias) # (B, N_q, dim)
+ k = F.linear(
+ input=k, weight=self.k.weight, bias=k_bias) # (B, N_k, dim)
+ v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
+
+ q = q.reshape(B, N, 1, self.num_heads,
+ -1).permute(2, 0, 3, 1,
+ 4).squeeze(0) # (B, num_heads, N_q, dim)
+ k = k.reshape(B, N_k, 1, self.num_heads,
+ -1).permute(2, 0, 3, 1,
+ 4).squeeze(0) # (B, num_heads, N_k, dim)
+ v = v.reshape(B, N_v, 1, self.num_heads,
+ -1).permute(2, 0, 3, 1,
+ 4).squeeze(0) # (B, num_heads, N_v, dim)
+
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+
+ return x
diff --git a/mmselfsup/version.py b/mmselfsup/version.py
index 1fd5e1596..09c166cc2 100644
--- a/mmselfsup/version.py
+++ b/mmselfsup/version.py
@@ -1,6 +1,6 @@
# Copyright (c) Open-MMLab. All rights reserved.
-__version__ = '0.8.0'
+__version__ = '0.9.0'
def parse_version_info(version_str):
diff --git a/model-index.yml b/model-index.yml
index 727cd68e5..4993c013c 100644
--- a/model-index.yml
+++ b/model-index.yml
@@ -14,3 +14,4 @@ Import:
- configs/selfsup/swav/metafile.yml
- configs/selfsup/mae/metafile.yaml
- configs/selfsup/simmim/metafile.yml
+ - configs/selfsup/barlowtwins/metafile.yml
diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt
index a146d04d5..6bdb5cd04 100644
--- a/requirements/mminstall.txt
+++ b/requirements/mminstall.txt
@@ -1,4 +1,4 @@
mmcls >= 0.21.0
-mmcv-full>=1.3.16
+mmcv-full>=1.4.2
mmdet >= 2.16.0
mmsegmentation >= 0.20.2
diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt
index d08e48f91..ce9846ca2 100644
--- a/requirements/readthedocs.txt
+++ b/requirements/readthedocs.txt
@@ -1,5 +1,5 @@
faiss-cpu
-mmcv>=1.3.16
+mmcv>=1.4.2
mmselfsup
sklearn
torch
diff --git a/tests/test_apis/test_train.py b/tests/test_apis/test_train.py
new file mode 100644
index 000000000..7a1820068
--- /dev/null
+++ b/tests/test_apis/test_train.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import platform
+import tempfile
+import time
+
+import mmcv
+import pytest
+import torch
+import torch.nn as nn
+from mmcv import Config
+from torch.utils.data import Dataset
+
+from mmselfsup.apis import init_random_seed, set_random_seed, train_model
+
+
+class ExampleDataset(Dataset):
+
+ def __getitem__(self, idx):
+ results = dict(
+ img=torch.tensor([1], dtype=torch.float32), img_metas=dict())
+ return results
+
+ def __len__(self):
+ return 2
+
+
+class ExampleModel(nn.Module):
+
+ def __init__(self):
+ super(ExampleModel, self).__init__()
+ self.test_cfg = None
+ self.layer = nn.Linear(1, 1)
+ self.neck = nn.Identity()
+
+ def forward(self, img, test_mode=False, **kwargs):
+ out = self.layer(img)
+ return out
+
+ def train_step(self, data_batch, optimizer):
+ loss = self.forward(**data_batch)
+ return dict(loss=loss)
+
+
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
+def test_train_model():
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # Specify the data settings
+ cfg = Config.fromfile(
+ 'configs/selfsup/relative_loc/relative-loc_resnet50_8xb64-steplr-70e_in1k.py' # noqa: E501
+ )
+
+ cfg.data.samples_per_gpu = 1
+ cfg.data.workers_per_gpu = 2
+
+ cfg.data.val.data_source.data_prefix = 'tests/data/'
+ cfg.data.val.data_source.ann_file = 'tests/data/data_list.txt'
+
+ # Specify the optimizer
+ cfg.optimizer = dict(
+ type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)
+ cfg.optimizer_config = dict(grad_clip=None)
+
+ # Specify the learning rate scheduler
+ cfg.lr_config = dict(policy='step', step=[1])
+
+ # Modify runtime setting
+ cfg.runner = dict(type='EpochBasedRunner', max_epochs=1)
+
+ # Specify the work directory
+ cfg.work_dir = tmpdir
+
+ # Set the random seed and enable the deterministic option of cuDNN
+ # to keep the results reproducible
+ cfg.seed = 0
+ set_random_seed(0, deterministic=True)
+
+ cfg.gpu_ids = range(1)
+
+ # Create the work directory
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+
+ # Build the algorithm
+ model = ExampleModel()
+
+ # Build the dataset
+ datasets = [ExampleDataset()]
+
+ # evaluation
+ cfg.evaluation = dict(interval=10, topk=(1, 5))
+
+ # Start pre-train
+ train_model(
+ model,
+ datasets,
+ cfg,
+ distributed=False,
+ timestamp=time.strftime('%Y%m%d_%H%M%S', time.localtime()),
+ meta=dict())
+
+
+@pytest.mark.skipif(
+ not torch.cuda.is_available(), reason='CUDA is not available')
+def test_init_random_seed():
+ seed = init_random_seed(0)
+ assert seed == 0
+
+
+def test_set_random_seed():
+ set_random_seed(0)
diff --git a/tests/test_data/test_pipeline.py b/tests/test_data/test_pipeline.py
index a2928cd8e..b62dabb25 100644
--- a/tests/test_data/test_pipeline.py
+++ b/tests/test_data/test_pipeline.py
@@ -128,9 +128,9 @@ def test_randomaug():
assert isinstance(str(module), str)
-def test_mask_gen():
+def test_simmim_mask_gen():
transform = dict(
- type='BlockwiseMaskGenerator',
+ type='SimMIMMaskGenerator',
input_size=192,
mask_patch_size=32,
model_patch_size=4,
@@ -143,3 +143,47 @@ def test_mask_gen():
assert list(res[0].shape) == [3, 192, 192]
assert list(res[1].shape) == [48, 48]
+
+
+def test_beit_mask_gen():
+ transform = dict(
+ type='BEiTMaskGenerator',
+ input_size=(14, 14),
+ num_masking_patches=75,
+ max_num_patches=None,
+ min_num_patches=16)
+ fake_image_1 = torch.rand((3, 224, 224))
+ fake_image_2 = torch.rand((3, 112, 112))
+ module = build_from_cfg(transform, PIPELINES)
+
+ res = module([fake_image_1, fake_image_2])
+
+ assert list(res[0].shape) == [3, 224, 224]
+ assert list(res[1].shape) == [3, 112, 112]
+ assert list(res[2].shape) == [14, 14]
+
+
+def test_to_tensor():
+ transform = dict(type='ToTensor')
+ module = build_from_cfg(transform, PIPELINES)
+ fake_img = torch.rand((112, 112, 3)).numpy()
+ fake_output_1 = module(fake_img)
+ fake_output_2 = module([fake_img, fake_img])
+ assert list(fake_output_1.shape) == [3, 112, 112]
+ assert len(fake_output_2) == 2
+
+
+def test_random_resize_crop_with_two_pic():
+ transform = dict(
+ type='RandomResizedCropAndInterpolationWithTwoPic',
+ size=224,
+ second_size=112,
+ interpolation='bicubic',
+ second_interpolation='lanczos',
+ scale=(0.08, 1.0))
+ module = build_from_cfg(transform, PIPELINES)
+ fake_input = torch.rand((224, 224, 3)).numpy().astype(np.uint8)
+ fake_input = Image.fromarray(fake_input)
+ fake_output = module(fake_input)
+ assert list(fake_output[0].size) == [224, 224]
+ assert list(fake_output[1].size) == [112, 112]
diff --git a/tests/test_models/test_algorithms/test_barlowtwins.py b/tests/test_models/test_algorithms/test_barlowtwins.py
new file mode 100644
index 000000000..a0ba8cca9
--- /dev/null
+++ b/tests/test_models/test_algorithms/test_barlowtwins.py
@@ -0,0 +1,44 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import platform
+
+import pytest
+import torch
+
+from mmselfsup.models.algorithms import BarlowTwins
+
+backbone = dict(
+ type='ResNet',
+ depth=50,
+ in_channels=3,
+ out_indices=[4], # 0: conv-1, x: stage-x
+ norm_cfg=dict(type='BN'))
+neck = dict(
+ type='NonLinearNeck',
+ in_channels=2048,
+ hid_channels=2,
+ out_channels=2,
+ num_layers=3,
+ with_last_bn=False,
+ with_last_bn_affine=False,
+ with_avg_pool=True,
+ norm_cfg=dict(type='BN1d'))
+head = dict(type='LatentCrossCorrelationHead', in_channels=2)
+
+
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
+def test_barlowtwins():
+ with pytest.raises(AssertionError):
+ alg = BarlowTwins(backbone=backbone, neck=None, head=head)
+ with pytest.raises(AssertionError):
+ alg = BarlowTwins(backbone=backbone, neck=neck, head=None)
+
+ alg = BarlowTwins(backbone=backbone, neck=neck, head=head)
+ fake_input = torch.randn((2, 3, 224, 224))
+ fake_backbone_out = alg.extract_feat(fake_input)
+ assert fake_backbone_out[0].size() == torch.Size([2, 2048, 7, 7])
+ with pytest.raises(AssertionError):
+ fake_out = alg.forward_train(fake_input)
+
+ fake_input = [torch.randn((2, 3, 224, 224)), torch.randn((2, 3, 224, 224))]
+ fake_out = alg.forward_train(fake_input)
+ assert fake_out['loss'].item() > 0.0
diff --git a/tests/test_models/test_algorithms/test_byol.py b/tests/test_models/test_algorithms/test_byol.py
index 7a34a3e98..c0d74d5bb 100644
--- a/tests/test_models/test_algorithms/test_byol.py
+++ b/tests/test_models/test_algorithms/test_byol.py
@@ -8,15 +8,15 @@
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
type='NonLinearNeck',
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
with_bias=True,
with_last_bn=False,
with_avg_pool=True,
@@ -25,9 +25,9 @@
type='LatentPredictHead',
predictor=dict(
type='NonLinearNeck',
- in_channels=4,
- hid_channels=4,
- out_channels=4,
+ in_channels=2,
+ hid_channels=2,
+ out_channels=2,
with_bias=True,
with_last_bn=False,
with_avg_pool=False,
@@ -42,15 +42,12 @@ def test_byol():
alg = BYOL(backbone=backbone, neck=neck, head=None)
alg = BYOL(backbone=backbone, neck=neck, head=head)
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.extract_feat(fake_input)
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
with pytest.raises(AssertionError):
fake_out = alg.forward_train(fake_input)
- fake_input = [
- torch.randn((16, 3, 224, 224)),
- torch.randn((16, 3, 224, 224))
- ]
+ fake_input = [torch.randn((2, 3, 224, 224)), torch.randn((2, 3, 224, 224))]
fake_out = alg.forward_train(fake_input)
assert fake_out['loss'].item() > -4
diff --git a/tests/test_models/test_algorithms/test_cae.py b/tests/test_models/test_algorithms/test_cae.py
new file mode 100644
index 000000000..037cd9f99
--- /dev/null
+++ b/tests/test_models/test_algorithms/test_cae.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import platform
+
+import pytest
+import torch
+
+from mmselfsup.models.algorithms import CAE
+
+# model settings
+backbone = dict(type='CAEViT', arch='b', patch_size=16, init_values=0.1)
+neck = dict(
+ type='CAENeck',
+ patch_size=16,
+ embed_dims=768,
+ num_heads=12,
+ regressor_depth=4,
+ decoder_depth=4,
+ mlp_ratio=4,
+ init_values=0.1,
+)
+head = dict(
+ type='CAEHead', tokenizer_path='cae_ckpt/encoder_stat_dict.pth', lambd=2)
+
+
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
+def test_cae():
+ with pytest.raises(AssertionError):
+ model = CAE(backbone=None, neck=neck, head=head)
+ with pytest.raises(AssertionError):
+ model = CAE(backbone=backbone, neck=None, head=head)
+ with pytest.raises(AssertionError):
+ model = CAE(backbone=backbone, neck=neck, head=None)
+
+ model = CAE(backbone=backbone, neck=neck, head=head)
+ model.init_weights()
+
+ fake_input = torch.rand((1, 3, 224, 224))
+ fake_target = torch.rand((1, 3, 112, 112))
+ fake_mask = torch.zeros((1, 196)).bool()
+ fake_mask[:, 75:150] = 1
+
+ inputs = (fake_input, fake_target, fake_mask)
+
+ fake_loss = model.forward_train(inputs)
+ fake_feat = model.extract_feat(fake_input, fake_mask)
+ assert isinstance(fake_loss['loss'].item(), float)
+ assert list(fake_feat.shape) == [1, 122, 768]
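The `[1, 122, 768]` assertion follows directly from the patch arithmetic in the config above; a self-contained sanity check (plain arithmetic, no mmselfsup imports):

```python
# A 224x224 input with patch_size=16 gives a 14x14 grid of patches;
# fake_mask hides patches 75..149, and extract_feat keeps the visible
# patches plus one cls token of embed_dims=768.
num_patches = (224 // 16) ** 2          # 196
num_masked = 150 - 75                   # 75 masked patches
num_visible = num_patches - num_masked  # 121 visible patches
assert num_visible + 1 == 122           # +1 for the cls token
```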
diff --git a/tests/test_models/test_algorithms/test_classification.py b/tests/test_models/test_algorithms/test_classification.py
index 132e1a79a..48a5f5d11 100644
--- a/tests/test_models/test_algorithms/test_classification.py
+++ b/tests/test_models/test_algorithms/test_classification.py
@@ -13,23 +13,23 @@ def test_classification():
with_sobel = True,
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=2,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'),
frozen_stages=4)
head = dict(
- type='ClsHead', with_avg_pool=True, in_channels=2048, num_classes=4)
+ type='ClsHead', with_avg_pool=True, in_channels=512, num_classes=4)
alg = Classification(backbone=backbone, with_sobel=with_sobel, head=head)
assert hasattr(alg, 'sobel_layer')
assert hasattr(alg, 'head')
- fake_input = torch.randn((16, 3, 224, 224))
- fake_labels = torch.ones(16, dtype=torch.long)
+ fake_input = torch.randn((2, 3, 224, 224))
+ fake_labels = torch.ones(2, dtype=torch.long)
fake_out = alg.forward_test(fake_input)
assert 'head4' in fake_out
- assert fake_out['head4'].size() == torch.Size([16, 4])
+ assert fake_out['head4'].size() == torch.Size([2, 4])
fake_out = alg.forward_train(fake_input, fake_labels)
assert fake_out['loss'].item() > 0
@@ -51,7 +51,7 @@ def test_classification():
alg = Classification(backbone=backbone, head=head)
assert alg.with_head
- fake_input = torch.randn((16, 3, 224, 224))
- fake_labels = torch.ones(16, dtype=torch.long)
+ fake_input = torch.randn((2, 3, 224, 224))
+ fake_labels = torch.ones(2, dtype=torch.long)
fake_out = alg.forward_train(fake_input, fake_labels)
assert fake_out['loss'].item() > 0
diff --git a/tests/test_models/test_algorithms/test_deepcluster.py b/tests/test_models/test_algorithms/test_deepcluster.py
index 286fb4c83..f92ee68fc 100644
--- a/tests/test_models/test_algorithms/test_deepcluster.py
+++ b/tests/test_models/test_algorithms/test_deepcluster.py
@@ -10,7 +10,7 @@
with_sobel = True,
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=2,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
@@ -18,7 +18,7 @@
head = dict(
type='ClsHead',
with_avg_pool=False, # already has avgpool in the neck
- in_channels=2048,
+ in_channels=512,
num_classes=num_classes)
@@ -34,11 +34,11 @@ def test_deepcluster():
assert hasattr(alg, 'neck')
assert hasattr(alg, 'head')
- fake_input = torch.randn((16, 3, 224, 224))
- fake_labels = torch.ones(16, dtype=torch.long)
+ fake_input = torch.randn((2, 3, 224, 224))
+ fake_labels = torch.ones(2, dtype=torch.long)
fake_out = alg.forward(fake_input, mode='test')
assert 'head0' in fake_out
- assert fake_out['head0'].size() == torch.Size([16, num_classes])
+ assert fake_out['head0'].size() == torch.Size([2, num_classes])
fake_out = alg.forward_train(fake_input, fake_labels)
alg.set_reweight(fake_labels)
diff --git a/tests/test_models/test_algorithms/test_densecl.py b/tests/test_models/test_algorithms/test_densecl.py
index 67820a522..0104a7560 100644
--- a/tests/test_models/test_algorithms/test_densecl.py
+++ b/tests/test_models/test_algorithms/test_densecl.py
@@ -9,20 +9,20 @@
from mmselfsup.models.algorithms import DenseCL
queue_len = 32
-feat_dim = 4
+feat_dim = 2
momentum = 0.999
loss_lambda = 0.5
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
type='DenseCLNeck',
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
num_grid=None)
head = dict(type='ContrastiveHead', temperature=0.2)
@@ -57,14 +57,14 @@ def test_densecl():
assert alg.queue.size() == torch.Size([feat_dim, queue_len])
assert alg.queue2.size() == torch.Size([feat_dim, queue_len])
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
with pytest.raises(AssertionError):
fake_out = alg.forward_train(fake_input)
fake_out = alg.forward_test(fake_input)
assert fake_out[0] is None
assert fake_out[2] is None
- assert fake_out[1].size() == torch.Size([16, 2048, 49])
+ assert fake_out[1].size() == torch.Size([2, 512, 49])
mmselfsup.models.algorithms.densecl.batch_shuffle_ddp = MagicMock(
side_effect=mock_batch_shuffle_ddp)
@@ -75,10 +75,10 @@ def test_densecl():
fake_loss = alg.forward_train([fake_input, fake_input])
assert fake_loss['loss_single'] > 0
assert fake_loss['loss_dense'] > 0
- assert alg.queue_ptr.item() == 16
- assert alg.queue2_ptr.item() == 16
+ assert alg.queue_ptr.item() == 2
+ assert alg.queue2_ptr.item() == 2
# test train step with 2 keys in loss
fake_outputs = alg.train_step(dict(img=[fake_input, fake_input]), None)
assert fake_outputs['loss'].item() > -1
- assert fake_outputs['num_samples'] == 16
+ assert fake_outputs['num_samples'] == 2
diff --git a/tests/test_models/test_algorithms/test_mae.py b/tests/test_models/test_algorithms/test_mae.py
index d985f44f9..87ad10348 100644
--- a/tests/test_models/test_algorithms/test_mae.py
+++ b/tests/test_models/test_algorithms/test_mae.py
@@ -30,8 +30,8 @@ def test_mae():
alg = MAE(backbone=None, neck=neck, head=head)
alg = MAE(backbone=backbone, neck=neck, head=head)
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_loss = alg.forward_train(fake_input)
fake_feature = alg.extract_feat(fake_input)
assert isinstance(fake_loss['loss'].item(), float)
- assert list(fake_feature[0].shape) == [16, 50, 768]
+ assert list(fake_feature[0].shape) == [2, 50, 768]
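The `[2, 50, 768]` shape is the same kind of patch arithmetic; a sketch assuming MAE's default `mask_ratio=0.75` (that value is taken from the MAE paper's default, not from this diff):

```python
# ViT-B/16 on 224x224 inputs: 14x14 = 196 patches; with mask_ratio=0.75
# only a quarter are kept, and the cls token is prepended.
num_patches = (224 // 16) ** 2               # 196
num_visible = int(num_patches * (1 - 0.75))  # 49 visible patches
assert num_visible + 1 == 50                 # matches [2, 50, 768]
```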
diff --git a/tests/test_models/test_algorithms/test_mmcls_classifier_wrapper.py b/tests/test_models/test_algorithms/test_mmcls_classifier_wrapper.py
index 114dc3b60..4cd570db9 100644
--- a/tests/test_models/test_algorithms/test_mmcls_classifier_wrapper.py
+++ b/tests/test_models/test_algorithms/test_mmcls_classifier_wrapper.py
@@ -21,7 +21,7 @@ def test_mmcls_classifier_wrapper():
neck=dict(type='mmcls.GlobalAveragePooling'),
head=dict(
type='mmcls.LinearClsHead',
- num_classes=1000,
+ num_classes=2,
in_channels=1024,
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
loss=dict(
@@ -34,8 +34,8 @@ def test_mmcls_classifier_wrapper():
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
],
train_cfg=dict(augments=[
- dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
- dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
+ dict(type='BatchMixup', alpha=0.8, num_classes=2, prob=0.5),
+ dict(type='BatchCutMix', alpha=1.0, num_classes=2, prob=0.5)
]))
model = ALGORITHMS.build(model_config)
fake_inputs = torch.rand((2, 3, 192, 192))
@@ -47,7 +47,7 @@ def test_mmcls_classifier_wrapper():
# test mode
outputs = model(fake_inputs, mode='test')
- assert list(outputs['head3'].shape) == [2, 1000]
+ assert list(outputs['head3'].shape) == [2, 2]
# extract mode
outputs = model(fake_inputs, mode='extract')
diff --git a/tests/test_models/test_algorithms/test_moco.py b/tests/test_models/test_algorithms/test_moco.py
index 7735fd3cf..ed29beecf 100644
--- a/tests/test_models/test_algorithms/test_moco.py
+++ b/tests/test_models/test_algorithms/test_moco.py
@@ -9,19 +9,19 @@
from mmselfsup.models.algorithms import MoCo
queue_len = 32
-feat_dim = 4
+feat_dim = 2
momentum = 0.999
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
type='MoCoV2Neck',
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
with_avg_pool=True)
head = dict(type='ContrastiveHead', temperature=0.2)
@@ -54,9 +54,9 @@ def test_moco():
momentum=momentum)
assert alg.queue.size() == torch.Size([feat_dim, queue_len])
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.extract_feat(fake_input)
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
with pytest.raises(AssertionError):
fake_backbone_out = alg.forward_train(fake_input)
@@ -68,4 +68,4 @@ def test_moco():
side_effect=mock_concat_all_gather)
fake_loss = alg.forward_train([fake_input, fake_input])
assert fake_loss['loss'] > 0
- assert alg.queue_ptr.item() == 16
+ assert alg.queue_ptr.item() == 2
diff --git a/tests/test_models/test_algorithms/test_mocov3.py b/tests/test_models/test_algorithms/test_mocov3.py
index 74b1842fa..ea42c4fa0 100644
--- a/tests/test_models/test_algorithms/test_mocov3.py
+++ b/tests/test_models/test_algorithms/test_mocov3.py
@@ -15,8 +15,8 @@
neck = dict(
type='NonLinearNeck',
in_channels=384,
- hid_channels=8,
- out_channels=8,
+ hid_channels=2,
+ out_channels=2,
num_layers=2,
with_bias=False,
with_last_bn=True,
@@ -28,9 +28,9 @@
type='MoCoV3Head',
predictor=dict(
type='NonLinearNeck',
- in_channels=8,
- hid_channels=8,
- out_channels=8,
+ in_channels=2,
+ hid_channels=2,
+ out_channels=2,
num_layers=2,
with_bias=False,
with_last_bn=True,
@@ -51,7 +51,7 @@ def test_mocov3():
alg.init_weights()
alg.momentum_update()
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.forward(fake_input, mode='extract')
- assert fake_backbone_out[0][0].size() == torch.Size([16, 384, 14, 14])
- assert fake_backbone_out[0][1].size() == torch.Size([16, 384])
+ assert fake_backbone_out[0][0].size() == torch.Size([2, 384, 14, 14])
+ assert fake_backbone_out[0][1].size() == torch.Size([2, 384])
diff --git a/tests/test_models/test_algorithms/test_npid.py b/tests/test_models/test_algorithms/test_npid.py
index 5779903f8..734f58143 100644
--- a/tests/test_models/test_algorithms/test_npid.py
+++ b/tests/test_models/test_algorithms/test_npid.py
@@ -8,14 +8,14 @@
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
- type='LinearNeck', in_channels=2048, out_channels=4, with_avg_pool=True)
+ type='LinearNeck', in_channels=512, out_channels=2, with_avg_pool=True)
head = dict(type='ContrastiveHead', temperature=0.07)
-memory_bank = dict(type='SimpleMemory', length=8, feat_dim=4, momentum=0.5)
+memory_bank = dict(type='SimpleMemory', length=8, feat_dim=2, momentum=0.5)
@pytest.mark.skipif(
@@ -30,6 +30,6 @@ def test_npid():
alg = NPID(
backbone=backbone, neck=neck, head=head, memory_bank=memory_bank)
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.extract_feat(fake_input)
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
diff --git a/tests/test_models/test_algorithms/test_odc.py b/tests/test_models/test_algorithms/test_odc.py
index b397be857..74d1b7ed8 100644
--- a/tests/test_models/test_algorithms/test_odc.py
+++ b/tests/test_models/test_algorithms/test_odc.py
@@ -9,26 +9,26 @@
num_classes = 5
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
type='ODCNeck',
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
norm_cfg=dict(type='BN1d'),
with_avg_pool=True)
head = dict(
type='ClsHead',
with_avg_pool=False,
- in_channels=4,
+ in_channels=2,
num_classes=num_classes)
memory_bank = dict(
type='ODCMemory',
length=8,
- feat_dim=4,
+ feat_dim=2,
momentum=0.5,
num_classes=num_classes,
min_cluster=2,
@@ -48,7 +48,7 @@ def test_odc():
alg = ODC(backbone=backbone, neck=neck, head=head, memory_bank=memory_bank)
alg.set_reweight()
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_out = alg.forward_test(fake_input)
assert 'head0' in fake_out
- assert fake_out['head0'].size() == torch.Size([16, num_classes])
+ assert fake_out['head0'].size() == torch.Size([2, num_classes])
diff --git a/tests/test_models/test_algorithms/test_relative_loc.py b/tests/test_models/test_algorithms/test_relative_loc.py
index b8feac8f5..1a5f91bce 100644
--- a/tests/test_models/test_algorithms/test_relative_loc.py
+++ b/tests/test_models/test_algorithms/test_relative_loc.py
@@ -8,16 +8,16 @@
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
type='RelativeLocNeck',
- in_channels=2048,
- out_channels=4,
+ in_channels=512,
+ out_channels=2,
with_avg_pool=True)
-head = dict(type='ClsHead', with_avg_pool=False, in_channels=4, num_classes=8)
+head = dict(type='ClsHead', with_avg_pool=False, in_channels=2, num_classes=8)
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
@@ -49,6 +49,6 @@ def test_relative_loc():
assert 'head4' in fake_out
# extract
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.forward(fake_input, mode='extract')
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
diff --git a/tests/test_models/test_algorithms/test_rotation_pred.py b/tests/test_models/test_algorithms/test_rotation_pred.py
index 69f4b9d3e..46d93a119 100644
--- a/tests/test_models/test_algorithms/test_rotation_pred.py
+++ b/tests/test_models/test_algorithms/test_rotation_pred.py
@@ -8,12 +8,11 @@
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
-head = dict(
- type='ClsHead', with_avg_pool=True, in_channels=2048, num_classes=4)
+head = dict(type='ClsHead', with_avg_pool=True, in_channels=512, num_classes=4)
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
@@ -41,6 +40,6 @@ def test_rotation_pred():
assert 'head4' in fake_out
# extract
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.forward(fake_input, mode='extract')
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
diff --git a/tests/test_models/test_algorithms/test_simclr.py b/tests/test_models/test_algorithms/test_simclr.py
index 44744aa26..3ccc27d81 100644
--- a/tests/test_models/test_algorithms/test_simclr.py
+++ b/tests/test_models/test_algorithms/test_simclr.py
@@ -8,15 +8,15 @@
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'))
neck = dict(
type='NonLinearNeck', # SimCLR non-linear neck
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
num_layers=2,
with_avg_pool=True)
head = dict(type='ContrastiveHead', temperature=0.1)
@@ -34,6 +34,6 @@ def test_simclr():
fake_input = torch.randn((16, 3, 224, 224))
-    fake_input = torch.randn((16, 3, 224, 224))
+    fake_input = torch.randn((2, 3, 224, 224))
alg.forward_train(fake_input)
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.extract_feat(fake_input)
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
diff --git a/tests/test_models/test_algorithms/test_simsiam.py b/tests/test_models/test_algorithms/test_simsiam.py
index fe2011dfd..93469fbc6 100644
--- a/tests/test_models/test_algorithms/test_simsiam.py
+++ b/tests/test_models/test_algorithms/test_simsiam.py
@@ -8,16 +8,16 @@
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'),
zero_init_residual=True)
neck = dict(
type='NonLinearNeck',
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
num_layers=3,
with_last_bn_affine=False,
with_avg_pool=True,
@@ -26,9 +26,9 @@
type='LatentPredictHead',
predictor=dict(
type='NonLinearNeck',
- in_channels=4,
- hid_channels=4,
- out_channels=4,
+ in_channels=2,
+ hid_channels=2,
+ out_channels=2,
with_avg_pool=False,
with_last_bn=False,
with_last_bias=True,
@@ -42,22 +42,19 @@ def test_simsiam():
alg = SimSiam(backbone=backbone, neck=neck, head=head)
with pytest.raises(AssertionError):
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
alg.forward_train(fake_input)
- fake_input = [
- torch.randn((16, 3, 224, 224)),
- torch.randn((16, 3, 224, 224))
- ]
+ fake_input = [torch.randn((2, 3, 224, 224)), torch.randn((2, 3, 224, 224))]
fake_out = alg.forward(fake_input)
assert fake_out['loss'].item() > -1
# test train step
fake_outputs = alg.train_step(dict(img=fake_input), None)
assert fake_outputs['loss'].item() > -1
- assert fake_outputs['num_samples'] == 16
+ assert fake_outputs['num_samples'] == 2
# test val step
fake_outputs = alg.val_step(dict(img=fake_input), None)
assert fake_outputs['loss'].item() > -1
- assert fake_outputs['num_samples'] == 16
+ assert fake_outputs['num_samples'] == 2
diff --git a/tests/test_models/test_algorithms/test_swav.py b/tests/test_models/test_algorithms/test_swav.py
index 0d09db5d8..09a80809f 100644
--- a/tests/test_models/test_algorithms/test_swav.py
+++ b/tests/test_models/test_algorithms/test_swav.py
@@ -9,21 +9,21 @@
nmb_crops = [2, 6]
backbone = dict(
type='ResNet',
- depth=50,
+ depth=18,
in_channels=3,
out_indices=[4], # 0: conv-1, x: stage-x
norm_cfg=dict(type='BN'),
zero_init_residual=True)
neck = dict(
type='SwAVNeck',
- in_channels=2048,
- hid_channels=4,
- out_channels=4,
+ in_channels=512,
+ hid_channels=2,
+ out_channels=2,
norm_cfg=dict(type='BN1d'),
with_avg_pool=True)
head = dict(
type='SwAVHead',
- feat_dim=4, # equal to neck['out_channels']
+ feat_dim=2, # equal to neck['out_channels']
epsilon=0.05,
temperature=0.1,
num_crops=nmb_crops)
@@ -37,19 +37,19 @@ def test_swav():
alg = SwAV(backbone=backbone, neck=None, head=head)
alg = SwAV(backbone=backbone, neck=neck, head=head)
- fake_input = torch.randn((16, 3, 224, 224))
+ fake_input = torch.randn((2, 3, 224, 224))
fake_backbone_out = alg.extract_feat(fake_input)
- assert fake_backbone_out[0].size() == torch.Size([16, 2048, 7, 7])
+ assert fake_backbone_out[0].size() == torch.Size([2, 512, 7, 7])
fake_input = [
- torch.randn((16, 3, 224, 224)),
- torch.randn((16, 3, 224, 224)),
- torch.randn((16, 3, 96, 96)),
- torch.randn((16, 3, 96, 96)),
- torch.randn((16, 3, 96, 96)),
- torch.randn((16, 3, 96, 96)),
- torch.randn((16, 3, 96, 96)),
- torch.randn((16, 3, 96, 96)),
+ torch.randn((2, 3, 224, 224)),
+ torch.randn((2, 3, 224, 224)),
+ torch.randn((2, 3, 96, 96)),
+ torch.randn((2, 3, 96, 96)),
+ torch.randn((2, 3, 96, 96)),
+ torch.randn((2, 3, 96, 96)),
+ torch.randn((2, 3, 96, 96)),
+ torch.randn((2, 3, 96, 96)),
]
fake_out = alg.forward_train(fake_input)
assert fake_out['loss'].item() > 0
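The eight tensors above encode SwAV multi-crop with `nmb_crops = [2, 6]`; a small check of that layout, derived from the test itself:

```python
# Two global 224x224 crops plus six local 96x96 crops.
nmb_crops, crop_sizes = [2, 6], [224, 96]
views = [s for n, s in zip(nmb_crops, crop_sizes) for _ in range(n)]
assert views == [224, 224, 96, 96, 96, 96, 96, 96]
```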
diff --git a/tests/test_models/test_backbones/test_mim_cls_vit.py b/tests/test_models/test_backbones/test_mim_cls_vit.py
index 005ea1cfe..5988b521a 100644
--- a/tests/test_models/test_backbones/test_mim_cls_vit.py
+++ b/tests/test_models/test_backbones/test_mim_cls_vit.py
@@ -15,18 +15,26 @@
linprobe_backbone = dict(
arch='b', patch_size=16, finetune=False, final_norm=False)
+linprobe_backbone_use_window = dict(
+ arch='b', patch_size=16, finetune=False, final_norm=False, use_window=True)
+
@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_mae_cls_vit():
mae_finetune_backbone = MIMVisionTransformer(**finetune_backbone)
mae_finetune_backbone_norm = MIMVisionTransformer(**finetune_backbone_norm)
mae_linprobe_backbone = MIMVisionTransformer(**linprobe_backbone)
+ mae_linprobe_backbone_use_window = MIMVisionTransformer(
+ **linprobe_backbone_use_window)
mae_linprobe_backbone.train()
fake_inputs = torch.randn((2, 3, 224, 224))
fake_finetune_outputs = mae_finetune_backbone(fake_inputs)
fake_finetune_outputs_norm = mae_finetune_backbone_norm(fake_inputs)
fake_linprobe_outputs = mae_linprobe_backbone(fake_inputs)
+ fake_linprobe_outputs_use_window = mae_linprobe_backbone_use_window(
+ fake_inputs)
assert list(fake_finetune_outputs.shape) == [2, 768]
assert list(fake_linprobe_outputs.shape) == [2, 768]
assert list(fake_finetune_outputs_norm.shape) == [2, 768]
+ assert list(fake_linprobe_outputs_use_window.shape) == [2, 768]
diff --git a/tests/test_models/test_heads.py b/tests/test_models/test_heads.py
index 69b3bf8b8..269557c44 100644
--- a/tests/test_models/test_heads.py
+++ b/tests/test_models/test_heads.py
@@ -3,6 +3,7 @@
import torch.nn.functional as F
from mmselfsup.models.heads import (ClsHead, ContrastiveHead, LatentClsHead,
+ LatentCrossCorrelationHead,
LatentPredictHead, MAEFinetuneHead,
MAEPretrainHead, MultiClsHead, SwAVHead)
@@ -53,6 +54,15 @@ def test_latent_cls_head():
assert loss['loss'].item() > 0
+def test_latent_cross_correlation_head():
+ head = LatentCrossCorrelationHead(2, 0.0051)
+ fake_input = torch.rand(32, 2) # N, C
+    fake_target = torch.rand(32, 2)  # N, C
+
+    loss = head.forward(fake_input, fake_target)
+ assert loss['loss'].item() > 0
+
+
def test_multi_cls_head():
head = MultiClsHead(in_indices=(0, 1))
fake_input = [torch.rand(8, 64, 5, 5), torch.rand(8, 256, 14, 14)]
diff --git a/tests/test_models/test_utils/test_dalle.py b/tests/test_models/test_utils/test_dalle.py
new file mode 100644
index 000000000..04aa9b783
--- /dev/null
+++ b/tests/test_models/test_utils/test_dalle.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import platform
+
+import pytest
+import torch
+
+from mmselfsup.models.utils import Encoder
+
+
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
+def test_dalle():
+ model = Encoder()
+ fake_inputs = torch.rand((2, 3, 112, 112))
+ fake_outputs = model(fake_inputs)
+
+ assert list(fake_outputs.shape) == [2, 8192, 14, 14]
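The `[2, 8192, 14, 14]` shape is the DALL-E dVAE layout: one 8192-way codebook logit vector per position of a grid downsampled 8x (112 -> 14). A hedged usage sketch for turning the encoder output into discrete token ids; the argmax step is standard dVAE usage, not an mmselfsup API:

```python
import torch

from mmselfsup.models.utils import Encoder

encoder = Encoder()
logits = encoder(torch.rand((2, 3, 112, 112)))  # (2, 8192, 14, 14)
token_ids = logits.argmax(dim=1)                # (2, 14, 14) visual tokens
assert token_ids.shape == (2, 14, 14)
assert int(token_ids.max()) < 8192              # ids index the dVAE codebook
```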
diff --git a/tools/benchmarks/classification/knn_imagenet/test_knn.py b/tools/benchmarks/classification/knn_imagenet/test_knn.py
index 51c6f352e..68547c0ba 100644
--- a/tools/benchmarks/classification/knn_imagenet/test_knn.py
+++ b/tools/benchmarks/classification/knn_imagenet/test_knn.py
@@ -31,7 +31,6 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
- parser.add_argument('--local-rank', type=int, default=0)
parser.add_argument(
'--cfg-options',
nargs='+',
@@ -59,6 +58,7 @@ def parse_args():
default=True,
type=bool,
help='Store the features on GPU. Set to False if you encounter OOM')
+ parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
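This hunk, like the matching ones in tools/test.py and tools/train.py below, keeps the underscore spelling because, on the PyTorch versions in this repo's CI matrix, `torch.distributed.launch` injects the flag as `--local_rank=<n>` into each worker. A minimal sketch of that handshake; the simulated argv is illustrative:

```python
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)
# Simulate what the launcher passes to worker process 3.
args = parser.parse_args(['--local_rank=3'])
os.environ.pop('LOCAL_RANK', None)  # clean slate for the sketch
if 'LOCAL_RANK' not in os.environ:
    os.environ['LOCAL_RANK'] = str(args.local_rank)
assert os.environ['LOCAL_RANK'] == '3'
```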
diff --git a/tools/benchmarks/classification/svm_voc07/extract.py b/tools/benchmarks/classification/svm_voc07/extract.py
index 0fc5d31aa..f69b177fe 100644
--- a/tools/benchmarks/classification/svm_voc07/extract.py
+++ b/tools/benchmarks/classification/svm_voc07/extract.py
@@ -122,10 +122,12 @@ def main():
model.init_weights()
# model is determined in this priority: init_cfg > checkpoint > random
- if getattr(cfg.model.backbone.init_cfg, 'type', None) == 'Pretrained':
- logger.info(
- f'Use pretrained model: '
- f'{cfg.model.backbone.init_cfg.checkpoint} to extract features')
+ if hasattr(cfg.model.backbone, 'init_cfg'):
+ if getattr(cfg.model.backbone.init_cfg, 'type', None) == 'Pretrained':
+ logger.info(
+ f'Use pretrained model: '
+ f'{cfg.model.backbone.init_cfg.checkpoint} to extract features'
+ )
elif args.checkpoint is not None:
logger.info(f'Use checkpoint: {args.checkpoint} to extract features')
load_checkpoint(model, args.checkpoint, map_location='cpu')
diff --git a/tools/test.py b/tools/test.py
index 19c364a40..226842ea1 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -37,12 +37,6 @@ def parse_args():
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
- parser.add_argument(
- '--local_rank',
- type=int,
- default=0,
- help='(Deprecated, please use --local-rank)')
- parser.add_argument('--local-rank', type=int, default=0)
parser.add_argument(
'--cfg-options',
nargs='+',
@@ -53,6 +47,7 @@ def parse_args():
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
+ parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
@@ -114,12 +109,30 @@ def main():
'Automatically set "samples_per_gpu"="imgs_per_gpu"='
f'{cfg.data.imgs_per_gpu} in this experiments')
cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
- data_loader = build_dataloader(
- dataset,
- samples_per_gpu=cfg.data.samples_per_gpu,
- workers_per_gpu=cfg.data.workers_per_gpu,
+
+ # The default loader config
+ loader_cfg = dict(
+ # cfg.gpus will be ignored if distributed
+ num_gpus=len(cfg.gpu_ids),
dist=distributed,
- shuffle=False)
+ prefetch=getattr(cfg, 'prefetch', False),
+ img_norm_cfg=cfg.img_norm_cfg)
+
+ # The overall dataloader settings
+ loader_cfg.update({
+ k: v
+ for k, v in cfg.data.items() if k not in [
+ 'train', 'val', 'test', 'train_dataloader', 'val_dataloader',
+ 'test_dataloader'
+ ]
+ })
+    # The specific test dataloader settings
+    test_loader_cfg = {
+        **loader_cfg,
+        'shuffle': False,  # No shuffling by default
+ **cfg.data.get('test_dataloader', {}),
+ }
+ data_loader = build_dataloader(dataset, **test_loader_cfg)
# build the model and load checkpoint
model = build_algorithm(cfg.model)
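To make the precedence of the new dataloader config explicit, a toy dry run of the merge above (the `cfg.data` contents are illustrative): per-split sections are filtered out of the defaults, `shuffle` defaults to False, and an optional `cfg.data.test_dataloader` dict wins last.

```python
cfg_data = dict(
    samples_per_gpu=4, workers_per_gpu=2,
    train=dict(), val=dict(), test=dict(),
    test_dataloader=dict(samples_per_gpu=8))
loader_cfg = {
    k: v
    for k, v in cfg_data.items() if k not in [
        'train', 'val', 'test', 'train_dataloader', 'val_dataloader',
        'test_dataloader'
    ]
}
test_loader_cfg = {
    **loader_cfg,
    'shuffle': False,
    **cfg_data.get('test_dataloader', {}),
}
assert test_loader_cfg == dict(
    samples_per_gpu=8, workers_per_gpu=2, shuffle=False)
```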
diff --git a/tools/train.py b/tools/train.py
index a38ef20f4..7b8d43c18 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -80,12 +80,7 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
- parser.add_argument(
- '--local_rank',
- type=int,
- default=0,
- help='(Deprecated, please use --local-rank)')
- parser.add_argument('--local-rank', type=int, default=0)
+ parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)