diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
new file mode 100644
index 0000000000..f35ebc134a
--- /dev/null
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_2L.json
@@ -0,0 +1,93 @@
+{
+  "dataset_reader": {
+    "class_name": "paraphraser_reader",
+    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
+    "do_lower_case": false
+  },
+  "dataset_iterator": {
+    "class_name": "siamese_iterator",
+    "seed": 243,
+    "len_valid": 500
+  },
+  "chainer": {
+    "in": ["text_a", "text_b"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 64,
+        "in": ["text_a", "text_b"],
+        "out": ["bert_features"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": 2,
+        "return_probas": false,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.67,
+        "hidden_keep_prob": 0.0, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 9e-05
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y"
+        ],
+        "out": [
+          "predictions"
+        ]
+      }
+    ],
+    "out": ["predictions"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "f1",
+      "accuracy"
+    ],
+    "validation_patience": 7,
+    "val_every_n_batches": 50,
+    "log_every_n_batches": 50,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_2L"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_2L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
new file mode 100644
index 0000000000..02060d97ea
--- /dev/null
+++ b/deeppavlov/configs/classifiers/paraphraser_convers_distilrubert_6L.json
@@ -0,0 +1,93 @@
+{
+  "dataset_reader": {
+    "class_name": "paraphraser_reader",
+    "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
+    "do_lower_case": false
+  },
+  "dataset_iterator": {
+    "class_name": "siamese_iterator",
+    "seed": 243,
+    "len_valid": 500
+  },
+  "chainer": {
+    "in": ["text_a", "text_b"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 64,
+        "in": ["text_a", "text_b"],
+        "out": ["bert_features"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": 2,
+        "return_probas": false,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.89,
+        "hidden_keep_prob": 0.44, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 5.46e-05
+        },
+        "learning_rate_drop_patience": 3,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y"
+        ],
+        "out": [
+          "predictions"
+        ]
+      }
+    ],
+    "out": ["predictions"]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "f1",
+      "accuracy"
+    ],
+    "validation_patience": 7,
+    "val_every_n_batches": 50,
+    "log_every_n_batches": 50,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/paraphraser_convers_distilrubert_6L"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/paraphraser_convers_distilrubert_6L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
+        "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
+      }
+    ]
+  }
+} 
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
new file mode 100644
index 0000000000..42d0c72fc4
--- /dev/null
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_2L.json
@@ -0,0 +1,145 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "x": "text",
+    "y": "label",
+    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
+    "train": "rusentiment_random_posts.csv",
+    "test": "rusentiment_test.csv"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42,
+    "split_seed": 23,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": true,
+        "max_seq_length": 64,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "bert_features"
+        ]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": [
+          "y"
+        ],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": "y",
+        "out": "y_ids"
+      },
+      {
+        "in": "y_ids",
+        "out": "y_onehot",
+        "class_name": "one_hotter",
+        "depth": "#classes_vocab.len",
+        "single_vector": true
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.33,
+        "hidden_keep_prob": 0.67, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 3.67e-05
+        },
+        "learning_rate_drop_patience": 5,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y_ids"
+        ],
+        "out": [
+          "y_pred_probas"
+        ]
+      },
+      {
+        "in": "y_pred_probas",
+        "out": "y_pred_ids",
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": "y_pred_ids",
+        "out": "y_pred_labels",
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": [
+      "y_pred_labels"
+    ]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "f1_weighted",
+      "f1_macro",
+      "accuracy",
+      {
+        "name": "roc_auc",
+        "inputs": [
+          "y_onehot",
+          "y_pred_probas"
+        ]
+      }
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_2L"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_2L.tar.gz",
+        "subdir": "{MODELS_PATH}/classifiers/"
+      }
+    ]
+  }
+} 
diff --git a/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
new file mode 100644
index 0000000000..f81488dbbb
--- /dev/null
+++ b/deeppavlov/configs/classifiers/rusentiment_convers_distilrubert_6L.json
@@ -0,0 +1,145 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "x": "text",
+    "y": "label",
+    "data_path": "{DOWNLOADS_PATH}/rusentiment/",
+    "train": "rusentiment_random_posts.csv",
+    "test": "rusentiment_test.csv"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42,
+    "split_seed": 23,
+    "field_to_split": "train",
+    "split_fields": [
+      "train",
+      "valid"
+    ],
+    "split_proportions": [
+      0.9,
+      0.1
+    ]
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": true,
+        "max_seq_length": 64,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "bert_features"
+        ]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": [
+          "y"
+        ],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": "y",
+        "out": "y_ids"
+      },
+      {
+        "in": "y_ids",
+        "out": "y_onehot",
+        "class_name": "one_hotter",
+        "depth": "#classes_vocab.len",
+        "single_vector": true
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.22,
+        "hidden_keep_prob": 0.22, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 4.56e-05
+        },
+        "learning_rate_drop_patience": 5,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "y_ids"
+        ],
+        "out": [
+          "y_pred_probas"
+        ]
+      },
+      {
+        "in": "y_pred_probas",
+        "out": "y_pred_ids",
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": "y_pred_ids",
+        "out": "y_pred_labels",
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": [
+      "y_pred_labels"
+    ]
+  },
+  "train": {
+    "epochs": 100,
+    "batch_size": 64,
+    "metrics": [
+      "f1_weighted",
+      "f1_macro",
+      "accuracy",
+      {
+        "name": "roc_auc",
+        "inputs": [
+          "y_onehot",
+          "y_pred_probas"
+        ]
+      }
+    ],
+    "validation_patience": 5,
+    "val_every_n_epochs": 1,
+    "log_every_n_epochs": 1,
+    "show_examples": false,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rusentiment_convers_distilrubert_6L"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_convers_distilrubert_6L.tar.gz",
+        "subdir": "{MODELS_PATH}/classifiers/"
+      }
+    ]
+  }
+} 
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
new file mode 100644
index 0000000000..6123c18138
--- /dev/null
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_2L.json
@@ -0,0 +1,155 @@
+{
+  "dataset_reader": {
+    "class_name": "conll2003_reader",
+    "data_path": "{DOWNLOADS_PATH}/total_rus/",
+    "dataset_name": "collection_rus",
+    "provide_pos": false
+  },
+  "dataset_iterator": {
+    "class_name": "data_learning_iterator"
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_ner_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 512,
+        "max_subword_length": 15,
+        "token_masking_prob": 0.0,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "x_tokens",
+          "x_subword_tokens",
+          "x_subword_tok_ids",
+          "startofword_markers",
+          "attention_mask"
+        ]
+      },
+      {
+        "id": "tag_vocab",
+        "class_name": "simple_vocab",
+        "unk_token": [
+          "O"
+        ],
+        "pad_with_zeros": true,
+        "save_path": "{MODEL_PATH}/tag.dict",
+        "load_path": "{MODEL_PATH}/tag.dict",
+        "fit_on": [
+          "y"
+        ],
+        "in": [
+          "y"
+        ],
+        "out": [
+          "y_ind"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_sequence_tagger",
+        "n_tags": "#tag_vocab.len",
+        "pretrained_bert": "{TRANSFORMER}",
+        "attention_probs_keep_prob": 0.11,
+        "hidden_keep_prob": 0.67, 
+        "return_probas": false,
+        "encoder_layer_ids": [
+          -1
+        ],
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 8.11e-05,
+          "weight_decay": 1e-06,
+          "betas": [
+            0.9,
+            0.999
+          ],
+          "eps": 1e-06
+        },
+        "clip_norm": 1.0,
+        "min_learning_rate": 1e-07,
+        "learning_rate_drop_patience": 30,
+        "learning_rate_drop_div": 1.5,
+        "load_before_drop": true,
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "in": [
+          "x_subword_tok_ids",
+          "attention_mask",
+          "startofword_markers"
+        ],
+        "in_y": [
+          "y_ind"
+        ],
+        "out": [
+          "y_pred_ind"
+        ]
+      },
+      {
+        "ref": "tag_vocab",
+        "in": [
+          "y_pred_ind"
+        ],
+        "out": [
+          "y_pred"
+        ]
+      }
+    ],
+    "out": [
+      "x_tokens",
+      "y_pred"
+    ]
+  },
+  "train": {
+    "epochs": 30,
+    "batch_size": 10,
+    "metrics": [
+      {
+        "name": "ner_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      },
+      {
+        "name": "ner_token_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      }
+    ],
+    "validation_patience": 100,
+    "val_every_n_batches": 20,
+    "log_every_n_batches": 20,
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models", 
+      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_2L",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational"
+    }, 
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_2L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
new file mode 100644
index 0000000000..f719065d58
--- /dev/null
+++ b/deeppavlov/configs/ner/ner_rus_convers_distilrubert_6L.json
@@ -0,0 +1,155 @@
+{
+  "dataset_reader": {
+    "class_name": "conll2003_reader",
+    "data_path": "{DOWNLOADS_PATH}/total_rus/",
+    "dataset_name": "collection_rus",
+    "provide_pos": false
+  },
+  "dataset_iterator": {
+    "class_name": "data_learning_iterator"
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "in_y": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_ner_preprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": false,
+        "max_seq_length": 512,
+        "max_subword_length": 15,
+        "token_masking_prob": 0.0,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "x_tokens",
+          "x_subword_tokens",
+          "x_subword_tok_ids",
+          "startofword_markers",
+          "attention_mask"
+        ]
+      },
+      {
+        "id": "tag_vocab",
+        "class_name": "simple_vocab",
+        "unk_token": [
+          "O"
+        ],
+        "pad_with_zeros": true,
+        "save_path": "{MODEL_PATH}/tag.dict",
+        "load_path": "{MODEL_PATH}/tag.dict",
+        "fit_on": [
+          "y"
+        ],
+        "in": [
+          "y"
+        ],
+        "out": [
+          "y_ind"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_sequence_tagger",
+        "n_tags": "#tag_vocab.len",
+        "pretrained_bert": "{TRANSFORMER}",
+        "attention_probs_keep_prob": 0.56,
+        "hidden_keep_prob": 1.0, 
+        "return_probas": false,
+        "encoder_layer_ids": [
+          -1
+        ],
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 2.78e-05,
+          "weight_decay": 1e-06,
+          "betas": [
+            0.9,
+            0.999
+          ],
+          "eps": 1e-06
+        },
+        "clip_norm": 1.0,
+        "min_learning_rate": 1e-07,
+        "learning_rate_drop_patience": 30,
+        "learning_rate_drop_div": 1.5,
+        "load_before_drop": true,
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "in": [
+          "x_subword_tok_ids",
+          "attention_mask",
+          "startofword_markers"
+        ],
+        "in_y": [
+          "y_ind"
+        ],
+        "out": [
+          "y_pred_ind"
+        ]
+      },
+      {
+        "ref": "tag_vocab",
+        "in": [
+          "y_pred_ind"
+        ],
+        "out": [
+          "y_pred"
+        ]
+      }
+    ],
+    "out": [
+      "x_tokens",
+      "y_pred"
+    ]
+  },
+  "train": {
+    "epochs": 30,
+    "batch_size": 10,
+    "metrics": [
+      {
+        "name": "ner_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      },
+      {
+        "name": "ner_token_f1",
+        "inputs": [
+          "y",
+          "y_pred"
+        ]
+      }
+    ],
+    "validation_patience": 100,
+    "val_every_n_batches": 20,
+    "log_every_n_batches": 20,
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid",
+      "test"
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models", 
+      "MODEL_PATH": "{MODELS_PATH}/ner_rus_conversational_distilrubert_6L",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational"
+    }, 
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_conversational_distilrubert_6L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
new file mode 100644
index 0000000000..830ded55f6
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L.json
@@ -0,0 +1,173 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": [
+      "context_raw",
+      "question_raw"
+    ],
+    "in_y": [
+      "ans_raw",
+      "ans_raw_start"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_squad_transformers_preprocessor",
+        "add_token_type_ids": true, 
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 384,
+        "return_tokens": true,
+        "in": [
+          "question_raw",
+          "context_raw"
+        ],
+        "out": [
+          "bert_features",
+          "subtokens"
+        ]
+      },
+      {
+        "class_name": "squad_bert_mapping",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "context_raw",
+          "bert_features",
+          "subtokens"
+        ],
+        "out": [
+          "subtok2chars",
+          "char2subtoks"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_preprocessor",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "ans_raw",
+          "ans_raw_start",
+          "char2subtoks"
+        ],
+        "out": [
+          "ans",
+          "ans_start",
+          "ans_end"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_squad",
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.0,
+        "hidden_keep_prob": 0.11, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 9e-05
+        },
+        "learning_rate_drop_patience": 2,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "ans_start",
+          "ans_end"
+        ],
+        "out": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "logits"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_postprocessor",
+        "in": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "context_raw",
+          "bert_features",
+          "subtok2chars",
+          "subtokens"
+        ],
+        "out": [
+          "ans_predicted",
+          "ans_start_predicted",
+          "ans_end_predicted"
+        ]
+      }
+    ],
+    "out": [
+      "ans_predicted",
+      "ans_start_predicted",
+      "logits"
+    ]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      }
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/logs",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false, 
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+} 
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_infer.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_infer.json
new file mode 100644
index 0000000000..9202d83ba8
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_2L_infer.json
@@ -0,0 +1,76 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": ["context_raw", "question_raw"],
+    "in_y": ["ans_raw", "ans_raw_start"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_squad_infer",
+        "lang": "ru",
+        "batch_size": 128,
+        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_2L.json",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 256,
+        "in": ["context_raw", "question_raw"],
+        "out": ["ans_predicted", "ans_start_predicted", "logits"]
+      }
+    ],
+    "out": ["ans_predicted", "ans_start_predicted", "logits"]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      }
+    ]
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false, 
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-tiny-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_2L",
+      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_2L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
new file mode 100644
index 0000000000..58e815cc77
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L.json
@@ -0,0 +1,173 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": [
+      "context_raw",
+      "question_raw"
+    ],
+    "in_y": [
+      "ans_raw",
+      "ans_raw_start"
+    ],
+    "pipe": [
+      {
+        "class_name": "torch_squad_transformers_preprocessor", 
+        "add_token_type_ids": true, 
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 384,
+        "return_tokens": true,
+        "in": [
+          "question_raw",
+          "context_raw"
+        ],
+        "out": [
+          "bert_features",
+          "subtokens"
+        ]
+      },
+      {
+        "class_name": "squad_bert_mapping",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "context_raw",
+          "bert_features",
+          "subtokens"
+        ],
+        "out": [
+          "subtok2chars",
+          "char2subtoks"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_preprocessor",
+        "do_lower_case": "{lowercase}",
+        "in": [
+          "ans_raw",
+          "ans_raw_start",
+          "char2subtoks"
+        ],
+        "out": [
+          "ans",
+          "ans_start",
+          "ans_end"
+        ]
+      },
+      {
+        "class_name": "torch_transformers_squad",
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "attention_probs_keep_prob": 0.45,
+        "hidden_keep_prob": 0.56, 
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 2.78e-05
+        },
+        "learning_rate_drop_patience": 2,
+        "learning_rate_drop_div": 1.5,
+        "in": [
+          "bert_features"
+        ],
+        "in_y": [
+          "ans_start",
+          "ans_end"
+        ],
+        "out": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "logits"
+        ]
+      },
+      {
+        "class_name": "squad_bert_ans_postprocessor",
+        "in": [
+          "ans_start_predicted",
+          "ans_end_predicted",
+          "context_raw",
+          "bert_features",
+          "subtok2chars",
+          "subtokens"
+        ],
+        "out": [
+          "ans_predicted",
+          "ans_start_predicted",
+          "ans_end_predicted"
+        ]
+      }
+    ],
+    "out": [
+      "ans_predicted",
+      "ans_start_predicted",
+      "logits"
+    ]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": [
+          "ans",
+          "ans_predicted"
+        ]
+      }
+    ],
+    "tensorboard_log_dir": "{MODEL_PATH}/logs",
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false, 
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+} 
diff --git a/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_infer.json b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_infer.json
new file mode 100644
index 0000000000..5c6171311c
--- /dev/null
+++ b/deeppavlov/configs/squad/squad_ru_convers_distilrubert_6L_infer.json
@@ -0,0 +1,76 @@
+{
+  "dataset_reader": {
+    "class_name": "squad_dataset_reader",
+    "dataset": "SberSQuADClean",
+    "url": "http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz",
+    "data_path": "{DOWNLOADS_PATH}/squad_ru_clean/"
+  },
+  "dataset_iterator": {
+    "class_name": "squad_iterator",
+    "seed": 1337,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": ["context_raw", "question_raw"],
+    "in_y": ["ans_raw", "ans_raw_start"],
+    "pipe": [
+        {
+        "class_name": "torch_transformers_squad_infer",
+        "lang": "ru", 
+        "batch_size": 128,
+        "squad_model_config": "{CONFIGS_PATH}/squad/squad_ru_convers_distilrubert_6L.json",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": "{lowercase}",
+        "max_seq_length": 256,
+        "in": ["context_raw", "question_raw"],
+        "out": ["ans_predicted", "ans_start_predicted", "logits"]
+        }
+    ],
+    "out": ["ans_predicted", "ans_start_predicted", "logits"]
+  },
+  "train": {
+    "show_examples": false,
+    "evaluation_targets": [
+      "valid"
+    ],
+    "log_every_n_batches": 250,
+    "val_every_n_batches": 500,
+    "batch_size": 10,
+    "validation_patience": 10,
+    "metrics": [
+      {
+        "name": "squad_v2_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v2_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_f1",
+        "inputs": ["ans_raw", "ans_predicted"]
+      },
+      {
+        "name": "squad_v1_em",
+        "inputs": ["ans_raw", "ans_predicted"]
+      }
+    ]
+  },
+  "metadata": {
+    "variables": {
+      "lowercase": false, 
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "TRANSFORMER": "DeepPavlov/distilrubert-base-cased-conversational",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/squad_ru_convers_distilrubert_6L",
+      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_ru_convers_distilrubert_6L.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst
index 285c781991..9e68437742 100644
--- a/docs/features/models/bert.rst
+++ b/docs/features/models/bert.rst
@@ -29,6 +29,8 @@ We have trained BERT-base model for other languages and domains:
    `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12_pt.tar.gz>`__
 -  Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12.tar.gz>`__,
    `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/ru_conversational_cased_L-12_H-768_A-12_pt.tar.gz>`__
+-  Conversational DistilRuBERT, Russian, cased, 6-layer, 768-hidden, 12-heads, 135.4M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-6_H-768_A-12_pt.tar.gz>`__
+-  Conversational DistilRuBERT-tiny, Russian, cased, 2-layer, 768-hidden, 12-heads, 107M parameters: `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/distil_ru_conversational_cased_L-2_H-768_A-12_pt.tar.gz>`__
 -  Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12.tar.gz>`__,
    `[deeppavlov_pytorch] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt.tar.gz>`__
 -  Sentence RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] <http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_ru_cased_L-12_H-768_A-12.tar.gz>`__,
@@ -50,6 +52,13 @@ English cased version of BERT-base as initialization for English Conversational
 Conversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_.
 We assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT.
 
+Conversational DistilRuBERT (6 transformer layers) and DistilRuBERT-tiny (2 transformer layers) were trained on the same data as Conversational RuBERT and were highly inspired by DistilBERT [13]_. Namely, the Distil* models (students) used the pretrained Conversational RuBERT as a teacher and were trained with a linear combination of the following losses:
+
+1. Masked language modeling loss (between the student's output logits for tokens and their true labels)
+2. Kullback-Leibler divergence (between student and teacher output logits)
+3. Cosine embedding loss (between averaged hidden states of the teacher and hidden states of the student)
+4. Mean squared error loss (between averaged attention maps of the teacher and attention maps of the student)
+
 Sentence Multilingual BERT is a representation-based sentence encoder for 101 languages of Multilingual BERT.
 It is initialized with Multilingual BERT and then fine-tuned on english MultiNLI [9]_ and on dev set of multilingual XNLI [10]_.
 Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_.
@@ -196,3 +205,4 @@ the :doc:`config </intro/configuration>` file must be changed to match new BERT
 .. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053
 .. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326
 .. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084
+.. [13] Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108.
diff --git a/docs/features/overview.rst b/docs/features/overview.rst
index 31e822ff89..376b780cca 100644
--- a/docs/features/overview.rst
+++ b/docs/features/overview.rst
@@ -20,27 +20,31 @@ The second model reproduces architecture from the paper `Application
 of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition <https://arxiv.org/pdf/1709.09686.pdf>`__
 which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf.
 
-+---------------------------------------------------------+-------+-----------------------------------------------------------------------------+-------------+
-| Dataset                                                 | Lang  | Model                                                                       |   Test F1   |
-+=========================================================+=======+=============================================================================+=============+
-| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                         |    98.1     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-| (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                   |    95.1     |
-+---------------------------------------------------------+-------+-----------------------------------------------------------------------------+-------------+
-| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`   |    88.8     |
-+                                                         +-------+-----------------------------------------------------------------------------+-------------+
-|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`             |    88.6     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_ontonotes.json <ner/ner_ontonotes.json>`                       |    87.1     |
-+---------------------------------------------------------+       +-----------------------------------------------------------------------------+-------------+
-| ConLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`             |    91.7     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_conll2003_torch_bert.json <ner/ner_conll2003_torch_bert.json>` |    88.6     |
-+                                                         +       +-----------------------------------------------------------------------------+-------------+
-|                                                         |       | :config:`ner_conll2003.json <ner/ner_conll2003.json>`                       |    89.9     |
-+---------------------------------------------------------+       +-----------------------------------------------------------------------------+-------------+
-| DSTC2                                                   |       | :config:`ner_dstc2.json <ner/ner_dstc2.json>`                               |    97.1     |
-+---------------------------------------------------------+-------+-----------------------------------------------------------------------------+-------------+
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
+| Dataset                                                 | Lang  | Model                                                                                      |   Test F1   |
++=========================================================+=======+============================================================================================+=============+
+| Persons-1000 dataset with additional LOC and ORG markup | Ru    | :config:`ner_rus_bert.json <ner/ner_rus_bert.json>`                                        |    98.1     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+| (Collection 3)                                          |       | :config:`ner_rus.json <ner/ner_rus.json>`                                                  |    95.1     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_rus_convers_distilrubert_2L.json <ner/ner_rus_convers_distilrubert_2L.json>`  |  94.2 ± 0.2 |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_rus_convers_distilrubert_6L.json <ner/ner_rus_convers_distilrubert_6L.json>`  |  96.4 ± 0.2 |
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
+| Ontonotes                                               | Multi | :config:`ner_ontonotes_bert_mult.json <ner/ner_ontonotes_bert_mult.json>`                  |    88.8     |
++                                                         +-------+--------------------------------------------------------------------------------------------+-------------+
+|                                                         | En    | :config:`ner_ontonotes_bert.json <ner/ner_ontonotes_bert.json>`                            |    88.6     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_ontonotes.json <ner/ner_ontonotes.json>`                                      |    87.1     |
++---------------------------------------------------------+       +--------------------------------------------------------------------------------------------+-------------+
+| ConLL-2003                                              |       | :config:`ner_conll2003_bert.json <ner/ner_conll2003_bert.json>`                            |    91.7     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_conll2003_torch_bert.json <ner/ner_conll2003_torch_bert.json>`                |    88.6     |
++                                                         +       +--------------------------------------------------------------------------------------------+-------------+
+|                                                         |       | :config:`ner_conll2003.json <ner/ner_conll2003.json>`                                      |    89.9     |
++---------------------------------------------------------+       +--------------------------------------------------------------------------------------------+-------------+
+| DSTC2                                                   |       | :config:`ner_dstc2.json <ner/ner_dstc2.json>`                                              |    97.1     |
++---------------------------------------------------------+-------+--------------------------------------------------------------------------------------------+-------------+
 
 Slot filling models :doc:`[docs] </features/models/slot_filling>`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -63,61 +67,65 @@ BiLSTM with self-attention and other models are presented. The model also allows
 Several pre-trained models are available and presented in Table below.
 
 
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Task             | Dataset            | Lang | Model                                                                                           | Metric      | Valid  | Test   | Downloads |
-+==================+====================+======+=================================================================================================+=============+========+========+===========+
-| 28 intents       | `DSTC 2`_          | En   | :config:`DSTC 2 emb <classifiers/intents_dstc2.json>`                                           | Accuracy    | 0.7613 | 0.7733 |  800 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Wiki emb <classifiers/intents_dstc2_big.json>`                                         |             | 0.9629 | 0.9617 |  8.5 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`BERT <classifiers/intents_dstc2_bert.json>`                                            |             | 0.9673 | 0.9636 |  800 Mb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| 7 intents        | `SNIPS-2017`_ [1]_ |      | :config:`DSTC 2 emb <classifiers/intents_snips.json>`                                           | F1-macro    | 0.8591 |    --  |  800 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Wiki emb <classifiers/intents_snips_big.json>`                                         |             | 0.9820 |    --  |  8.5 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Tfidf + SelectKBest + PCA + Wiki emb <classifiers/intents_snips_sklearn.json>`         |             | 0.9673 |    --  |  8.6 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Wiki emb weighted by Tfidf <classifiers/intents_snips_tfidf_weighted.json>`            |             | 0.9786 |    --  |  8.5 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Insult detection | `Insults`_         |      | :config:`Reddit emb <classifiers/insults_kaggle.json>`                                          | ROC-AUC     | 0.9263 | 0.8556 |  6.2 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                   |             | 0.9255 | 0.8612 |  1200 Mb  |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`English Conversational BERT <classifiers/insults_kaggle_conv_bert.json>`               |             | 0.9389 | 0.8941 |  1200 Mb  |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`English BERT on PyTorch <classifiers/insults_kaggle_bert_torch.json>`                  |             | 0.9329 | 0.877  |  1.1 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| 5 topics         | `AG News`_         |      | :config:`Wiki emb <classifiers/topic_ag_news.json>`                                             | Accuracy    | 0.8922 | 0.9059 |  8.5 Gb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Intent           |`Yahoo-L31`_        |      | :config:`Yahoo-L31 on conversational BERT <classifiers/yahoo_convers_vs_info_bert.json>`        | ROC-AUC     | 0.9436 |   --   |  1200 Mb  |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment        |`SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`       | Accuracy    | 0.6456 | 0.6715 |  400 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`5-classes SST on multilingual BERT <classifiers/sentiment_sst_multi_bert.json>`        |             | 0.5738 | 0.6024 |  660 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`3-classes SST SWCNN on PyTorch <classifiers/sst_torch_swcnn.json>`                     |             | 0.7379 | 0.6312 |  4.3 Mb   |
-+                  +--------------------+      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |`Yelp`_             |      | :config:`5-classes Yelp on conversational BERT <classifiers/sentiment_yelp_conv_bert.json>`     |             | 0.6925 | 0.6842 |  400 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`5-classes Yelp on multilingual BERT <classifiers/sentiment_yelp_multi_bert.json>`      |             | 0.5896 | 0.5874 |  660 Mb   |
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Sentiment        |`Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`               |             | 0.9965 | 0.9961 |  6.2 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`RuWiki+Lenta emb with preprocessing <classifiers/sentiment_twitter_preproc.json>`      |             | 0.7823 | 0.7759 |  6.2 Gb   |
-+                  +--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-|                  |`RuSentiment`_      |      | :config:`RuWiki+Lenta emb <classifiers/rusentiment_cnn.json>`                                   | F1-weighted | 0.6541 | 0.7016 |  6.2 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Twitter emb super-convergence <classifiers/rusentiment_bigru_superconv.json>` [2]_     |             | 0.7301 | 0.7576 |  3.4 Gb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`ELMo <classifiers/rusentiment_elmo_twitter_cnn.json>`                                  |             | 0.7519 | 0.7875 |  700 Mb   |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                               |             | 0.6809 | 0.7193 |  1900 Mb  |
-+                  +                    +      +-------------------------------------------------------------------------------------------------+             +--------+--------+-----------+
-|                  |                    |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                     |             | 0.7548 | 0.7742 |  657 Mb   |
-+------------------+--------------------+      +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
-| Intent           |Ru like`Yahoo-L31`_ |      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`      | ROC-AUC     | 0.9412 |   --   |  700 Mb   |
-+------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Task             | Dataset             | Lang | Model                                                                                              | Metric      | Valid            | Test            | Downloads |
++==================+=====================+======+====================================================================================================+=============+==================+=================+===========+
+| 28 intents       | `DSTC 2`_           | En   | :config:`DSTC 2 emb <classifiers/intents_dstc2.json>`                                              | Accuracy    | 0.7613           | 0.7733          |  800 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb <classifiers/intents_dstc2_big.json>`                                            |             | 0.9629           | 0.9617          |  8.5 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`BERT <classifiers/intents_dstc2_bert.json>`                                               |             | 0.9673           | 0.9636          |  800 Mb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| 7 intents        | `SNIPS-2017`_ [1]_  |      | :config:`DSTC 2 emb <classifiers/intents_snips.json>`                                              | F1-macro    | 0.8591           |    --           |  800 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb <classifiers/intents_snips_big.json>`                                            |             | 0.9820           |    --           |  8.5 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Tfidf + SelectKBest + PCA + Wiki emb <classifiers/intents_snips_sklearn.json>`            |             | 0.9673           |    --           |  8.6 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Wiki emb weighted by Tfidf <classifiers/intents_snips_tfidf_weighted.json>`               |             | 0.9786           |    --           |  8.5 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Insult detection | `Insults`_          |      | :config:`Reddit emb <classifiers/insults_kaggle.json>`                                             | ROC-AUC     | 0.9263           | 0.8556          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English BERT <classifiers/insults_kaggle_bert.json>`                                      |             | 0.9255           | 0.8612          |  1200 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English Conversational BERT <classifiers/insults_kaggle_conv_bert.json>`                  |             | 0.9389           | 0.8941          |  1200 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`English BERT on PyTorch <classifiers/insults_kaggle_bert_torch.json>`                     |             | 0.9329           | 0.877           |  1.1 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| 5 topics         | `AG News`_          |      | :config:`Wiki emb <classifiers/topic_ag_news.json>`                                                | Accuracy    | 0.8922           | 0.9059          |  8.5 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Intent           | `Yahoo-L31`_        |      | :config:`Yahoo-L31 on conversational BERT <classifiers/yahoo_convers_vs_info_bert.json>`           | ROC-AUC     | 0.9436           |   --            |  1200 Mb  |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Sentiment        | `SST`_              |      | :config:`5-classes SST on conversational BERT <classifiers/sentiment_sst_conv_bert.json>`          | Accuracy    | 0.6456           | 0.6715          |  400 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`5-classes SST on multilingual BERT <classifiers/sentiment_sst_multi_bert.json>`           |             | 0.5738           | 0.6024          |  660 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`3-classes SST SWCNN on PyTorch <classifiers/sst_torch_swcnn.json>`                        |             | 0.7379           | 0.6312          |  4.3 Mb   |
++                  +---------------------+      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  | `Yelp`_             |      | :config:`5-classes Yelp on conversational BERT <classifiers/sentiment_yelp_conv_bert.json>`        |             | 0.6925           | 0.6842          |  400 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`5-classes Yelp on multilingual BERT <classifiers/sentiment_yelp_multi_bert.json>`         |             | 0.5896           | 0.5874          |  660 Mb   |
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Sentiment        | `Twitter mokoron`_  | Ru   | :config:`RuWiki+Lenta emb w/o preprocessing <classifiers/sentiment_twitter.json>`                  |             | 0.9965           | 0.9961          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`RuWiki+Lenta emb with preprocessing <classifiers/sentiment_twitter_preproc.json>`         |             | 0.7823           | 0.7759          |  6.2 Gb   |
++                  +---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+|                  | `RuSentiment`_      |      | :config:`RuWiki+Lenta emb <classifiers/rusentiment_cnn.json>`                                      | F1-weighted | 0.6541           | 0.7016          |  6.2 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Twitter emb super-convergence <classifiers/rusentiment_bigru_superconv.json>` [2]_        |             | 0.7301           | 0.7576          |  3.4 Gb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`ELMo <classifiers/rusentiment_elmo_twitter_cnn.json>`                                     |             | 0.7519           | 0.7875          |  700 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Multi-language BERT <classifiers/rusentiment_bert.json>`                                  |             | 0.6809           | 0.7193          |  1900 Mb  |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational RuBERT <classifiers/rusentiment_convers_bert.json>`                        |             | 0.7548           | 0.7742          |  657 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational DistilRuBERT-tiny <classifiers/rusentiment_convers_distilrubert_2L.json>`  |             | 0.72 ± 0.0016    | 0.74 ± 0.01     |  690 Mb   |
++                  +                     +      +----------------------------------------------------------------------------------------------------+             +------------------+-----------------+-----------+
+|                  |                     |      | :config:`Conversational DistilRuBERT-base <classifiers/rusentiment_convers_distilrubert_6L.json>`  |             | 0.73 ± 0.003     | 0.75 ± 0.013    |  1.0 Gb   |
++------------------+---------------------+      +----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
+| Intent           | Ru like `Yahoo-L31`_|      | :config:`Conversational vs Informational on ELMo <classifiers/yahoo_convers_vs_info.json>`         | ROC-AUC     | 0.9412           |   --            |  700 Mb   |
++------------------+---------------------+------+----------------------------------------------------------------------------------------------------+-------------+------------------+-----------------+-----------+
 
 .. [1] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018.
 .. [2] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018.
@@ -231,11 +239,11 @@ Available pre-trained models for ranking:
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
    | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_mt_word2vec_smn <ranking/ranking_ubuntu_v2_mt_word2vec_smn.json>`                         |   68.56   | 67.91 | 81.49 | 95.63 |  1609 MB  |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-   | `Ubuntu V2`_      |:config:`ranking_ubuntu_v2_bert_uncased <ranking/ranking_ubuntu_v2_bert_uncased.json>`                                |   66.5    | 66.6  | --    | --    |  396 MB   |
+   | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_bert_uncased <ranking/ranking_ubuntu_v2_bert_uncased.json>`                               |   66.5    | 66.6  | --    | --    |  396 MB   |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-   | `Ubuntu V2`_      |:config:`ranking_ubuntu_v2_bert_uncased on PyTorch <ranking/ranking_ubuntu_v2_torch_bert_uncased.json>`               |   65.73   | 65.74 | --    | --    |  1.1 Gb   |
+   | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_bert_uncased on PyTorch <ranking/ranking_ubuntu_v2_torch_bert_uncased.json>`              |   65.73   | 65.74 | --    | --    |  1.1 Gb   |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
-   | `Ubuntu V2`_      |:config:`ranking_ubuntu_v2_bert_sep <ranking/ranking_ubuntu_v2_bert_sep.json>`                                        |   66.5    | 66.5  | --    | --    |  396 MB   |
+   | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_bert_sep <ranking/ranking_ubuntu_v2_bert_sep.json>`                                       |   66.5    | 66.5  | --    | --    |  396 MB   |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
    | `Ubuntu V2`_      | :config:`ranking_ubuntu_v2_mt_interact <ranking/ranking_ubuntu_v2_mt_interact.json>`                                 |   59.2    | 58.7  | --    | --    |  8906 MB  |
    +-------------------+----------------------------------------------------------------------------------------------------------------------+-----------+-------+-------+-------+-----------+
@@ -247,15 +255,19 @@ Available pre-trained models for paraphrase identification:
 .. table::
    :widths: auto
 
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
-   |    Dataset             |Model config                                                                                   | Val (accuracy)| Test (accuracy)| Val (F1)| Test (F1)| Val (log_loss)| Test (log_loss)|Downloads |
-   +========================+===============================================================================================+===============+================+=========+==========+===============+================+==========+
-   |`paraphraser.ru`_       |:config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>` |   83.8        |   75.4         |   87.9  |  80.9    |   0.468       |   0.616        |5938M     |
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
-   |`paraphraser.ru`_       |:config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                     |   87.4        |   79.3         |   90.2  |  83.4    |   --          |   --           |1330M     |
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
-   |`paraphraser.ru`_       |:config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                              |   90.2        |   84.9         |   92.3  |  87.9    |   --          |   --           |1325M     |
-   +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   |    Dataset             | Model config                                                                                         | Val (accuracy) | Test (accuracy) | Val (F1)     | Test (F1)    | Val (log_loss) | Test (log_loss) | Downloads |
+   +========================+======================================================================================================+================+=================+==============+==============+================+=================+===========+
+   | `paraphraser.ru`_      | :config:`paraphrase_ident_paraphraser_ft <ranking/paraphrase_ident_paraphraser_interact.json>`       |   83.8         |   75.4          |   87.9       |  80.9        |   0.468        |   0.616         | 5938M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_bert_multilingual <classifiers/paraphraser_bert.json>`                           |   87.4         |   79.3          |   90.2       |  83.4        |   --           |   --            | 1330M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphrase_rubert <classifiers/paraphraser_rubert.json>`                                    |   90.2         |   84.9          |   92.3       |  87.9        |   --           |   --            | 1325M     |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_2L <classifiers/paraphraser_convers_distilrubert_2L.json>` |  79.4 ± 0.01   |  67.5 ± 0.006   | 84.4 ± 0.04  | 76.2 ± 0.006 |   --           |   --            | 618M      |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
+   | `paraphraser.ru`_      | :config:`paraphraser_convers_distilrubert_6L <classifiers/paraphraser_convers_distilrubert_6L.json>` |  87.1 ± 0.01   |  78.0 ± 0.01    | 90.0 ± 0.08  | 82.9 ± 0.003 |   --           |   --            | 930M      |
+   +------------------------+------------------------------------------------------------------------------------------------------+----------------+-----------------+--------------+--------------+----------------+-----------------+-----------+
 
 .. _`paraphraser.ru`: https://paraphraser.ru/
 
@@ -319,25 +331,29 @@ BERT-based model is described in  `BERT: Pre-training of Deep Bidirectional Tran
 R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching Networks
 <https://www.microsoft.com/en-us/research/publication/mcr/>`__.
 
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    Dataset    | Model config                                                           | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
-+===============+========================================================================+=======+================+=================+=================+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT <squad/squad_bert.json>`                      |  en   |     80.88      |     88.49       |     806Mb       |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov BERT on PyTorch <squad/squad_torch_bert.json>`     |  en   |    80.79       |   88.30         |     1.1 Gb      |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-| `SQuAD-v1.1`_ | :config:`DeepPavlov R-Net <squad/squad.json>`                          |  en   |     71.49      |     80.34       |     ~2.5Gb      |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov RuBERT <squad/squad_ru_bert_infer.json>`           |  ru   |  66.30+-0.24   |   84.60+-0.11   |   1325Mb        |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov multilingual BERT <squad/squad_ru_bert_infer.json>`|  ru   |  64.35+-0.39   |   83.39+-0.08   |   1323Mb        |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|  SDSJ Task B  | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                       |  ru   |     60.62      |     80.04       |     ~5Gb        |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    `DRCD`_    | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>` |  ch   |     84.86      |     89.03       |     630Mb       |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
-|    `DRCD`_    | :config:`DeepPavlov Chinese BERT <squad/squad_zh_bert_zh.json>`        |  ch   |     84.19      |     89.23       |     362Mb       |
-+---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    Dataset     | Model config                                                                                | lang  |    EM (dev)    |    F-1 (dev)    |    Downloads    |
++================+=============================================================================================+=======+================+=================+=================+
+| `SQuAD-v1.1`_  | :config:`DeepPavlov BERT <squad/squad_bert.json>`                                           |  en   |     80.88      |     88.49       |     806Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SQuAD-v1.1`_  | :config:`DeepPavlov BERT on PyTorch <squad/squad_torch_bert.json>`                          |  en   |    80.79       |     88.30       |     1.1 Gb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SQuAD-v1.1`_  | :config:`DeepPavlov R-Net <squad/squad.json>`                                               |  en   |     71.49      |     80.34       |     ~2.5Gb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov RuBERT <squad/squad_ru_bert_infer.json>`                                |  ru   |  66.30 ± 0.24  |   84.60 ± 0.11  |     1325Mb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov multilingual BERT <squad/squad_ru_bert_infer.json>`                     |  ru   |  64.35 ± 0.39  |   83.39 ± 0.08  |     1323Mb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov R-Net <squad/squad_ru.json>`                                            |  ru   |     60.62      |     80.04       |     ~5Gb        |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-tiny <squad/squad_ru_convers_distilrubert_2L_infer.json>`  |  ru   |  48.3 ± 0.41   |  68.9 ± 0.39    |     867Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+| `SDSJ Task B`_ | :config:`DeepPavlov DistilRuBERT-base <squad/squad_ru_convers_distilrubert_6L_infer.json>`  |  ru   |  61.77 ± 0.25  |  80.39 ± 0.21   |     1.18Gb      |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    `DRCD`_     | :config:`DeepPavlov multilingual BERT <squad/squad_zh_bert_mult.json>`                      |  ch   |     84.86      |     89.03       |     630Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
+|    `DRCD`_     | :config:`DeepPavlov Chinese BERT <squad/squad_zh_bert_zh.json>`                             |  ch   |     84.19      |     89.23       |     362Mb       |
++----------------+---------------------------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+
 
 In the case when answer is not necessary present in given context we have :config:`squad_noans <squad/multi_squad_noans.json>`
 model. This model outputs empty string in case if there is no answer in context.
@@ -361,31 +377,31 @@ For more scores see :doc:`full table </features/models/morphotagger>`.
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
     |    Dataset           | Model                                                                                                        | Word accuracy | Sent. accuracy | Download size (MB) |
     +======================+==============================================================================================================+===============+================+====================+
-    |`UD2.3`_ (Russian)    |`UD Pipe 2.3`_ (Straka et al., 2017)                                                                          |    93.5       |                |                    |
+    | `UD2.3`_ (Russian)   | `UD Pipe 2.3`_ (Straka et al., 2017)                                                                         |    93.5       |                |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
     |                      | `UD Pipe Future`_ (Straka et al., 2018)                                                                      |    96.90      |                |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`BERT-based model <morpho_tagger/BERT/morpho_ru_syntagrus_bert.json>`                                 |    97.83      |     72.02      |       661          |
+    |                      | :config:`BERT-based model <morpho_tagger/BERT/morpho_ru_syntagrus_bert.json>`                                |    97.83      |     72.02      |       661          |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |`Pymorphy`_ + `russian_tagsets`_ (first tag)                                                                  |     60.93     |      0.00      |                    |
+    |                      | `Pymorphy`_ + `russian_tagsets`_ (first tag)                                                                 |     60.93     |      0.00      |                    |
     +                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |`UD2.0`_ (Russian)    |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     93.57     |     43.04      |                    |
+    | `UD2.0`_ (Russian)   | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     93.57     |     43.04      |                    |
     +                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus.json>`                             |     95.17     |     50.58      |        48.7        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus.json>`                            |     95.17     |     50.58      |        48.7        |
     +                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Pymorphy-enhanced model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_pymorphy.json>`        |   **96.23**   |     58.00      |        48.7        |
+    |                      | :config:`Pymorphy-enhanced model <morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_pymorphy.json>`       |   **96.23**   |     58.00      |        48.7        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    | `UD2.0`_ (Czech)     |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     91.86     |     42.28      |                    |
+    | `UD2.0`_ (Czech)     | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     91.86     |     42.28      |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/morpho_cs.json>`                                                    |   **94.35**   |     51.56      |        41.8        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/morpho_cs.json>`                                                   |   **94.35**   |     51.56      |        41.8        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |`UD2.0`_ (English)    |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     92.89     |     55.75      |                    |
+    | `UD2.0`_ (English)   | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     92.89     |     55.75      |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/morpho_en.json>`                                                    |   **93.00**   |     55.18      |        16.9        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/morpho_en.json>`                                                   |   **93.00**   |     55.18      |        16.9        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |`UD2.0`_ (German)     |`UD Pipe 1.2`_ (Straka et al., 2017)                                                                          |     76.65     |     10.24      |                    |
+    | `UD2.0`_ (German)    | `UD Pipe 1.2`_ (Straka et al., 2017)                                                                         |     76.65     |     10.24      |                    |
     |                      +--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
-    |                      |:config:`Basic model <morpho_tagger/UD2.0/morpho_de.json>`                                                    |   **83.83**   |     15.25      |        18.6        |
+    |                      | :config:`Basic model <morpho_tagger/UD2.0/morpho_de.json>`                                                   |   **83.83**   |     15.25      |        18.6        |
     +----------------------+--------------------------------------------------------------------------------------------------------------+---------------+----------------+--------------------+
 
 .. _`Pymorphy`: https://pymorphy2.readthedocs.io/en/latest/
@@ -415,7 +431,7 @@ on ``ru_syntagrus`` Russian corpus (version UD 2.3).
     |                         +-------------------------------------------------------------------------------------------+---------+----------+
     |                         | `UDify (multilingual BERT)`_ (Kondratyuk, 2018)                                           | 94.8    | 93.1     |
     |                         +-------------------------------------------------------------------------------------------+---------+----------+
-    |                         |:config:`our BERT model <syntax/syntax_ru_syntagrus_bert.json>`                            | 95.2    | 93.7     |
+    |                         | :config:`our BERT model <syntax/syntax_ru_syntagrus_bert.json>`                           | 95.2    | 93.7     |
     +-------------------------+-------------------------------------------------------------------------------------------+---------+----------+
 
 .. _`UD2.3`: http://hdl.handle.net/11234/1-2895
@@ -473,13 +489,13 @@ based on its Wikipedia knowledge.
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
 | Dataset        | Model config                                                       |  Wiki dump            |   F1   | Downloads |
 +================+====================================================================+=======================+========+===========+
-| `SQuAD-v1.1`_  |:config:`ODQA <odqa/en_odqa_infer_wiki.json>`                       | enwiki (2018-02-11)   |  35.89 | 9.7Gb     |
+| `SQuAD-v1.1`_  | :config:`ODQA <odqa/en_odqa_infer_wiki.json>`                      | enwiki (2018-02-11)   |  35.89 | 9.7Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SQuAD-v1.1`_  |:config:`ODQA <odqa/en_odqa_infer_enwiki20161221.json>`             | enwiki (2016-12-21)   |  37.83 | 9.3Gb     |
+| `SQuAD-v1.1`_  | :config:`ODQA <odqa/en_odqa_infer_enwiki20161221.json>`            | enwiki (2016-12-21)   |  37.83 | 9.3Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SDSJ Task B`_ |:config:`ODQA <odqa/ru_odqa_infer_wiki.json>`                       | ruwiki (2018-04-01)   |  28.56 | 7.7Gb     |
+| `SDSJ Task B`_ | :config:`ODQA <odqa/ru_odqa_infer_wiki.json>`                      | ruwiki (2018-04-01)   |  28.56 | 7.7Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
-| `SDSJ Task B`_ |:config:`ODQA with RuBERT <odqa/ru_odqa_infer_wiki_rubert.json>`    | ruwiki (2018-04-01)   |  37.83 | 4.3Gb     |
+| `SDSJ Task B`_ | :config:`ODQA with RuBERT <odqa/ru_odqa_infer_wiki_rubert.json>`   | ruwiki (2018-04-01)   |  37.83 | 4.3Gb     |
 +----------------+--------------------------------------------------------------------+-----------------------+--------+-----------+
 
 
@@ -554,5 +570,5 @@ goal-oriented bot and a slot-filling model with Telegram UI.
 
 
 .. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250
-.. _`SDSJ Task B`: https://sdsj.sberbank.ai/2017/ru/contest.html
+.. _`SDSJ Task B`: https://arxiv.org/abs/1912.09723
 .. _`DRCD`: https://arxiv.org/abs/1806.00920
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index 9c7b642364..74ae65dedc 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -144,6 +144,18 @@
         ("classifiers/intents_sample_csv.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
         ("classifiers/intents_sample_json.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
     },
+    "distil": {  # DistilRuBERT configs; mode is a 1-tuple, as in the other groups
+        ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
+        ("classifiers/paraphraser_convers_distilrubert_6L.json", "distil", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
+        ("classifiers/rusentiment_convers_distilrubert_2L.json", "distil", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
+        ("classifiers/rusentiment_convers_distilrubert_6L.json", "distil", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
+        ("ner/ner_rus_convers_distilrubert_2L.json", "distil", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
+        ("ner/ner_rus_convers_distilrubert_6L.json", "distil", ('IP',)): [ONE_ARGUMENT_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_2L.json", "distil", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_2L_infer.json", "distil", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_6L.json", "distil", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
+        ("squad/squad_ru_convers_distilrubert_6L_infer.json", "distil", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK],
+    },
     "entity_linking": {
         ("kbqa/entity_linking_rus.json", "entity_linking",  ('IP',)):
             [