diff --git a/scripts/question_answering/albert_custom.yaml b/scripts/question_answering/albert_custom.yaml
new file mode 100644
index 0000000000..51a06681cb
--- /dev/null
+++ b/scripts/question_answering/albert_custom.yaml
@@ -0,0 +1,15 @@
+version: 1.0
+
+model:
+ name: albert_base_v2
+ framework: mxnet
+
+tuning:
+ strategy:
+ name: mycustom
+ accuracy_criterion:
+ relative: 0.02
+ exit_policy:
+ timeout: 0
+ max_trials: 1000
+ random_seed: 9527
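+# The 'mycustom' strategy above refers to MyCustomTuneStrategy defined in custom_strategy.py;
+# run_squad_albert.py imports that module so Intel Neural Compressor can find the strategy.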
diff --git a/scripts/question_answering/custom_strategy.py b/scripts/question_answering/custom_strategy.py
new file mode 100644
index 0000000000..6c236ff469
--- /dev/null
+++ b/scripts/question_answering/custom_strategy.py
@@ -0,0 +1,176 @@
+import copy
+import numpy as np
+from collections import OrderedDict
+from neural_compressor.strategy.strategy import TuneStrategy, strategy_registry
+
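+# When True, each accepted node exclusion saves a '<node_name>_error.png' plot comparing
+# per-layer relative errors before and after the exclusion.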
+plot_operator_influence = True
+
+def calc_approx_error(expected_tensor: np.ndarray, observed_tensor: np.ndarray) -> float:
+ '''
+ Calculating relative error for one tensor
+ '''
+ error = observed_tensor - expected_tensor
+ absolute_error = np.abs(error)
+ mean_absolute_error = absolute_error.mean()
+ mean_expected_value = np.abs(expected_tensor).mean()
+ error = mean_absolute_error / mean_expected_value
+ return error
+
+
+def get_approx_errors(expected_tensors, observed_tensors):
+ '''
+ Calculate relative errors for multiple tensors passed as Dict[node_name: str, Dict[node_name: str, np.ndarray]]
+ '''
+ errors = {}
+ for node_name in observed_tensors.keys():
+ expected_tensor = expected_tensors[node_name][node_name]
+ observed_tensor = observed_tensors[node_name][node_name]
+ errors[node_name] = calc_approx_error(expected_tensor, observed_tensor)
+ return errors
+
+
+@strategy_registry
+class MyCustomTuneStrategy(TuneStrategy):
+ '''INC Custom strategy definition'''
+ def __init__(self, model, conf, q_dataloader, q_func=None,
+ eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None):
+ super().__init__(
+ model,
+ conf,
+ q_dataloader,
+ q_func,
+ eval_dataloader,
+ eval_func,
+ dicts,
+ q_hooks)
+
+
+ def get_qtensors(self, quant_cfg, node_list):
+ '''
+ Generating quantized model based on configuration and capturing intermediate tensors
+ '''
+ qmodel = self.adaptor.quantize(quant_cfg, self.model, self.calib_dataloader)
+ tensors = self.adaptor.inspect_tensor(qmodel, self.calib_dataloader, node_list, [1]) # 1 is a batch index
+ return tensors['activation'][0] # 'activation' selects layer outputs (INC also stores weight tensors); [0] is the first collected batch
+
+ def next_tune_cfg(self):
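+ '''
+ Tuning flow: (1) evaluate a fully quantized model calibrated with minmax, (2) greedily fall
+ back error-prone nodes to fp32, (3) tune the calibration algorithm of the remaining nodes
+ with Bayesian optimization.
+ '''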
+ FALLBACK_DTYPE = 'fp32'
+
+ # creating base configuration - all nodes are quantized and calibrated with minmax algorithm
+ best_cfg = {}
+ best_cfg['calib_iteration'] = int(self.calib_iter[0]) # number of batches for calibration
+ best_cfg['calib_sampling_size'] = int(self.calib_sampling_size[0]) # number of samples for calibration (multiplicity of batch)
+ nodes_cfg = OrderedDict()
+ nodes_cfg_idx = {}
+ for node_key, cfgs in self.opwise_tune_cfgs.items():
+ for i, cfg in enumerate(cfgs):
+ if cfg['activation']['algorithm'] == 'minmax':
+ nodes_cfg_idx[node_key] = i
+ break
+ nodes_cfg[node_key] = cfg
+ best_cfg['op'] = nodes_cfg
+
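+ # next_tune_cfg() is a generator: the base TuneStrategy quantizes and evaluates every yielded
+ # config, and the outcome of the latest evaluation is available as self.last_tune_result.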
+ yield best_cfg
+
+ # If the fully quantized model does not meet the requirements, we proceed to exclude some nodes
+
+ # Collecting tensors from the original model - expected tensors
+ node_list = [op_name for (op_name, op_type) in best_cfg['op'].keys()]
+ f32_tensors = self.adaptor.inspect_tensor(self.model, self.calib_dataloader, node_list, [1])
+ f32_tensors = f32_tensors['activation'][0]
+
+ # Collecting tensors from the fully quantized model
+ q_tensors = self.get_qtensors(best_cfg, node_list)
+ approx_errors = get_approx_errors(f32_tensors, q_tensors)
+
+ # best_cfg['op'] is an OrderedDict whose element order should correspond to the nodes'
+ # order in the computational graph
+ for node_key, cfg in best_cfg['op'].items():
+ # Node's key in INC is its name + its operator
+ node_name, node_op = node_key
+ # Checking what configuration options are available for this particular node
+ capabilities = self.opwise_tune_space[node_key]['activation']['dtype']
+ # If a particular node can be excluded from quantization ('fp32' in capabilities)
+ # and its current error exceeds the threshold, we check what accuracy improvement
+ # would be achieved by this exclusion
+ if FALLBACK_DTYPE in capabilities and approx_errors[node_name] > 0.06:
+ original_dtype = cfg['activation']['dtype']
+ cfg['activation']['dtype'] = FALLBACK_DTYPE # Exclude the node from quantization
+
+ # Collecting tensors for a new configuration with the current node excluded
+ q_tensors = self.get_qtensors(best_cfg, node_list)
+ # Calculating errors for the new configuration
+ new_approx_errors = get_approx_errors(f32_tensors, q_tensors)
+ # Calculating error differences for every node in a model
+ err_diffs = {}
+ for tensor_node_name in new_approx_errors.keys():
+ diff = approx_errors[tensor_node_name] - new_approx_errors[tensor_node_name]
+ err_diffs[tensor_node_name] = diff
+ err_diffs_arr = np.array(list(err_diffs.values()))
+
+ # If the total error reduction across the following layers exceeds the threshold value,
+ # we keep the node excluded
+ threshold_sum_error_layers = err_diffs_arr.size * 0.01
+ if err_diffs_arr.sum() >= threshold_sum_error_layers:
+ before = approx_errors
+ after = approx_errors.copy()
+ after.update(new_approx_errors)
+ if plot_operator_influence:
+ import matplotlib.pyplot as plt
+ plt.figure()
+ plt.plot(before.values(), marker='o', markersize=2.5, label='Before')
+ plt.plot(after.values(), marker='o', markersize=2.5, label='After')
+ plt.ylabel('Relative error')
+ plt.xlabel('Layer')
+ plt.legend()
+ plt.savefig(f'{node_name}_error.png')
+
+ approx_errors.update(new_approx_errors)
+ nodes_cfg_idx.pop(node_key) # Keep the node in fp32 and drop it from the pool tuned in the Bayesian stage
+ else:
+ cfg['activation']['dtype'] = original_dtype
+
+ yield best_cfg
+
+ # Choosing the calibration algorithm (kl or minmax) for every node that was not excluded from quantization
+ for cfg in self.bayesian_configurations(best_cfg, nodes_cfg_idx):
+ yield cfg
+
+ def bayesian_params_to_tune_configs(self, params):
+ '''
+ Build node configurations from Bayesian params by mapping each configuration index to the actual configuration
+ '''
+ node_cfgs = {}
+ for node_key, configs in self.opwise_quant_cfgs.items():
+ if node_key in params:
+ value = int(params[node_key])
+ value = min(value, len(configs) - 1)
+ node_cfgs[node_key] = copy.deepcopy(configs[value])
+ return node_cfgs
+
+ def bayesian_configurations(self, cfg_base, params_base):
+ from neural_compressor.strategy.bayesian import BayesianOptimization
+
+ # For each node we specify the range of possible values (each value is treated as an index into the node's configurations)
+ pbounds = {}
+ for node_key, configs in self.opwise_quant_cfgs.items():
+ if node_key in params_base and len(configs) > 1:
+ pbounds[node_key] = (0, len(configs))
+
+ cfg = copy.deepcopy(cfg_base)
+ if len(pbounds) == 0: # if there is nothing to be optimized, we finish
+ cfg['op'].update(self.bayesian_params_to_tune_configs(params_base))
+ return
+
+ bayes_opt = BayesianOptimization(pbounds=pbounds, random_seed=self.cfg.tuning.random_seed)
+ bayes_opt._space.register(params_base, self.last_tune_result[0]) # registering the outcome of current configuration
+ while True:
+ # Generating next configuration
+ params = bayes_opt.gen_next_params()
+ cfg['op'].update(self.bayesian_params_to_tune_configs(params))
+ yield cfg
+ try:
+ # Registering the outcome
+ bayes_opt._space.register(params, self.last_tune_result[0])
+ except KeyError:
+ pass
diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py
index b2ef10640b..db1cce2e00 100644
--- a/scripts/question_answering/models.py
+++ b/scripts/question_answering/models.py
@@ -180,6 +180,7 @@ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
self.answerable_scores.add(nn.Dense(2, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
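+ # Set by quantize_and_calibrate(); when present it replaces self.backbone
+ # in forward() and inference() below.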
+ self.quantized_backbone = None
def get_start_logits(self, contextual_embedding, p_mask):
"""
@@ -287,10 +288,14 @@ def forward(self, tokens, token_types, valid_length, p_mask, start_position):
Shape (batch_size, sequence_length)
answerable_logits
"""
+ backbone_net = self.backbone
+ if self.quantized_backbone is not None:
+ backbone_net = self.quantized_backbone
+
if self.use_segmentation:
- contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ contextual_embeddings = backbone_net(tokens, token_types, valid_length)
else:
- contextual_embeddings = self.backbone(tokens, valid_length)
+ contextual_embeddings = backbone_net(tokens, valid_length)
start_logits = self.get_start_logits(contextual_embeddings, p_mask)
end_logits = self.get_end_logits(contextual_embeddings,
np.expand_dims(start_position, axis=1),
@@ -337,11 +342,16 @@ def inference(self, tokens, token_types, valid_length, p_mask,
The answerable logits. Here 0 --> answerable and 1 --> not answerable.
Shape (batch_size, sequence_length, 2)
"""
+ backbone_net = self.backbone
+ if self.quantized_backbone is not None:
+ backbone_net = self.quantized_backbone
+
# Shape (batch_size, sequence_length, C)
if self.use_segmentation:
- contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ contextual_embeddings = backbone_net(tokens, token_types, valid_length)
else:
- contextual_embeddings = self.backbone(tokens, valid_length)
+ contextual_embeddings = backbone_net(tokens, valid_length)
+
start_logits = self.get_start_logits(contextual_embeddings, p_mask)
# The shape of start_top_index will be (..., start_top_n)
start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
index 521ee15a47..a3400573d4 100644
--- a/scripts/question_answering/run_squad.py
+++ b/scripts/question_answering/run_squad.py
@@ -120,7 +120,7 @@ def parse_args():
'this will be truncated to this length. default is 64')
parser.add_argument('--pre_shuffle_seed', type=int, default=100,
help='Random seed for pre split shuffle')
- parser.add_argument('--round_to', type=int, default=None,
+ parser.add_argument('--round_to', type=int, default=8,
help='The length of padded sequences will be rounded up to be multiple'
' of this argument. When round to is set to 8, training throughput '
'may increase for mixed precision training on GPUs with TensorCores.')
@@ -147,9 +147,9 @@ def parse_args():
parser.add_argument('--max_saved_ckpt', type=int, default=5,
help='The maximum number of saved checkpoints')
parser.add_argument('--dtype', type=str, default='float32',
- help='Data type used for evaluation. Either float32 or float16. When you '
+ help='Data type used for evaluation. Either float32, float16 or int8. When you '
'use --dtype float16, amp will be turned on in the training phase and '
- 'fp16 will be used in evaluation.')
+ 'fp16 will be used in evaluation. For now int8 data type is supported on CPU only.')
args = parser.parse_args()
return args
@@ -195,13 +195,12 @@ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id
# TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
- # Here, we use round_to=8 to improve the throughput.
self.BatchifyFunction = bf.NamedTuple(ChunkFeature,
{'qas_id': bf.List(),
- 'data': bf.Pad(val=self.pad_id, round_to=8),
+ 'data': bf.Pad(val=self.pad_id, round_to=args.round_to),
'valid_length': bf.Stack(),
- 'segment_ids': bf.Pad(round_to=8),
- 'masks': bf.Pad(val=1, round_to=8),
+ 'segment_ids': bf.Pad(round_to=args.round_to),
+ 'masks': bf.Pad(val=1, round_to=args.round_to),
'is_impossible': bf.Stack(),
'gt_start': bf.Stack(),
'gt_end': bf.Stack(),
@@ -357,7 +356,7 @@ def get_squad_features(args, tokenizer, segment):
tokenizer=tokenizer,
is_training=is_training), data_examples)
logging.info('Done! Time spent:{:.2f} seconds'.format(time.time() - start))
- with open(data_cache_path, 'w') as f:
+ with open(data_cache_path, 'w', encoding='utf-8') as f:
for feature in data_features:
f.write(feature.to_json() + '\n')
@@ -815,6 +814,83 @@ def predict_extended(original_feature,
assert len(nbest_json) >= 1
return not_answerable_score, nbest[0][0], nbest_json
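+# Calibrate the float32 backbone on batches from `dataloader`, clip selected oneDNN
+# fully-connected / LayerNorm output ranges via BertLayerCollector, and attach the resulting
+# int8 backbone to net.quantized_backbone.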
+def quantize_and_calibrate(net, dataloader):
+ class QuantizationDataLoader(mx.gluon.data.DataLoader):
+ def __init__(self, dataloader, use_segmentation):
+ self._dataloader = dataloader
+ self._iter = None
+ self._use_segmentation = use_segmentation
+
+ def __iter__(self):
+ self._iter = iter(self._dataloader)
+ return self
+
+ def __next__(self):
+ batch = next(self._iter)
+ if self._use_segmentation:
+ return [batch.data, batch.segment_ids, batch.valid_length]
+ else:
+ return [batch.data, batch.valid_length]
+
+ def __del__(self):
+ del(self._dataloader)
+
+ class BertLayerCollector(mx.contrib.quantization.CalibrationCollector):
+ """Saves layer output min and max values in a dict with layer names as keys.
+ The collected min and max values will be directly used as thresholds for quantization.
+ """
+ def __init__(self, clip_min, clip_max):
+ super(BertLayerCollector, self).__init__()
+ self.clip_min = clip_min
+ self.clip_max = clip_max
+
+ def collect(self, name, op_name, arr):
+ """Callback function for collecting min and max values from an NDArray."""
+ if name not in self.include_layers:
+ return
+ arr = arr.copyto(mx.cpu()).asnumpy()
+ min_range = np.min(arr)
+ max_range = np.max(arr)
+
+ if (name.find("sg_onednn_fully_connected_eltwise") != -1 or op_name.find("LayerNorm") != -1) \
+ and max_range > self.clip_max:
+ max_range = self.clip_max
+ elif name.find('sg_onednn_fully_connected') != -1 and min_range < self.clip_min:
+ min_range = self.clip_min
+
+ if name in self.min_max_dict:
+ cur_min_max = self.min_max_dict[name]
+ self.min_max_dict[name] = (min(cur_min_max[0], min_range),
+ max(cur_min_max[1], max_range))
+ else:
+ self.min_max_dict[name] = (min_range, max_range)
+
+ calib_data = QuantizationDataLoader(dataloader, net.use_segmentation)
+ model_name = args.model_name
+ # disable specific layers in some models for the sake of accuracy
+
+ if model_name == 'google_albert_base_v2':
+ logging.warn(f"Currently quantized {model_name} shows significant accuracy drop which is not fixed yet")
+
+ exclude_layers_map = {"google_electra_large":
+ ["sg_onednn_fully_connected_eltwise_2", "sg_onednn_fully_connected_eltwise_14",
+ "sg_onednn_fully_connected_eltwise_18", "sg_onednn_fully_connected_eltwise_22",
+ "sg_onednn_fully_connected_eltwise_26"
+ ]}
+ exclude_layers = None
+ if model_name in exclude_layers_map.keys():
+ exclude_layers = exclude_layers_map[model_name]
+ net.quantized_backbone = mx.contrib.quant.quantize_net(net.backbone, quantized_dtype='auto',
+ quantize_mode='smart',
+ exclude_layers=exclude_layers,
+ exclude_layers_match=None,
+ calib_data=calib_data,
+ calib_mode='custom',
+ LayerOutputCollector=BertLayerCollector(clip_min=-50, clip_max=10),
+ num_calib_batches=10,
+ ctx=mx.cpu())
+ return net
+
def evaluate(args, last=True):
store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
@@ -826,11 +902,12 @@ def evaluate(args, last=True):
return
ctx_l = parse_ctx(args.gpus)
logging.info(
- 'Srarting inference without horovod on the first node on device {}'.format(
+ 'Starting inference without horovod on the first node on device {}'.format(
str(ctx_l)))
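+ # For int8, the network is still built in float32 here; the backbone is quantized later in
+ # eval_validation() by quantize_and_calibrate().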
+ network_dtype = args.dtype if args.dtype != 'int8' else 'float32'
cfg, tokenizer, qa_net, use_segmentation = get_network(
- args.model_name, ctx_l, args.classifier_dropout, dtype=args.dtype)
+ args.model_name, ctx_l, args.classifier_dropout, dtype=network_dtype)
if args.dtype == 'float16':
qa_net.cast('float16')
qa_net.hybridize()
@@ -860,6 +937,9 @@ def eval_validation(ckpt_name, best_eval):
num_workers=0,
shuffle=False)
+ if args.dtype == 'int8':
+ quantize_and_calibrate(qa_net, dev_dataloader)
+
log_interval = args.eval_log_interval
all_results = []
epoch_tic = time.time()
@@ -999,6 +1079,11 @@ def eval_validation(ckpt_name, best_eval):
if __name__ == '__main__':
os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
args = parse_args()
+ if args.dtype == 'int8':
+ ctx_l = parse_ctx(args.gpus)
+ if ctx_l[0] != mx.cpu() or len(ctx_l) != 1:
+ raise ValueError("Evaluation on int8 data type is supported only for CPU for now")
+
if args.do_train:
if args.dtype == 'float16':
# Initialize amp if it's fp16 training
diff --git a/scripts/question_answering/run_squad_albert.py b/scripts/question_answering/run_squad_albert.py
new file mode 100644
index 0000000000..cbff26b8f8
--- /dev/null
+++ b/scripts/question_answering/run_squad_albert.py
@@ -0,0 +1,802 @@
+"""
+Question Answering with Pretrained Language Model
+"""
+# pylint:disable=redefined-outer-name,logging-format-interpolation
+
+import os
+import json
+import time
+import logging
+import argparse
+import ast
+import functools
+import collections
+import dataclasses
+from dataclasses import dataclass
+from multiprocessing import Pool, cpu_count
+from neural_compressor.experimental import Quantization
+
+import mxnet as mx
+import numpy as np
+#from mxnet import amp
+from mxnet.lr_scheduler import PolyScheduler
+
+import gluonnlp.data.batchify as bf
+from models import ModelForQABasic, ModelForQAConditionalV1
+from eval_utils import squad_eval
+from squad_utils import SquadFeature, get_squad_examples, convert_squad_example_to_feature, get_squad_examples_from_json
+from gluonnlp.models import get_backbone
+from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, \
+ logging_config, parse_ctx
+from gluonnlp.initializer import TruncNorm
+from gluonnlp.data.sampler import SplitSampler
+from gluonnlp.utils.parameter import grad_global_norm, clip_grad_global_norm, count_parameters,\
+ deduplicate_param_dict
+
+import custom_strategy
+
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ pass
+
+mx.npx.set_np()
+
+CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
+if not os.path.exists(CACHE_PATH):
+ os.makedirs(CACHE_PATH, exist_ok=True)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Question Answering example. '
+ 'We fine-tune the pretrained model on SQuAD dataset.')
+ parser.add_argument('--model_name', type=str, default='google_albert_base_v2',
+ help='Name of the pretrained model.')
+ parser.add_argument('--do_train', action='store_true',
+ help='Whether to train the model')
+ parser.add_argument('--do_eval', action='store_true', default=True,
+ help='Whether to evaluate the model')
+ parser.add_argument('--data_dir', type=str, default='squad')
+ parser.add_argument('--version', default='2.0', choices=['1.1', '2.0'],
+ help='Version of the SQuAD dataset.')
+ parser.add_argument('--output_dir', type=str, default='squad_out',
+ help='The output directory where the model params will be written.'
+ ' default is squad_out')
+ # Communication
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str, default='-1',
+ help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.')
+ # Training hyperparameters
+ parser.add_argument('--seed', type=int, default=100, help='Random seed')
+ parser.add_argument('--log_interval', type=int, default=50,
+ help='The logging interval for training')
+ parser.add_argument('--eval_log_interval', type=int, default=10,
+ help='The logging interval for evaluation')
+ parser.add_argument('--save_interval', type=int, default=None,
+ help='the number of steps to save model parameters.'
+ 'default is every epoch')
+ parser.add_argument('--epochs', type=float, default=3.0,
+ help='Number of epochs, default is 3')
+ parser.add_argument('--num_train_steps', type=int, default=None,
+ help='The number of training steps. Note that epochs will be ignored '
+ 'if training steps are set')
+ parser.add_argument('--batch_size', type=int, default=8,
+ help='Batch size. Number of examples per gpu in a minibatch. default is 8')
+ parser.add_argument('--eval_batch_size', type=int, default=8,
+ help='Evaluate batch size. Number of examples per gpu in a minibatch for '
+ 'evaluation.')
+ parser.add_argument('--max_grad_norm', type=float, default=1.0,
+ help='Max gradient norm.')
+ parser.add_argument('--optimizer', type=str, default='adamw',
+ help='optimization algorithm. default is adamw')
+ parser.add_argument('--adam_epsilon', type=float, default=1e-6,
+ help='epsilon of AdamW optimizer')
+ parser.add_argument('--adam_betas', default='(0.9, 0.999)', metavar='B',
+ help='betas for Adam optimizer')
+ parser.add_argument('--num_accumulated', type=int, default=3,
+ help='The number of batches for gradients accumulation to '
+ 'simulate large batch size.')
+ parser.add_argument('--lr', type=float, default=2e-5,
+ help='Initial learning rate. default is 2e-5')
+ parser.add_argument('--warmup_ratio', type=float, default=0.1,
+ help='Ratio of warmup steps in the learning rate scheduler.')
+ parser.add_argument('--warmup_steps', type=int, default=None,
+ help='warmup steps. Note that either warmup_steps or warmup_ratio is set.')
+ parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--layerwise_decay', type=float, default=-1, help='Layer-wise lr decay')
+ parser.add_argument('--untunable_depth', type=float, default=-1,
+ help='Depth of untunable parameters')
+ parser.add_argument('--classifier_dropout', type=float, default=0.1,
+ help='dropout of classifier')
+ # Data pre/post processing
+ parser.add_argument('--max_seq_length', type=int, default=512,
+ help='The maximum total input sequence length after tokenization.'
+ 'Sequences longer than this will be truncated, and sequences shorter '
+ 'than this will be padded. default is 512')
+ parser.add_argument('--doc_stride', type=int, default=128,
+ help='When splitting up a long document into chunks, how much stride to '
+ 'take between chunks. default is 128')
+ parser.add_argument('--max_query_length', type=int, default=64,
+ help='The maximum number of tokens for the query. Questions longer than '
+ 'this will be truncated to this length. default is 64')
+ parser.add_argument('--pre_shuffle_seed', type=int, default=100,
+ help='Random seed for pre split shuffle')
+ parser.add_argument('--round_to', type=int, default=None,
+ help='The length of padded sequences will be rounded up to be multiple'
+ ' of this argument. When round to is set to 8, training throughput '
+ 'may increase for mixed precision training on GPUs with TensorCores.')
+ parser.add_argument('--overwrite_cache', action='store_true', default=True,
+ help='Whether to overwrite the feature cache.')
+ # Evaluation hyperparameters
+ parser.add_argument('--start_top_n', type=int, default=5,
+ help='Number of start-position candidates')
+ parser.add_argument('--end_top_n', type=int, default=5,
+ help='Number of end-position candidates corresponding '
+ 'to a start position')
+ parser.add_argument('--n_best_size', type=int, default=20, help='Top N results written to file')
+ parser.add_argument('--max_answer_length', type=int, default=30,
+ help='The maximum length of an answer that can be generated. This is '
+ 'needed because the start and end predictions are not conditioned '
+ 'on one another. default is 30')
+ parser.add_argument('--param_checkpoint', type=str, default='fintune_google_albert_base_v2_squad_2.0/google_albert_base_v2_squad2.0_8163.params',#'google_electra_base_squad2.0_8160.params',#'google_en_uncased_bert_large_squad2.0_8159.params',
+ help='The parameter checkpoint for evaluating the model')
+ parser.add_argument('--backbone_path', type=str, default=None,
+ help='The parameter checkpoint of backbone model')
+ parser.add_argument('--all_evaluate', action='store_true',
+ help='Whether to evaluate all intermediate checkpoints '
+ 'instead of only last one')
+ parser.add_argument('--max_saved_ckpt', type=int, default=5,
+ help='The maximum number of saved checkpoints')
+ parser.add_argument('--dtype', type=str, default='float32',
+ help='Data type used for evaluation. Either float32, float16 or int8. When you '
+ 'use --dtype float16, amp will be turned on in the training phase and '
+ 'fp16 will be used in evaluation. For now int8 data type is supported on CPU only.')
+ args = parser.parse_args()
+ return args
+
+
+ChunkFeature = collections.namedtuple('ChunkFeature',
+ ['qas_id',
+ 'data',
+ 'valid_length',
+ 'segment_ids',
+ 'masks',
+ 'is_impossible',
+ 'gt_start',
+ 'gt_end',
+ 'context_offset',
+ 'chunk_start',
+ 'chunk_length'])
+
+class SquadDatasetProcessor:
+
+ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
+ """
+
+ Parameters
+ ----------
+ tokenizer
+ The tokenizer
+ doc_stride
+ The stride to chunk the document
+ max_seq_length
+ Maximum length of the merged data
+ max_query_length
+ Maximum query length
+ """
+ self._tokenizer = tokenizer
+ self._doc_stride = doc_stride
+ self._max_seq_length = max_seq_length
+ self._max_query_length = max_query_length
+
+ vocab = tokenizer.vocab
+ self.pad_id = vocab.pad_id
+ # For the roberta model, take special token <s> as [CLS] and </s> as [SEP]
+ self.cls_id = vocab.bos_id if 'cls_token' not in vocab.special_token_keys else vocab.cls_id
+ self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id
+
+ # TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
+ # Here, we use round_to=8 to improve the throughput.
+ self.BatchifyFunction = bf.NamedTuple(ChunkFeature,
+ {'qas_id': bf.List(),
+ 'data': bf.Pad(val=self.pad_id, round_to=8),
+ 'valid_length': bf.Stack(),
+ 'segment_ids': bf.Pad(round_to=8),
+ 'masks': bf.Pad(val=1, round_to=8),
+ 'is_impossible': bf.Stack(),
+ 'gt_start': bf.Stack(),
+ 'gt_end': bf.Stack(),
+ 'context_offset': bf.Stack(),
+ 'chunk_start': bf.Stack(),
+ 'chunk_length': bf.Stack()})
+
+ def process_sample(self, feature: SquadFeature):
+ """Process the data to the following format.
+
+ Note that we mask all the special tokens except the CLS token. The reason for not masking
+ the CLS token is that if the question is not answerable, we will set the start and end to
+ be 0.
+
+
+        Merged:       <CLS> Question <SEP> Context <SEP>
+        Segment IDs:    0      0       0      1      1
+        Mask:           0      1       1      0      1
+
+ Here, we need to emphasize that when mask = 1, the data are actually not masked!
+
+ Parameters
+ ----------
+ feature
+ Tokenized SQuAD feature
+
+ Returns
+ -------
+ ret
+ Divide the feature into multiple chunks and extract the feature which contains
+ the following:
+ - data
+ The data that concatenates the query and the context + special tokens
+ - valid_length
+ The valid_length of the data
+ - segment_ids
+ We assign the query part as segment 0 and the context part as segment 1.
+ - masks
+ We mask all the special tokens. 1 --> not masked, 0 --> masked.
+ - is_impossible
+ Whether the provided context is impossible to answer or not.
+ - gt_start
+ The ground-truth start location of the span
+ - gt_end
+ The ground-truth end location of the span
+ - chunk_start
+ The start of the chunk
+ - chunk_length
+ The length of the chunk
+ """
+ ret = []
+ truncated_query_ids = feature.query_token_ids[:self._max_query_length]
+ chunks = feature.get_chunks(
+ doc_stride=self._doc_stride,
+ max_chunk_length=self._max_seq_length - len(truncated_query_ids) - 3)
+ for chunk in chunks:
+ data = np.array([self.cls_id] + truncated_query_ids + [self.sep_id] +
+ feature.context_token_ids[chunk.start:(chunk.start + chunk.length)] +
+ [self.sep_id], dtype=np.int32)
+ valid_length = len(data)
+ segment_ids = np.array([0] + [0] * len(truncated_query_ids) +
+ [0] + [1] * chunk.length + [1], dtype=np.int32)
+ masks = np.array([0] + [1] * len(truncated_query_ids) + [1] + [0] * chunk.length + [1],
+ dtype=np.int32)
+ context_offset = len(truncated_query_ids) + 2
+ if chunk.gt_start_pos is None and chunk.gt_end_pos is None:
+ start_pos = 0
+ end_pos = 0
+ else:
+ # Here, we increase the start and end because we put query before context
+ start_pos = chunk.gt_start_pos + context_offset
+ end_pos = chunk.gt_end_pos + context_offset
+ chunk_feature = ChunkFeature(qas_id=feature.qas_id,
+ data=data,
+ valid_length=valid_length,
+ segment_ids=segment_ids,
+ masks=masks,
+ is_impossible=chunk.is_impossible,
+ gt_start=start_pos,
+ gt_end=end_pos,
+ context_offset=context_offset,
+ chunk_start=chunk.start,
+ chunk_length=chunk.length)
+ ret.append(chunk_feature)
+ return ret
+
+ def get_train(self, features, skip_unreliable=True):
+ """Get the training dataset
+
+ Parameters
+ ----------
+ features
+ skip_unreliable
+ Whether to skip the unreliable spans in the training set
+
+ Returns
+ -------
+ train_dataset
+ num_token_answer_mismatch
+ num_unreliable
+ """
+ train_dataset = []
+ num_token_answer_mismatch = 0
+ num_unreliable = 0
+ for feature in features:
+ if feature.token_answer_mismatch:
+ num_token_answer_mismatch += 1
+ if feature.unreliable_span:
+ num_unreliable += 1
+ if skip_unreliable and feature.unreliable_span:
+ # Skip when not reliable
+ continue
+ # Process the feature
+ chunk_features = self.process_sample(feature)
+ train_dataset.extend(chunk_features)
+ return train_dataset, num_token_answer_mismatch, num_unreliable
+
+
+def get_squad_features(args, tokenizer, segment):
+ """
+ Get processed data features of SQuAD examples
+
+ Parameters
+ ----------
+ args : argparse.Namespace
+ tokenizer:
+ Tokenizer instance
+ segment: str
+ train or dev
+
+ Returns
+ -------
+ data_features
+ The list of processed data features
+ """
+ data_cache_path = os.path.join(CACHE_PATH,
+ '{}_{}_squad_{}.ndjson'.format(
+ segment, args.model_name, args.version))
+ is_training = (segment == 'train')
+ if os.path.exists(data_cache_path) and not args.overwrite_cache:
+ data_features = []
+ with open(data_cache_path, 'r') as f:
+ for line in f:
+ data_features.append(SquadFeature.from_json(line))
+ logging.info('Found cached data features, load from {}'.format(data_cache_path))
+ else:
+ data_examples = get_squad_examples(args.data_dir, segment=segment, version=args.version)
+ start = time.time()
+ num_process = min(cpu_count(), 8)
+ logging.info('Tokenize Data:')
+ with Pool(num_process) as pool:
+ data_features = pool.map(functools.partial(convert_squad_example_to_feature,
+ tokenizer=tokenizer,
+ is_training=is_training), data_examples)
+ logging.info('Done! Time spent:{:.2f} seconds'.format(time.time() - start))
+ with open(data_cache_path, 'w', encoding='utf-8') as f:
+ for feature in data_features:
+ f.write(feature.to_json() + '\n')
+
+ return data_features
+
+
+def get_network(model_name,
+ ctx_l,
+ dropout=0.1,
+ checkpoint_path=None,
+ backbone_path=None,
+ dtype='float32'):
+ """
+ Get the network that fine-tune the Question Answering Task
+
+ Parameters
+ ----------
+ model_name : str
+ The model name of the backbone model
+ ctx_l :
+ Context list of training device like [mx.gpu(0), mx.gpu(1)]
+ dropout : float
+ Dropout probability of the task specified layer
+ checkpoint_path: str
+ Path to a Fine-tuned checkpoint
+ backbone_path: str
+ Path to the backbone model to be loaded in qa_net
+
+ Returns
+ -------
+ cfg
+ tokenizer
+ qa_net
+ use_segmentation
+ """
+ # Create the network
+ use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name
+ Model, cfg, tokenizer, download_params_path, _ = \
+ get_backbone(model_name, load_backbone=not backbone_path)
+ backbone = Model.from_cfg(cfg, use_pooler=False, dtype=dtype)
+ # Load local backbone parameters if backbone_path provided.
+ # Otherwise, download backbone parameters from gluon zoo.
+
+ backbone_params_path = backbone_path if backbone_path else download_params_path
+ if checkpoint_path is None:
+ backbone.load_parameters(backbone_params_path, ignore_extra=True,
+ ctx=ctx_l, cast_dtype=True)
+ num_params, num_fixed_params\
+ = count_parameters(deduplicate_param_dict(backbone.collect_params()))
+ logging.info(
+ 'Loading Backbone Model from {}, with total/fixed parameters={}/{}'.format(
+ backbone_params_path, num_params, num_fixed_params))
+ qa_net = ModelForQAConditionalV1(backbone=backbone,
+ dropout_prob=dropout,
+ use_segmentation=use_segmentation,
+ weight_initializer=TruncNorm(stdev=0.02))
+ if checkpoint_path is None:
+ # Ignore the UserWarning during initialization,
+ # There is no need to re-initialize the parameters of backbone
+ qa_net.initialize(ctx=ctx_l)
+ else:
+ qa_net.load_parameters(checkpoint_path, ctx=ctx_l, cast_dtype=True)
+ qa_net.hybridize()
+
+ return cfg, tokenizer, qa_net, use_segmentation
+
+
+def setup_logging(args, local_rank):
+ """
+ Setup logging configuration as well as random seed
+ """
+ logging_config(args.output_dir,
+ name='finetune_squad{}'.format(args.version), # avoid race
+ overwrite_handler=True,
+ console=(local_rank == 0))
+ logging.info(args)
+ set_seed(args.seed)
+ logging.debug('Random seed set to {}'.format(args.seed))
+
+
+RawResultExtended = collections.namedtuple(
+ 'RawResultExtended',
+ ['qas_id',
+ 'start_top_logits',
+ 'start_top_index',
+ 'end_top_logits',
+ 'end_top_index',
+ 'answerable_logits'])
+
+
+def predict_extended(original_feature,
+ chunked_features,
+ results,
+ n_best_size,
+ max_answer_length=64,
+ start_top_n=5,
+ end_top_n=5):
+ """Get prediction results for SQuAD.
+
+ Start Logits: (B, N_start)
+ End Logits: (B, N_start, N_end)
+
+ Parameters
+ ----------
+ original_feature:
+ The original SquadFeature before chunked
+ chunked_features
+ List of ChunkFeatures
+ results
+ List of model predictions for span start and span end.
+ n_best_size
+ Best N results written to file
+ max_answer_length
+ Maximum length of the answer tokens.
+ start_top_n
+ Number of start-position candidates
+ end_top_n
+ Number of end-position candidates
+
+ Returns
+ -------
+ not_answerable_score
+ Model's estimate that the question is not answerable.
+ prediction
+ The final prediction.
+ nbest_json
+ n-best predictions with their probabilities.
+ """
+ not_answerable_score = 1000000 # Score for not-answerable. We set it to a large positive value.
+ # If one chunk votes for answerable, we will treat the context as answerable,
+ # Thus, the overall not_answerable_score = min(chunk_not_answerable_score)
+ all_start_idx = []
+ all_end_idx = []
+ all_pred_score = []
+ context_length = len(original_feature.context_token_ids)
+ token_max_context_score = np.full((len(chunked_features), context_length),
+ -np.inf,
+ dtype=np.float32)
+ for i, chunked_feature in enumerate(chunked_features):
+ chunk_start = chunked_feature.chunk_start
+ chunk_length = chunked_feature.chunk_length
+ for j in range(chunk_start, chunk_start + chunk_length):
+ # This is a heuristic score
+ # TODO investigate the impact
+ token_max_context_score[i, j] = min(j - chunk_start,
+ chunk_start + chunk_length - 1 - j) \
+ + 0.01 * chunk_length
+ token_max_chunk_id = token_max_context_score.argmax(axis=0)
+
+ for chunk_id, (result, chunk_feature) in enumerate(zip(results, chunked_features)):
+ # We use the log-likelihood as the not answerable score.
+ # Thus, a high score indicates that the answer is not answerable
+ cur_not_answerable_score = float(result.answerable_logits[1])
+ not_answerable_score = min(not_answerable_score, cur_not_answerable_score)
+ # Calculate the start_logits + end_logits as the overall score
+ context_offset = chunk_feature.context_offset
+ chunk_start = chunk_feature.chunk_start
+ chunk_length = chunk_feature.chunk_length
+ for i in range(start_top_n):
+ for j in range(end_top_n):
+ pred_score = result.start_top_logits[i] + result.end_top_logits[i, j]
+ start_index = result.start_top_index[i]
+ end_index = result.end_top_index[i, j]
+ # We could hypothetically create invalid predictions, e.g., predict
+ # that the start of the answer span is in the query tokens or out of
+ # the chunk. We throw out all invalid predictions.
+ if not (context_offset <= start_index < context_offset + chunk_length) or \
+ not (context_offset <= end_index < context_offset + chunk_length) or \
+ end_index < start_index:
+ continue
+ pred_answer_length = end_index - start_index + 1
+ if pred_answer_length > max_answer_length:
+ continue
+ start_idx = int(start_index - context_offset + chunk_start)
+ end_idx = int(end_index - context_offset + chunk_start)
+ if token_max_chunk_id[start_idx] != chunk_id:
+ continue
+ all_start_idx.append(start_idx)
+ all_end_idx.append(end_idx)
+ all_pred_score.append(pred_score)
+ sorted_start_end_score = sorted(zip(all_start_idx, all_end_idx, all_pred_score),
+ key=lambda args: args[-1], reverse=True)
+ nbest = []
+ context_text = original_feature.context_text
+ context_token_offsets = original_feature.context_token_offsets
+ seen_predictions = set()
+ for start_idx, end_idx, pred_score in sorted_start_end_score:
+ if len(seen_predictions) >= n_best_size:
+ break
+ pred_answer = context_text[context_token_offsets[start_idx][0]:
+ context_token_offsets[end_idx][1]]
+ seen_predictions.add(pred_answer)
+ nbest.append((pred_answer, pred_score))
+
+ # In very rare edge cases we could have no valid predictions. So we
+ # just create a nonce prediction in this case to avoid failure.
+ if len(nbest) == 0:
+ nbest.append(('', float('-inf')))
+ all_scores = np.array([ele[1] for ele in nbest], dtype=np.float32)
+ probs = np.exp(all_scores) / np.sum(np.exp(all_scores))
+ nbest_json = []
+ for i, (entry, prob) in enumerate(zip(nbest, probs)):
+ output = collections.OrderedDict()
+ output['text'] = entry[0]
+ output['probability'] = float(prob)
+ nbest_json.append(output)
+
+ assert len(nbest_json) >= 1
+ return not_answerable_score, nbest[0][0], nbest_json
+
+
+def evaluate(args, last=True):
+ ctx_l = parse_ctx(args.gpus)
+ setup_logging(args, 0)
+
+ cfg, tokenizer, qa_net, use_segmentation = get_network(
+ args.model_name, ctx_l, args.classifier_dropout, dtype='float32')
+
+ dev_data_path = os.path.join(args.data_dir, 'dev-v{}.json'.format(args.version))
+ with open(dev_data_path, 'r') as f:
+ dev_dataset = json.load(f)['data']
+ expl = get_squad_examples_from_json(dev_data_path, is_training=False)
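+ # Evaluate on a random subset of 768 dev examples; eval_validation() below is re-run for every
+ # INC tuning trial, so using the full dev set would make tuning very slow.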
+ import random
+ random.shuffle(expl)
+ expl = expl[:512+256]
+ eval_qa_ids = {qa.qas_id for qa in expl}
+ for article in dev_dataset:
+ for paragraph in article['paragraphs']:
+ new_qas = []
+ for qa in paragraph['qas']:
+ if qa['id'] in eval_qa_ids:
+ new_qas.append(qa)
+ paragraph['qas'] = new_qas
+
+ num_process = min(cpu_count(), 8)
+ with Pool(num_process) as pool:
+ dev_features = pool.map(functools.partial(convert_squad_example_to_feature,
+ tokenizer=tokenizer,
+ is_training=False), expl)
+ dataset_processor = SquadDatasetProcessor(tokenizer=tokenizer,
+ doc_stride=args.doc_stride,
+ max_seq_length=args.max_seq_length,
+ max_query_length=args.max_query_length)
+ dev_all_chunk_features = []
+ dev_chunk_feature_ptr = [0]
+ for feature in dev_features:
+ chunk_features = dataset_processor.process_sample(feature)
+ dev_all_chunk_features.extend(chunk_features)
+ dev_chunk_feature_ptr.append(dev_chunk_feature_ptr[-1] + len(chunk_features))
+
+ class QuantizationDataLoader(mx.gluon.data.DataLoader):
+ def __init__(self, dataloader, use_segmentation):
+ self._dataloader = dataloader
+ self._iter = None
+ self._use_segmentation = use_segmentation
+ self.batch_size = args.eval_batch_size
+ self._batch_sampler = dataloader._batch_sampler
+
+ def __iter__(self):
+ self._iter = iter(self._dataloader)
+ return self
+
+ def __next__(self):
+ batch = next(self._iter)
+ if self._use_segmentation:
+ return [batch.data, batch.segment_ids, batch.valid_length]
+ else:
+ return [batch.data, batch.valid_length]
+
+ def __del__(self):
+ del(self._dataloader)
+
+ dev_dataloader = QuantizationDataLoader(mx.gluon.data.DataLoader(
+ dev_all_chunk_features,
+ batchify_fn=dataset_processor.BatchifyFunction,
+ batch_size=args.eval_batch_size,
+ num_workers=0,
+ shuffle=False), qa_net.use_segmentation)
+
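+ # Intel Neural Compressor drives the tuning loop configured in albert_custom.yaml: for every
+ # configuration proposed by MyCustomTuneStrategy it quantizes the backbone on calib_dataloader
+ # and scores it with eval_func (eval_validation below), keeping the best model found.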
+ def quantize_and_calibrate(net):
+ from neural_compressor.experimental import Quantization, common
+ quantizer = Quantization('albert_custom.yaml')
+ quantizer.model = common.Model(net.backbone)
+ quantizer.calib_dataloader = dev_dataloader
+ quantizer.eval_func = eval_validation
+ net.quantized_backbone = quantizer().model
+ net.quantized_backbone.export("best_conf_inc_model")
+ return net
+
+ def eval_validation(backbone):
+ """
+ Model inference during validation or final evaluation.
+ """
+ del qa_net.quantized_backbone
+ qa_net.quantized_backbone = backbone
+
+ dev_dataloader = mx.gluon.data.DataLoader(
+ dev_all_chunk_features,
+ batchify_fn=dataset_processor.BatchifyFunction,
+ batch_size=args.eval_batch_size,
+ num_workers=0,
+ shuffle=False)
+
+ log_interval = args.eval_log_interval
+ all_results = []
+ epoch_tic = time.time()
+ tic = time.time()
+ epoch_size = len(dev_features)
+ total_num = 0
+ log_num = 0
+ best_eval = {}
+ for batch_idx, dev_batch in enumerate(grouper(dev_dataloader, len(ctx_l))):
+ # Predict for each chunk
+ for sample, ctx in zip(dev_batch, ctx_l):
+ if sample is None:
+ continue
+ # Copy the data to device
+ tokens = sample.data.as_in_ctx(ctx)
+ total_num += len(tokens)
+ log_num += len(tokens)
+ segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None
+ valid_length = sample.valid_length.as_in_ctx(ctx)
+ p_mask = sample.masks.as_in_ctx(ctx)
+ p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask
+ start_top_logits, start_top_index, end_top_logits, end_top_index, answerable_logits \
+ = qa_net.inference(tokens, segment_ids, valid_length, p_mask,
+ args.start_top_n, args.end_top_n)
+ for i, qas_id in enumerate(sample.qas_id):
+ result = RawResultExtended(qas_id=qas_id,
+ start_top_logits=start_top_logits[i].asnumpy(),
+ start_top_index=start_top_index[i].asnumpy(),
+ end_top_logits=end_top_logits[i].asnumpy(),
+ end_top_index=end_top_index[i].asnumpy(),
+ answerable_logits=answerable_logits[i].asnumpy())
+
+ all_results.append(result)
+
+ # logging
+ if (batch_idx + 1) % log_interval == 0:
+ # Output the loss of per step
+ toc = time.time()
+ logging.info(
+ '[batch {}], Time cost={:.2f},'
+ ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format(
+ batch_idx + 1, toc - tic, log_num / (toc - tic),
+ (epoch_size - total_num) / (total_num / (toc - epoch_tic)) / 3600))
+ tic = time.time()
+ log_num = 0
+
+ epoch_toc = time.time()
+ logging.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
+ total_num / (epoch_toc - epoch_tic))
+
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ no_answer_score_json = collections.OrderedDict()
+ for index, (left_index, right_index) in enumerate(zip(dev_chunk_feature_ptr[:-1],
+ dev_chunk_feature_ptr[1:])):
+ chunked_features = dev_all_chunk_features[left_index:right_index]
+ results = all_results[left_index:right_index]
+ original_feature = dev_features[index]
+ qas_ids = set([result.qas_id for result in results] +
+ [feature.qas_id for feature in chunked_features])
+ assert len(qas_ids) == 1, 'Mismatch occurred between features and results'
+ example_qas_id = list(qas_ids)[0]
+ assert example_qas_id == original_feature.qas_id, \
+ 'Mismatch occurred between original feature and chunked features'
+ not_answerable_score, best_pred, nbest_json = predict_extended(
+ original_feature=original_feature,
+ chunked_features=chunked_features,
+ results=results,
+ n_best_size=args.n_best_size,
+ max_answer_length=args.max_answer_length,
+ start_top_n=args.start_top_n,
+ end_top_n=args.end_top_n)
+ no_answer_score_json[example_qas_id] = not_answerable_score
+ all_predictions[example_qas_id] = best_pred
+ all_nbest_json[example_qas_id] = nbest_json
+
+ if args.version == '2.0':
+ exact = 'best_exact'
+ f1 = 'best_f1'
+ na_prob = no_answer_score_json
+ else:
+ exact = 'exact'
+ f1 = 'f1'
+ na_prob = None
+
+ cur_eval, revised_predictions = squad_eval(
+ dev_dataset, all_predictions, na_prob, revise=na_prob is not None)
+ logging.info('The evaluated results are {}'.format(json.dumps(cur_eval)))
+
+ cur_metrics = 0.5 * (cur_eval[exact] + cur_eval[f1])
+ if best_eval:
+ best_metrics = 0.5 * (best_eval[exact] + best_eval[f1])
+ else:
+ best_metrics = 0.
+
+ if cur_metrics > best_metrics:
+ logging.info('The evaluated files are saved in {}'.format(args.output_dir))
+ output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
+ output_nbest_file = os.path.join(args.output_dir, 'nbest_predictions.json')
+ na_prob_file = os.path.join(args.output_dir, 'na_prob.json')
+ revised_prediction_file = os.path.join(args.output_dir, 'revised_predictions.json')
+
+ with open(output_prediction_file, 'w') as of:
+ of.write(json.dumps(all_predictions, indent=4) + '\n')
+ with open(output_nbest_file, 'w') as of:
+ of.write(json.dumps(all_nbest_json, indent=4) + '\n')
+ with open(na_prob_file, 'w') as of:
+ of.write(json.dumps(no_answer_score_json, indent=4) + '\n')
+ with open(revised_prediction_file, 'w') as of:
+ of.write(json.dumps(revised_predictions, indent=4) + '\n')
+
+ best_eval = cur_eval
+ best_eval.update({'best_ckpt': 'mybest'})
+ # eval_func must return a single scalar; f1 is 'best_f1' (v2.0) or 'f1' (v1.1)
+ return best_eval[f1] / 100
+
+ qa_net.load_parameters(args.param_checkpoint, ctx=ctx_l, cast_dtype=True)
+ data_example = next(iter(dev_dataloader))
+
+ qa_net.backbone.optimize_for(*data_example, backend='ONEDNN')
+ quantized_net = quantize_and_calibrate(qa_net)
+ best_eval = eval_validation(quantized_net.quantized_backbone)
+
+ logging.info('The best evaluated results are {}'.format(json.dumps(best_eval)))
+ output_eval_results_file = os.path.join(args.output_dir, 'best_results.json')
+ with open(output_eval_results_file, 'w') as of:
+ of.write(json.dumps(best_eval, indent=4) + '\n')
+ return best_eval
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ evaluate(args, last=not args.all_evaluate)
+
+
diff --git a/setup.py b/setup.py
index baf44e6110..7b27865c84 100644
--- a/setup.py
+++ b/setup.py
@@ -133,6 +133,7 @@ def find_version(*file_paths):
'pylint_quotes',
'flake8',
'recommonmark',
+ 'sphinx>=1.5.5',
'sphinx-gallery',
'sphinx_rtd_theme',
'mxtheme',