diff --git a/scripts/question_answering/albert_custom.yaml b/scripts/question_answering/albert_custom.yaml new file mode 100644 index 0000000000..51a06681cb --- /dev/null +++ b/scripts/question_answering/albert_custom.yaml @@ -0,0 +1,15 @@ +version: 1.0 + +model: + name: albert_base_v2 + framework: mxnet + +tuning: + strategy: + name: mycustom + accuracy_criterion: + relative: 0.02 + exit_policy: + timeout: 0 + max_trials: 1000 + random_seed: 9527 diff --git a/scripts/question_answering/custom_strategy.py b/scripts/question_answering/custom_strategy.py new file mode 100644 index 0000000000..6c236ff469 --- /dev/null +++ b/scripts/question_answering/custom_strategy.py @@ -0,0 +1,176 @@ +import copy +import numpy as np +from collections import OrderedDict +from neural_compressor.strategy.strategy import TuneStrategy, strategy_registry + +plot_operator_influence = True + +def calc_approx_error(expected_tensor: np.ndarray, observed_tensor: np.ndarray) -> float: + ''' + Calculating relative error for one tensor + ''' + error = observed_tensor - expected_tensor + absolute_error = np.abs(error) + mean_absolute_error = absolute_error.mean() + mean_expected_value = np.abs(expected_tensor).mean() + error = mean_absolute_error / mean_expected_value + return error + + +def get_approx_errors(expected_tensors, observed_tensors): + ''' + Calculating relative error for multiple tensors: Dict[tensors_name: str, tensor: np.ndarray] + ''' + errors = {} + for node_name in observed_tensors.keys(): + expected_tensor = expected_tensors[node_name][node_name] + observed_tensor = observed_tensors[node_name][node_name] + errors[node_name] = calc_approx_error(expected_tensor, observed_tensor) + return errors + + +@strategy_registry +class MyCustomTuneStrategy(TuneStrategy): + '''INC Custom strategy definition''' + def __init__(self, model, conf, q_dataloader, q_func=None, + eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None): + super().__init__( + model, + conf, + q_dataloader, + q_func, + eval_dataloader, + eval_func, + dicts, + q_hooks) + + + def get_qtensors(self, quant_cfg, node_list): + ''' + Generating quantized model based on configuration and capturing intermediate tensors + ''' + qmodel = self.adaptor.quantize(quant_cfg, self.model, self.calib_dataloader) + tensors = self.adaptor.inspect_tensor(qmodel, self.calib_dataloader, node_list, [1]) # 1 is a batch index + return tensors['activation'][0] # we need to specify that we want activation (layer output) because INC stores also weight tensors + # 0 is the first batch + def next_tune_cfg(self): + FALLBACK_DTYPE = 'fp32' + + # creating base configuration - all nodes are quantized and calibrated with minmax algorithm + best_cfg = {} + best_cfg['calib_iteration'] = int(self.calib_iter[0]) # number of batches for calibration + best_cfg['calib_sampling_size'] = int(self.calib_sampling_size[0]) # number of samples for calibration (multiplicity of batch) + nodes_cfg = OrderedDict() + nodes_cfg_idx = {} + for node_key, cfgs in self.opwise_tune_cfgs.items(): + for i, cfg in enumerate(cfgs): + if cfg['activation']['algorithm'] == 'minmax': + nodes_cfg_idx[node_key] = i + break + nodes_cfg[node_key] = cfg + best_cfg['op'] = nodes_cfg + + yield best_cfg + + # If fully quantized model does not meet the requirements, we proceed to exclude some nodes + + # Collecting tensors from the original model - expected tensors + node_list = [op_name for (op_name, op_type) in best_cfg['op'].keys()] + f32_tensors = self.adaptor.inspect_tensor(self.model, self.calib_dataloader, node_list, [1]) + f32_tensors = f32_tensors['activation'][0] + + # Collecting tensors from the fully quantized model + q_tensors = self.get_qtensors(best_cfg, node_list) + approx_errors = get_approx_errors(f32_tensors, q_tensors) + + # best_cfg['op'] is an OrderedDict, which order of elements should correspond to their + # order in the computational graph + for node_key, cfg in best_cfg['op'].items(): + # Node's key in INC is its name + its operator + node_name, node_op = node_key + # Checking what configuration options are available for this particular node + capabilities = self.opwise_tune_space[node_key]['activation']['dtype'] + # If a particular node can be excluded from quanrtization ('fp32' in capabilities) + # and current error is bigger than threshold value, we check what accuracy improvement + # would be achieved by this exclusion + if FALLBACK_DTYPE in capabilities and approx_errors[node_name] > 0.06: + original_dtype = cfg['activation']['dtype'] + cfg['activation']['dtype'] = FALLBACK_DTYPE # Exclude the node from quantization + + # Collecting tensors for a new configuration with the current node excluded + q_tensors = self.get_qtensors(best_cfg, node_list) + # Calculating errors for the new configuration + new_approx_errors = get_approx_errors(f32_tensors, q_tensors) + # Calculating error differences for every node in a model + err_diffs = {} + for tensor_node_name in new_approx_errors.keys(): + diff = approx_errors[tensor_node_name] - new_approx_errors[tensor_node_name] + err_diffs[tensor_node_name] = diff + err_diffs_arr = np.array(list(err_diffs.values())) + + # If the sum of errors on the following layers is greater than the threshold value we + # keep the node excluded + threshold_sum_error_layers = err_diffs_arr.size * 0.01 + if err_diffs_arr.sum() >= threshold_sum_error_layers: + before = approx_errors + after = approx_errors.copy() + after.update(new_approx_errors) + if plot_operator_influence: + import matplotlib.pyplot as plt + plt.figure() + plt.plot(before.values(), marker='o', markersize=2.5, label='Before') + plt.plot(after.values(), marker='o', markersize=2.5, label='After') + plt.ylabel('Relative error') + plt.xlabel('Layer') + plt.legend() + plt.savefig(f'{node_name}_error.png') + + approx_errors.update(new_approx_errors) + nodes_cfg_idx.pop(node_key) # Mark node as not quantizable + else: + cfg['activation']['dtype'] = original_dtype + + yield best_cfg + + # Choosing calibration algorithm (kl or minmax) for every node which was not excluded from quantization + for cfg in self.bayesian_configurations(best_cfg, nodes_cfg_idx): + yield cfg + + def bayesian_params_to_tune_configs(self, params): + ''' + Creating configuration from params - changing configurations' indexes for real configurations + ''' + node_cfgs = {} + for node_key, configs in self.opwise_quant_cfgs.items(): + if node_key in params: + value = int(params[node_key]) + value = min(value, len(configs) - 1) + node_cfgs[node_key] = copy.deepcopy(configs[value]) + return node_cfgs + + def bayesian_configurations(self, cfg_base, params_base): + from neural_compressor.strategy.bayesian import BayesianOptimization + + # For each node we specify the possible range of values (we treat them as a configurations' index) + pbounds = {} + for node_key, configs in self.opwise_quant_cfgs.items(): + if node_key in params_base and len(configs) > 1: + pbounds[node_key] = (0, len(configs)) + + cfg = copy.deepcopy(cfg_base) + if len(pbounds) == 0: # if there is nothing to be optimized, we finish + cfg['op'].update(self.bayesian_params_to_tune_configs(params_base)) + return + + bayes_opt = BayesianOptimization(pbounds=pbounds, random_seed=self.cfg.tuning.random_seed) + bayes_opt._space.register(params_base, self.last_tune_result[0]) # registering the outcome of current configuration + while True: + # Generating next configuration + params = bayes_opt.gen_next_params() + cfg['op'].update(self.bayesian_params_to_tune_configs(params)) + yield cfg + try: + # Registering the outcome + bayes_opt._space.register(params, self.last_tune_result[0]) + except KeyError: + pass diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py index b2ef10640b..db1cce2e00 100644 --- a/scripts/question_answering/models.py +++ b/scripts/question_answering/models.py @@ -180,6 +180,7 @@ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1, self.answerable_scores.add(nn.Dense(2, flatten=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer)) + self.quantized_backbone = None def get_start_logits(self, contextual_embedding, p_mask): """ @@ -287,10 +288,14 @@ def forward(self, tokens, token_types, valid_length, p_mask, start_position): Shape (batch_size, sequence_length) answerable_logits """ + backbone_net = self.backbone + if self.quantized_backbone != None: + backbone_net = self.quantized_backbone + if self.use_segmentation: - contextual_embeddings = self.backbone(tokens, token_types, valid_length) + contextual_embeddings = backbone_net(tokens, token_types, valid_length) else: - contextual_embeddings = self.backbone(tokens, valid_length) + contextual_embeddings = backbone_net(tokens, valid_length) start_logits = self.get_start_logits(contextual_embeddings, p_mask) end_logits = self.get_end_logits(contextual_embeddings, np.expand_dims(start_position, axis=1), @@ -337,11 +342,16 @@ def inference(self, tokens, token_types, valid_length, p_mask, The answerable logits. Here 0 --> answerable and 1 --> not answerable. Shape (batch_size, sequence_length, 2) """ + backbone_net = self.backbone + if self.quantized_backbone != None: + backbone_net = self.quantized_backbone + # Shape (batch_size, sequence_length, C) if self.use_segmentation: - contextual_embeddings = self.backbone(tokens, token_types, valid_length) + contextual_embeddings = backbone_net(tokens, token_types, valid_length) else: - contextual_embeddings = self.backbone(tokens, valid_length) + contextual_embeddings = backbone_net(tokens, valid_length) + start_logits = self.get_start_logits(contextual_embeddings, p_mask) # The shape of start_top_index will be (..., start_top_n) start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1, diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py index 521ee15a47..a3400573d4 100644 --- a/scripts/question_answering/run_squad.py +++ b/scripts/question_answering/run_squad.py @@ -120,7 +120,7 @@ def parse_args(): 'this will be truncated to this length. default is 64') parser.add_argument('--pre_shuffle_seed', type=int, default=100, help='Random seed for pre split shuffle') - parser.add_argument('--round_to', type=int, default=None, + parser.add_argument('--round_to', type=int, default=8, help='The length of padded sequences will be rounded up to be multiple' ' of this argument. When round to is set to 8, training throughput ' 'may increase for mixed precision training on GPUs with TensorCores.') @@ -147,9 +147,9 @@ def parse_args(): parser.add_argument('--max_saved_ckpt', type=int, default=5, help='The maximum number of saved checkpoints') parser.add_argument('--dtype', type=str, default='float32', - help='Data type used for evaluation. Either float32 or float16. When you ' + help='Data type used for evaluation. Either float32, float16 or int8. When you ' 'use --dtype float16, amp will be turned on in the training phase and ' - 'fp16 will be used in evaluation.') + 'fp16 will be used in evaluation. For now int8 data type is supported on CPU only.') args = parser.parse_args() return args @@ -195,13 +195,12 @@ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length): self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id # TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality. - # Here, we use round_to=8 to improve the throughput. self.BatchifyFunction = bf.NamedTuple(ChunkFeature, {'qas_id': bf.List(), - 'data': bf.Pad(val=self.pad_id, round_to=8), + 'data': bf.Pad(val=self.pad_id, round_to=args.round_to), 'valid_length': bf.Stack(), - 'segment_ids': bf.Pad(round_to=8), - 'masks': bf.Pad(val=1, round_to=8), + 'segment_ids': bf.Pad(round_to=args.round_to), + 'masks': bf.Pad(val=1, round_to=args.round_to), 'is_impossible': bf.Stack(), 'gt_start': bf.Stack(), 'gt_end': bf.Stack(), @@ -357,7 +356,7 @@ def get_squad_features(args, tokenizer, segment): tokenizer=tokenizer, is_training=is_training), data_examples) logging.info('Done! Time spent:{:.2f} seconds'.format(time.time() - start)) - with open(data_cache_path, 'w') as f: + with open(data_cache_path, 'w', encoding='utf-8') as f: for feature in data_features: f.write(feature.to_json() + '\n') @@ -815,6 +814,83 @@ def predict_extended(original_feature, assert len(nbest_json) >= 1 return not_answerable_score, nbest[0][0], nbest_json +def quantize_and_calibrate(net, dataloader): + class QuantizationDataLoader(mx.gluon.data.DataLoader): + def __init__(self, dataloader, use_segmentation): + self._dataloader = dataloader + self._iter = None + self._use_segmentation = use_segmentation + + def __iter__(self): + self._iter = iter(self._dataloader) + return self + + def __next__(self): + batch = next(self._iter) + if self._use_segmentation: + return [batch.data, batch.segment_ids, batch.valid_length] + else: + return [batch.data, batch.valid_length] + + def __del__(self): + del(self._dataloader) + + class BertLayerCollector(mx.contrib.quantization.CalibrationCollector): + """Saves layer output min and max values in a dict with layer names as keys. + The collected min and max values will be directly used as thresholds for quantization. + """ + def __init__(self, clip_min, clip_max): + super(BertLayerCollector, self).__init__() + self.clip_min = clip_min + self.clip_max = clip_max + + def collect(self, name, op_name, arr): + """Callback function for collecting min and max values from an NDArray.""" + if name not in self.include_layers: + return + arr = arr.copyto(mx.cpu()).asnumpy() + min_range = np.min(arr) + max_range = np.max(arr) + + if (name.find("sg_onednn_fully_connected_eltwise") != -1 or op_name.find("LayerNorm") != -1) \ + and max_range > self.clip_max: + max_range = self.clip_max + elif name.find('sg_onednn_fully_connected') != -1 and min_range < self.clip_min: + min_range = self.clip_min + + if name in self.min_max_dict: + cur_min_max = self.min_max_dict[name] + self.min_max_dict[name] = (min(cur_min_max[0], min_range), + max(cur_min_max[1], max_range)) + else: + self.min_max_dict[name] = (min_range, max_range) + + calib_data = QuantizationDataLoader(dataloader, net.use_segmentation) + model_name = args.model_name + # disable specific layers in some models for the sake of accuracy + + if model_name == 'google_albert_base_v2': + logging.warn(f"Currently quantized {model_name} shows significant accuracy drop which is not fixed yet") + + exclude_layers_map = {"google_electra_large": + ["sg_onednn_fully_connected_eltwise_2", "sg_onednn_fully_connected_eltwise_14", + "sg_onednn_fully_connected_eltwise_18", "sg_onednn_fully_connected_eltwise_22", + "sg_onednn_fully_connected_eltwise_26" + ]} + exclude_layers = None + if model_name in exclude_layers_map.keys(): + exclude_layers = exclude_layers_map[model_name] + net.quantized_backbone = mx.contrib.quant.quantize_net(net.backbone, quantized_dtype='auto', + quantize_mode='smart', + exclude_layers=exclude_layers, + exclude_layers_match=None, + calib_data=calib_data, + calib_mode='custom', + LayerOutputCollector=BertLayerCollector(clip_min=-50, clip_max=10), + num_calib_batches=10, + ctx=mx.cpu()) + return net + def evaluate(args, last=True): store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( @@ -826,11 +902,12 @@ def evaluate(args, last=True): return ctx_l = parse_ctx(args.gpus) logging.info( - 'Srarting inference without horovod on the first node on device {}'.format( + 'Starting inference without horovod on the first node on device {}'.format( str(ctx_l))) + network_dtype = args.dtype if args.dtype != 'int8' else 'float32' cfg, tokenizer, qa_net, use_segmentation = get_network( - args.model_name, ctx_l, args.classifier_dropout, dtype=args.dtype) + args.model_name, ctx_l, args.classifier_dropout, dtype=network_dtype) if args.dtype == 'float16': qa_net.cast('float16') qa_net.hybridize() @@ -860,6 +937,9 @@ def eval_validation(ckpt_name, best_eval): num_workers=0, shuffle=False) + if args.dtype == 'int8': + quantize_and_calibrate(qa_net, dev_dataloader) + log_interval = args.eval_log_interval all_results = [] epoch_tic = time.time() @@ -999,6 +1079,11 @@ def eval_validation(ckpt_name, best_eval): if __name__ == '__main__': os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' args = parse_args() + if args.dtype == 'int8': + ctx_l = parse_ctx(args.gpus) + if ctx_l[0] != mx.cpu() or len(ctx_l) != 1: + raise ValueError("Evaluation on int8 data type is supported only for CPU for now") + if args.do_train: if args.dtype == 'float16': # Initialize amp if it's fp16 training diff --git a/scripts/question_answering/run_squad_albert.py b/scripts/question_answering/run_squad_albert.py new file mode 100644 index 0000000000..cbff26b8f8 --- /dev/null +++ b/scripts/question_answering/run_squad_albert.py @@ -0,0 +1,802 @@ +""" +Question Answering with Pretrained Language Model +""" +# pylint:disable=redefined-outer-name,logging-format-interpolation + +import os +import json +import time +import logging +import argparse +import ast +import functools +import collections +import dataclasses +from dataclasses import dataclass +from multiprocessing import Pool, cpu_count +from neural_compressor.experimental import Quantization + +import mxnet as mx +import numpy as np +#from mxnet import amp +from mxnet.lr_scheduler import PolyScheduler + +import gluonnlp.data.batchify as bf +from models import ModelForQABasic, ModelForQAConditionalV1 +from eval_utils import squad_eval +from squad_utils import SquadFeature, get_squad_examples, convert_squad_example_to_feature, get_squad_examples_from_json +from gluonnlp.models import get_backbone +from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, \ + logging_config, parse_ctx +from gluonnlp.initializer import TruncNorm +from gluonnlp.data.sampler import SplitSampler +from gluonnlp.utils.parameter import grad_global_norm, clip_grad_global_norm, count_parameters,\ + deduplicate_param_dict + +import custom_strategy + +try: + import horovod.mxnet as hvd +except ImportError: + pass + +mx.npx.set_np() + +CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached')) +if not os.path.exists(CACHE_PATH): + os.makedirs(CACHE_PATH, exist_ok=True) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Question Answering example. ' + 'We fine-tune the pretrained model on SQuAD dataset.') + parser.add_argument('--model_name', type=str, default='google_albert_base_v2', + help='Name of the pretrained model.') + parser.add_argument('--do_train', action='store_true', + help='Whether to train the model') + parser.add_argument('--do_eval', action='store_true', default=True, + help='Whether to evaluate the model') + parser.add_argument('--data_dir', type=str, default='squad') + parser.add_argument('--version', default='2.0', choices=['1.1', '2.0'], + help='Version of the SQuAD dataset.') + parser.add_argument('--output_dir', type=str, default='squad_out', + help='The output directory where the model params will be written.' + ' default is squad_out') + # Communication + parser.add_argument('--comm_backend', type=str, default='device', + choices=['horovod', 'dist_sync_device', 'device'], + help='Communication backend.') + parser.add_argument('--gpus', type=str, default='-1', + help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.') + # Training hyperparameters + parser.add_argument('--seed', type=int, default=100, help='Random seed') + parser.add_argument('--log_interval', type=int, default=50, + help='The logging interval for training') + parser.add_argument('--eval_log_interval', type=int, default=10, + help='The logging interval for evaluation') + parser.add_argument('--save_interval', type=int, default=None, + help='the number of steps to save model parameters.' + 'default is every epoch') + parser.add_argument('--epochs', type=float, default=3.0, + help='Number of epochs, default is 3') + parser.add_argument('--num_train_steps', type=int, default=None, + help='The number of training steps. Note that epochs will be ignored ' + 'if training steps are set') + parser.add_argument('--batch_size', type=int, default=8, + help='Batch size. Number of examples per gpu in a minibatch. default is 32') + parser.add_argument('--eval_batch_size', type=int, default=8, + help='Evaluate batch size. Number of examples per gpu in a minibatch for ' + 'evaluation.') + parser.add_argument('--max_grad_norm', type=float, default=1.0, + help='Max gradient norm.') + parser.add_argument('--optimizer', type=str, default='adamw', + help='optimization algorithm. default is adamw') + parser.add_argument('--adam_epsilon', type=float, default=1e-6, + help='epsilon of AdamW optimizer') + parser.add_argument('--adam_betas', default='(0.9, 0.999)', metavar='B', + help='betas for Adam optimizer') + parser.add_argument('--num_accumulated', type=int, default=3, + help='The number of batches for gradients accumulation to ' + 'simulate large batch size.') + parser.add_argument('--lr', type=float, default=2e-5, + help='Initial learning rate. default is 2e-5') + parser.add_argument('--warmup_ratio', type=float, default=0.1, + help='Ratio of warmup steps in the learning rate scheduler.') + parser.add_argument('--warmup_steps', type=int, default=None, + help='warmup steps. Note that either warmup_steps or warmup_ratio is set.') + parser.add_argument('--wd', type=float, default=0.01, help='weight decay') + parser.add_argument('--layerwise_decay', type=float, default=-1, help='Layer-wise lr decay') + parser.add_argument('--untunable_depth', type=float, default=-1, + help='Depth of untunable parameters') + parser.add_argument('--classifier_dropout', type=float, default=0.1, + help='dropout of classifier') + # Data pre/post processing + parser.add_argument('--max_seq_length', type=int, default=512, + help='The maximum total input sequence length after tokenization.' + 'Sequences longer than this will be truncated, and sequences shorter ' + 'than this will be padded. default is 512') + parser.add_argument('--doc_stride', type=int, default=128, + help='When splitting up a long document into chunks, how much stride to ' + 'take between chunks. default is 128') + parser.add_argument('--max_query_length', type=int, default=64, + help='The maximum number of tokens for the query. Questions longer than ' + 'this will be truncated to this length. default is 64') + parser.add_argument('--pre_shuffle_seed', type=int, default=100, + help='Random seed for pre split shuffle') + parser.add_argument('--round_to', type=int, default=None, + help='The length of padded sequences will be rounded up to be multiple' + ' of this argument. When round to is set to 8, training throughput ' + 'may increase for mixed precision training on GPUs with TensorCores.') + parser.add_argument('--overwrite_cache', action='store_true', default=True, + help='Whether to overwrite the feature cache.') + # Evaluation hyperparameters + parser.add_argument('--start_top_n', type=int, default=5, + help='Number of start-position candidates') + parser.add_argument('--end_top_n', type=int, default=5, + help='Number of end-position candidates corresponding ' + 'to a start position') + parser.add_argument('--n_best_size', type=int, default=20, help='Top N results written to file') + parser.add_argument('--max_answer_length', type=int, default=30, + help='The maximum length of an answer that can be generated. This is ' + 'needed because the start and end predictions are not conditioned ' + 'on one another. default is 30') + parser.add_argument('--param_checkpoint', type=str, default='fintune_google_albert_base_v2_squad_2.0/google_albert_base_v2_squad2.0_8163.params',#'google_electra_base_squad2.0_8160.params',#'google_en_uncased_bert_large_squad2.0_8159.params', + help='The parameter checkpoint for evaluating the model') + parser.add_argument('--backbone_path', type=str, default=None, + help='The parameter checkpoint of backbone model') + parser.add_argument('--all_evaluate', action='store_true', + help='Whether to evaluate all intermediate checkpoints ' + 'instead of only last one') + parser.add_argument('--max_saved_ckpt', type=int, default=5, + help='The maximum number of saved checkpoints') + parser.add_argument('--dtype', type=str, default='float32', + help='Data type used for evaluation. Either float32, float16 or int8. When you ' + 'use --dtype float16, amp will be turned on in the training phase and ' + 'fp16 will be used in evaluation. For now int8 data type is supported on CPU only.') + args = parser.parse_args() + return args + + +ChunkFeature = collections.namedtuple('ChunkFeature', + ['qas_id', + 'data', + 'valid_length', + 'segment_ids', + 'masks', + 'is_impossible', + 'gt_start', + 'gt_end', + 'context_offset', + 'chunk_start', + 'chunk_length']) + +class SquadDatasetProcessor: + + def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length): + """ + + Parameters + ---------- + tokenizer + The tokenizer + doc_stride + The stride to chunk the document + max_seq_length + Maximum length of the merged data + max_query_length + Maximum query length + """ + self._tokenizer = tokenizer + self._doc_stride = doc_stride + self._max_seq_length = max_seq_length + self._max_query_length = max_query_length + + vocab = tokenizer.vocab + self.pad_id = vocab.pad_id + # For roberta model, taking sepecial token as [CLS] and as [SEP] + self.cls_id = vocab.bos_id if 'cls_token' not in vocab.special_token_keys else vocab.cls_id + self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id + + # TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality. + # Here, we use round_to=8 to improve the throughput. + self.BatchifyFunction = bf.NamedTuple(ChunkFeature, + {'qas_id': bf.List(), + 'data': bf.Pad(val=self.pad_id, round_to=8), + 'valid_length': bf.Stack(), + 'segment_ids': bf.Pad(round_to=8), + 'masks': bf.Pad(val=1, round_to=8), + 'is_impossible': bf.Stack(), + 'gt_start': bf.Stack(), + 'gt_end': bf.Stack(), + 'context_offset': bf.Stack(), + 'chunk_start': bf.Stack(), + 'chunk_length': bf.Stack()}) + + def process_sample(self, feature: SquadFeature): + """Process the data to the following format. + + Note that we mask all the special tokens except the CLS token. The reason for not masking + the CLS token is that if the question is not answerable, we will set the start and end to + be 0. + + + Merged: Question Context + Segment IDs: 0 0 0 1 1 + Mask: 0 1 1 0 1 + + Here, we need to emphasize that when mask = 1, the data are actually not masked! + + Parameters + ---------- + feature + Tokenized SQuAD feature + + Returns + ------- + ret + Divide the feature into multiple chunks and extract the feature which contains + the following: + - data + The data that concatenates the query and the context + special tokens + - valid_length + The valid_length of the data + - segment_ids + We assign the query part as segment 0 and the context part as segment 1. + - masks + We mask all the special tokens. 1 --> not masked, 0 --> masked. + - is_impossible + Whether the provided context is impossible to answer or not. + - gt_start + The ground-truth start location of the span + - gt_end + The ground-truth end location of the span + - chunk_start + The start of the chunk + - chunk_length + The length of the chunk + """ + ret = [] + truncated_query_ids = feature.query_token_ids[:self._max_query_length] + chunks = feature.get_chunks( + doc_stride=self._doc_stride, + max_chunk_length=self._max_seq_length - len(truncated_query_ids) - 3) + for chunk in chunks: + data = np.array([self.cls_id] + truncated_query_ids + [self.sep_id] + + feature.context_token_ids[chunk.start:(chunk.start + chunk.length)] + + [self.sep_id], dtype=np.int32) + valid_length = len(data) + segment_ids = np.array([0] + [0] * len(truncated_query_ids) + + [0] + [1] * chunk.length + [1], dtype=np.int32) + masks = np.array([0] + [1] * len(truncated_query_ids) + [1] + [0] * chunk.length + [1], + dtype=np.int32) + context_offset = len(truncated_query_ids) + 2 + if chunk.gt_start_pos is None and chunk.gt_end_pos is None: + start_pos = 0 + end_pos = 0 + else: + # Here, we increase the start and end because we put query before context + start_pos = chunk.gt_start_pos + context_offset + end_pos = chunk.gt_end_pos + context_offset + chunk_feature = ChunkFeature(qas_id=feature.qas_id, + data=data, + valid_length=valid_length, + segment_ids=segment_ids, + masks=masks, + is_impossible=chunk.is_impossible, + gt_start=start_pos, + gt_end=end_pos, + context_offset=context_offset, + chunk_start=chunk.start, + chunk_length=chunk.length) + ret.append(chunk_feature) + return ret + + def get_train(self, features, skip_unreliable=True): + """Get the training dataset + + Parameters + ---------- + features + skip_unreliable + Whether to skip the unreliable spans in the training set + + Returns + ------- + train_dataset + num_token_answer_mismatch + num_unreliable + """ + train_dataset = [] + num_token_answer_mismatch = 0 + num_unreliable = 0 + for feature in features: + if feature.token_answer_mismatch: + num_token_answer_mismatch += 1 + if feature.unreliable_span: + num_unreliable += 1 + if skip_unreliable and feature.unreliable_span: + # Skip when not reliable + continue + # Process the feature + chunk_features = self.process_sample(feature) + train_dataset.extend(chunk_features) + return train_dataset, num_token_answer_mismatch, num_unreliable + + +def get_squad_features(args, tokenizer, segment): + """ + Get processed data features of SQuADExampls + + Parameters + ---------- + args : argparse.Namespace + tokenizer: + Tokenizer instance + segment: str + train or dev + + Returns + ------- + data_features + The list of processed data features + """ + data_cache_path = os.path.join(CACHE_PATH, + '{}_{}_squad_{}.ndjson'.format( + segment, args.model_name, args.version)) + is_training = (segment == 'train') + if os.path.exists(data_cache_path) and not args.overwrite_cache: + data_features = [] + with open(data_cache_path, 'r') as f: + for line in f: + data_features.append(SquadFeature.from_json(line)) + logging.info('Found cached data features, load from {}'.format(data_cache_path)) + else: + data_examples = get_squad_examples(args.data_dir, segment=segment, version=args.version) + start = time.time() + num_process = min(cpu_count(), 8) + logging.info('Tokenize Data:') + with Pool(num_process) as pool: + data_features = pool.map(functools.partial(convert_squad_example_to_feature, + tokenizer=tokenizer, + is_training=is_training), data_examples) + logging.info('Done! Time spent:{:.2f} seconds'.format(time.time() - start)) + with open(data_cache_path, 'w') as f: + for feature in data_features: + f.write(feature.to_json() + '\n') + + return data_features + + +def get_network(model_name, + ctx_l, + dropout=0.1, + checkpoint_path=None, + backbone_path=None, + dtype='float32'): + """ + Get the network that fine-tune the Question Answering Task + + Parameters + ---------- + model_name : str + The model name of the backbone model + ctx_l : + Context list of training device like [mx.gpu(0), mx.gpu(1)] + dropout : float + Dropout probability of the task specified layer + checkpoint_path: str + Path to a Fine-tuned checkpoint + backbone_path: str + Path to the backbone model to be loaded in qa_net + + Returns + ------- + cfg + tokenizer + qa_net + use_segmentation + """ + # Create the network + use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name + Model, cfg, tokenizer, download_params_path, _ = \ + get_backbone(model_name, load_backbone=not backbone_path) + backbone = Model.from_cfg(cfg, use_pooler=False, dtype=dtype) + # Load local backbone parameters if backbone_path provided. + # Otherwise, download backbone parameters from gluon zoo. + + backbone_params_path = backbone_path if backbone_path else download_params_path + if checkpoint_path is None: + backbone.load_parameters(backbone_params_path, ignore_extra=True, + ctx=ctx_l, cast_dtype=True) + num_params, num_fixed_params\ + = count_parameters(deduplicate_param_dict(backbone.collect_params())) + logging.info( + 'Loading Backbone Model from {}, with total/fixd parameters={}/{}'.format( + backbone_params_path, num_params, num_fixed_params)) + qa_net = ModelForQAConditionalV1(backbone=backbone, + dropout_prob=dropout, + use_segmentation=use_segmentation, + weight_initializer=TruncNorm(stdev=0.02)) + if checkpoint_path is None: + # Ignore the UserWarning during initialization, + # There is no need to re-initialize the parameters of backbone + qa_net.initialize(ctx=ctx_l) + else: + qa_net.load_parameters(checkpoint_path, ctx=ctx_l, cast_dtype=True) + qa_net.hybridize() + + return cfg, tokenizer, qa_net, use_segmentation + + +def setup_logging(args, local_rank): + """ + Setup logging configuration as well as random seed + """ + logging_config(args.output_dir, + name='finetune_squad{}'.format(args.version), # avoid race + overwrite_handler=True, + console=(local_rank == 0)) + logging.info(args) + set_seed(args.seed) + logging.debug('Random seed set to {}'.format(args.seed)) + + +RawResultExtended = collections.namedtuple( + 'RawResultExtended', + ['qas_id', + 'start_top_logits', + 'start_top_index', + 'end_top_logits', + 'end_top_index', + 'answerable_logits']) + + +def predict_extended(original_feature, + chunked_features, + results, + n_best_size, + max_answer_length=64, + start_top_n=5, + end_top_n=5): + """Get prediction results for SQuAD. + + Start Logits: (B, N_start) + End Logits: (B, N_start, N_end) + + Parameters + ---------- + original_feature: + The original SquadFeature before chunked + chunked_features + List of ChunkFeatures + results + List of model predictions for span start and span end. + n_best_size + Best N results written to file + max_answer_length + Maximum length of the answer tokens. + start_top_n + Number of start-position candidates + end_top_n + Number of end-position candidates + + Returns + ------- + not_answerable_score + Model's estimate that the question is not answerable. + prediction + The final prediction. + nbest_json + n-best predictions with their probabilities. + """ + not_answerable_score = 1000000 # Score for not-answerable. We set it to be a large and positive + # If one chunk votes for answerable, we will treat the context as answerable, + # Thus, the overall not_answerable_score = min(chunk_not_answerable_score) + all_start_idx = [] + all_end_idx = [] + all_pred_score = [] + context_length = len(original_feature.context_token_ids) + token_max_context_score = np.full((len(chunked_features), context_length), + -np.inf, + dtype=np.float32) + for i, chunked_feature in enumerate(chunked_features): + chunk_start = chunked_feature.chunk_start + chunk_length = chunked_feature.chunk_length + for j in range(chunk_start, chunk_start + chunk_length): + # This is a heuristic score + # TODO investigate the impact + token_max_context_score[i, j] = min(j - chunk_start, + chunk_start + chunk_length - 1 - j) \ + + 0.01 * chunk_length + token_max_chunk_id = token_max_context_score.argmax(axis=0) + + for chunk_id, (result, chunk_feature) in enumerate(zip(results, chunked_features)): + # We use the log-likelihood as the not answerable score. + # Thus, a high score indicates that the answer is not answerable + cur_not_answerable_score = float(result.answerable_logits[1]) + not_answerable_score = min(not_answerable_score, cur_not_answerable_score) + # Calculate the start_logits + end_logits as the overall score + context_offset = chunk_feature.context_offset + chunk_start = chunk_feature.chunk_start + chunk_length = chunk_feature.chunk_length + for i in range(start_top_n): + for j in range(end_top_n): + pred_score = result.start_top_logits[i] + result.end_top_logits[i, j] + start_index = result.start_top_index[i] + end_index = result.end_top_index[i, j] + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the answer span is in the query tokens or out of + # the chunk. We throw out all invalid predictions. + if not (context_offset <= start_index < context_offset + chunk_length) or \ + not (context_offset <= end_index < context_offset + chunk_length) or \ + end_index < start_index: + continue + pred_answer_length = end_index - start_index + 1 + if pred_answer_length > max_answer_length: + continue + start_idx = int(start_index - context_offset + chunk_start) + end_idx = int(end_index - context_offset + chunk_start) + if token_max_chunk_id[start_idx] != chunk_id: + continue + all_start_idx.append(start_idx) + all_end_idx.append(end_idx) + all_pred_score.append(pred_score) + sorted_start_end_score = sorted(zip(all_start_idx, all_end_idx, all_pred_score), + key=lambda args: args[-1], reverse=True) + nbest = [] + context_text = original_feature.context_text + context_token_offsets = original_feature.context_token_offsets + seen_predictions = set() + for start_idx, end_idx, pred_score in sorted_start_end_score: + if len(seen_predictions) >= n_best_size: + break + pred_answer = context_text[context_token_offsets[start_idx][0]: + context_token_offsets[end_idx][1]] + seen_predictions.add(pred_answer) + nbest.append((pred_answer, pred_score)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if len(nbest) == 0: + nbest.append(('', float('-inf'))) + all_scores = np.array([ele[1] for ele in nbest], dtype=np.float32) + probs = np.exp(all_scores) / np.sum(np.exp(all_scores)) + nbest_json = [] + for i, (entry, prob) in enumerate(zip(nbest, probs)): + output = collections.OrderedDict() + output['text'] = entry[0] + output['probability'] = float(prob) + nbest_json.append(output) + + assert len(nbest_json) >= 1 + return not_answerable_score, nbest[0][0], nbest_json + + +def evaluate(args, last=True): + ctx_l = parse_ctx(args.gpus) + setup_logging(args, 0) + + cfg, tokenizer, qa_net, use_segmentation = get_network( + args.model_name, ctx_l, args.classifier_dropout, dtype='float32') + + dev_data_path = os.path.join(args.data_dir, 'dev-v{}.json'.format(args.version)) + with open(dev_data_path, 'r') as f: + dev_dataset = json.load(f)['data'] + expl = get_squad_examples_from_json(dev_data_path, is_training=False) + import random + random.shuffle(expl) + expl = expl[:512+256] + eval_qa_ids = {qa.qas_id for qa in expl} + for article in dev_dataset: + for paragraph in article['paragraphs']: + new_qas = [] + for qa in paragraph['qas']: + if qa['id'] in eval_qa_ids: + new_qas.append(qa) + paragraph['qas'] = new_qas + + num_process = min(cpu_count(), 8) + with Pool(num_process) as pool: + dev_features = pool.map(functools.partial(convert_squad_example_to_feature, + tokenizer=tokenizer, + is_training=False), expl) + dataset_processor = SquadDatasetProcessor(tokenizer=tokenizer, + doc_stride=args.doc_stride, + max_seq_length=args.max_seq_length, + max_query_length=args.max_query_length) + dev_all_chunk_features = [] + dev_chunk_feature_ptr = [0] + for feature in dev_features: + chunk_features = dataset_processor.process_sample(feature) + dev_all_chunk_features.extend(chunk_features) + dev_chunk_feature_ptr.append(dev_chunk_feature_ptr[-1] + len(chunk_features)) + + class QuantizationDataLoader(mx.gluon.data.DataLoader): + def __init__(self, dataloader, use_segmentation): + self._dataloader = dataloader + self._iter = None + self._use_segmentation = use_segmentation + self.batch_size = args.eval_batch_size + self._batch_sampler = dataloader._batch_sampler + + def __iter__(self): + self._iter = iter(self._dataloader) + return self + + def __next__(self): + batch = next(self._iter) + if self._use_segmentation: + return [batch.data, batch.segment_ids, batch.valid_length] + else: + return [batch.data, batch.valid_length] + + def __del__(self): + del(self._dataloader) + + dev_dataloader = QuantizationDataLoader(mx.gluon.data.DataLoader( + dev_all_chunk_features, + batchify_fn=dataset_processor.BatchifyFunction, + batch_size=args.eval_batch_size, + num_workers=0, + shuffle=False), qa_net.use_segmentation) + + def quantize_and_calibrate(net): + from neural_compressor.experimental import Quantization, common + quantizer = Quantization('albert_custom.yaml') + quantizer.model = common.Model(net.backbone) + quantizer.calib_dataloader = dev_dataloader + quantizer.eval_func = eval_validation + net.quantized_backbone = quantizer().model + net.quantized_backbone.export("best_conf_inc_model") + return net + + def eval_validation(backbone): + """ + Model inference during validation or final evaluation. + """ + del qa_net.quantized_backbone + qa_net.quantized_backbone = backbone + + dev_dataloader = mx.gluon.data.DataLoader( + dev_all_chunk_features, + batchify_fn=dataset_processor.BatchifyFunction, + batch_size=args.eval_batch_size, + num_workers=0, + shuffle=False) + + log_interval = args.eval_log_interval + all_results = [] + epoch_tic = time.time() + tic = time.time() + epoch_size = len(dev_features) + total_num = 0 + log_num = 0 + best_eval = {} + for batch_idx, dev_batch in enumerate(grouper(dev_dataloader, len(ctx_l))): + # Predict for each chunk + for sample, ctx in zip(dev_batch, ctx_l): + if sample is None: + continue + # Copy the data to device + tokens = sample.data.as_in_ctx(ctx) + total_num += len(tokens) + log_num += len(tokens) + segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None + valid_length = sample.valid_length.as_in_ctx(ctx) + p_mask = sample.masks.as_in_ctx(ctx) + p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask + start_top_logits, start_top_index, end_top_logits, end_top_index, answerable_logits \ + = qa_net.inference(tokens, segment_ids, valid_length, p_mask, + args.start_top_n, args.end_top_n) + for i, qas_id in enumerate(sample.qas_id): + result = RawResultExtended(qas_id=qas_id, + start_top_logits=start_top_logits[i].asnumpy(), + start_top_index=start_top_index[i].asnumpy(), + end_top_logits=end_top_logits[i].asnumpy(), + end_top_index=end_top_index[i].asnumpy(), + answerable_logits=answerable_logits[i].asnumpy()) + + all_results.append(result) + + # logging + if (batch_idx + 1) % log_interval == 0: + # Output the loss of per step + toc = time.time() + logging.info( + '[batch {}], Time cost={:.2f},' + ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format( + batch_idx + 1, toc - tic, log_num / (toc - tic), + (epoch_size - total_num) / (total_num / (toc - epoch_tic)) / 3600)) + tic = time.time() + log_num = 0 + + epoch_toc = time.time() + logging.info('Time cost=%2f s, Thoughput=%.2f samples/s', epoch_toc - epoch_tic, + total_num / (epoch_toc - epoch_tic)) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + no_answer_score_json = collections.OrderedDict() + for index, (left_index, right_index) in enumerate(zip(dev_chunk_feature_ptr[:-1], + dev_chunk_feature_ptr[1:])): + chunked_features = dev_all_chunk_features[left_index:right_index] + results = all_results[left_index:right_index] + original_feature = dev_features[index] + qas_ids = set([result.qas_id for result in results] + + [feature.qas_id for feature in chunked_features]) + assert len(qas_ids) == 1, 'Mismatch Occured between features and results' + example_qas_id = list(qas_ids)[0] + assert example_qas_id == original_feature.qas_id, \ + 'Mismatch Occured between original feature and chunked features' + not_answerable_score, best_pred, nbest_json = predict_extended( + original_feature=original_feature, + chunked_features=chunked_features, + results=results, + n_best_size=args.n_best_size, + max_answer_length=args.max_answer_length, + start_top_n=args.start_top_n, + end_top_n=args.end_top_n) + no_answer_score_json[example_qas_id] = not_answerable_score + all_predictions[example_qas_id] = best_pred + all_nbest_json[example_qas_id] = nbest_json + + if args.version == '2.0': + exact = 'best_exact' + f1 = 'best_f1' + na_prob = no_answer_score_json + else: + exact = 'exact' + f1 = 'f1' + na_prob = None + + cur_eval, revised_predictions = squad_eval( + dev_dataset, all_predictions, na_prob, revise=na_prob is not None) + logging.info('The evaluated results are {}'.format(json.dumps(cur_eval))) + + cur_metrics = 0.5 * (cur_eval[exact] + cur_eval[f1]) + if best_eval: + best_metrics = 0.5 * (best_eval[exact] + best_eval[f1]) + else: + best_metrics = 0. + + if cur_metrics > best_metrics: + logging.info('The evaluated files are saved in {}'.format(args.output_dir)) + output_prediction_file = os.path.join(args.output_dir, 'predictions.json') + output_nbest_file = os.path.join(args.output_dir, 'nbest_predictions.json') + na_prob_file = os.path.join(args.output_dir, 'na_prob.json') + revised_prediction_file = os.path.join(args.output_dir, 'revised_predictions.json') + + with open(output_prediction_file, 'w') as of: + of.write(json.dumps(all_predictions, indent=4) + '\n') + with open(output_nbest_file, 'w') as of: + of.write(json.dumps(all_nbest_json, indent=4) + '\n') + with open(na_prob_file, 'w') as of: + of.write(json.dumps(no_answer_score_json, indent=4) + '\n') + with open(revised_prediction_file, 'w') as of: + of.write(json.dumps(revised_predictions, indent=4) + '\n') + + best_eval = cur_eval + best_eval.update({'best_ckpt': 'mybest'}) + return best_eval['best_f1']/100 + + qa_net.load_parameters(args.param_checkpoint, ctx=ctx_l, cast_dtype=True) + data_example = next(iter(dev_dataloader)) + + qa_net.backbone.optimize_for(*data_example, backend='ONEDNN') + quantized_backbone = quantize_and_calibrate(qa_net) + best_eval = eval_validation(quantized_backbone) + + logging.info('The best evaluated results are {}'.format(json.dumps(best_eval))) + output_eval_results_file = os.path.join(args.output_dir, 'best_results.json') + with open(output_eval_results_file, 'w') as of: + of.write(json.dumps(best_eval, indent=4) + '\n') + return best_eval + + +if __name__ == '__main__': + os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' + args = parse_args() + evaluate(args, last=not args.all_evaluate) + + diff --git a/setup.py b/setup.py index baf44e6110..7b27865c84 100644 --- a/setup.py +++ b/setup.py @@ -133,6 +133,7 @@ def find_version(*file_paths): 'pylint_quotes', 'flake8', 'recommonmark', + 'sphinx>=1.5.5', 'sphinx-gallery', 'sphinx_rtd_theme', 'mxtheme',