diff --git a/Write_MET_binned_histogram.py b/Write_MET_binned_histogram.py index d152e838..0f963a94 100644 --- a/Write_MET_binned_histogram.py +++ b/Write_MET_binned_histogram.py @@ -332,8 +332,8 @@ def MET_rel_error_bad(predict_met, gen_met, name='Met_res.pdf'): # for i in range(rel_err.shape[0]): # std += (mean - rel_err[i]) **2 - #std = std/rel_err.shape[0] - #std = math.sqrt(std) + # std = std/rel_err.shape[0] + # std = math.sqrt(std) mean = mean * 1000 mean = int(mean) @@ -467,8 +467,8 @@ def Phi_abs_error(predict_met, gen_met, name='Met_res.pdf'): def Pt_abs_error_opaque(puppi_met, ml_met, gen_met, name='Met_res.pdf'): puppi_err = (puppi_met - gen_met) ml_err = (ml_met - gen_met) - #minErr = min(np.array([rel_err, rel_err2]).flatten()) - #maxErr = max(np.array([rel_err, rel_err2]).flatten()) + # minErr = min(np.array([rel_err, rel_err2]).flatten()) + # maxErr = max(np.array([rel_err, rel_err2]).flatten()) plt.figure() plt.hist(puppi_err, bins=np.linspace(-250, 250, 50+1), alpha=0.5, label='puppi') plt.hist(ml_err, bins=np.linspace(-250, 250, 50+1), alpha=0.5, label='ML') @@ -573,7 +573,7 @@ def MET_binned_predict_mean(predict_met, gen_met, binning, mini, maxi, genMET_cu plt.xlim(mini, maxi) plt.ylim(mini, 700) plt.xlabel('Gen MET mean [GeV]', fontsize=16) - #plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) + # plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) plt.ylabel('predicted MET mean [GeV]', fontsize=16) plt.legend() plt.savefig(name) @@ -621,7 +621,7 @@ def MET_binned_predict_mean_opaque(predict_met, predict_met2, gen_met, binning, plt.xlim(mini, maxi) plt.ylim(mini, maxi) plt.xlabel('Gen MET mean [GeV]', fontsize=16) - #plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) + # plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) plt.ylabel('predicted MET mean [GeV]', fontsize=16) plt.legend() plt.savefig(name) @@ -673,9 +673,9 @@ def extract_result(feat_array, targ_array, path, name, mode): def histo_2D(predict_pT, gen_pT, min_, max_, name='2D_histo.png'): X_hist = np.arange(0, 500, 20) Y_hist = X_hist # 1.25*X_hist - #Y_hist_1 = 0.75*X_hist + # Y_hist_1 = 0.75*X_hist plt.plot(X_hist, Y_hist, '-r') - #plt.plot(X_hist, Y_hist_1, '-r') + # plt.plot(X_hist, Y_hist_1, '-r') x_bins = np.linspace(min_, max_, 50) y_bins = np.linspace(min_, max_, 50) plt.hist2d(gen_pT, predict_pT, bins=[x_bins, y_bins], cmap=plt.cm.jet) diff --git a/convertNanoToHDF5_L1triggerToDeepMET.py b/convertNanoToHDF5_L1triggerToDeepMET.py index 33a6a21f..7bb9cdc7 100644 --- a/convertNanoToHDF5_L1triggerToDeepMET.py +++ b/convertNanoToHDF5_L1triggerToDeepMET.py @@ -6,7 +6,7 @@ import numpy as np import awkward as ak import h5py -#import progressbar +# import progressbar from tqdm import tqdm import os diff --git a/convert_full_model.py b/convert_full_model.py index f54971a2..ef42d3de 100644 --- a/convert_full_model.py +++ b/convert_full_model.py @@ -1,3 +1,4 @@ +import argparse import tensorflow from models import dense_embedding from tensorflow.keras.layers import Input, Concatenate @@ -10,7 +11,11 @@ from utils import preProcessing import h5py import scipy +import seaborn +import pandas as pd +import matplotlib.pyplot as plt +# TODO: what does this do? 
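+# (answer, as far as I can tell: _add_supported_quantized_objects comes from qkeras.utils and populates co with the QKeras custom layers and quantizers (e.g. QDense, QActivation, quantized_bits) so that load_model(..., custom_objects=co) can deserialize the quantized models)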
co = {} _add_supported_quantized_objects(co) @@ -26,140 +31,239 @@ def print_dict(d, indent=0): print(':' + ' ' * (20 - len(key) - 2 * indent) + str(value)) -# load full model: -model_name = 'trained_DeepMET' -# model_name = 'trained_quantized_DeepMET' -# model_name = 'trained_quantized_DeepMET_normfac1000' -model = tensorflow.keras.models.load_model(f'models/baseline_DeepMET{"_quantized" if "quantized" in model_name else ""}/{model_name}.h5', compile=False, custom_objects=co) - -reuse_factor = 1 -precision = 'ap_fixed<32,16>' -io_type = 'io_parallel' -strategy = 'Latency' -output_dir = 'hls_output_{}_{}_{}_rf{}_{}'.format(model_name ,io_type, strategy, reuse_factor, precision) -batch_size = 1 -synth = False -trace = True -normFac = 1 - -# check everthing works -model.summary() -model.save('{}/model.h5'.format(output_dir)) - -config = hls4ml.utils.config_from_keras_model(model, - granularity='name', - default_reuse_factor=reuse_factor, - default_precision=precision) -config['Model']['Strategy'] = strategy -for name in config['LayerName'].keys(): - config['LayerName'][name]['Trace'] = trace -config['LayerName']['input_cat0']['Precision']['result'] = 'ap_uint<4>' -config['LayerName']['input_cat1']['Precision']['result'] = 'ap_uint<4>' -# config['LayerName']['input_cont']['Precision']['result'] = 'ap_fixed<20,10>' -#if 'q_dense' in config['LayerName']: -# config['LayerName']['q_dense']['Precision']['accum'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense']['Precision']['weight'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense']['Precision']['bias'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense_1']['Precision']['accum'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense_1']['Precision']['weight'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense_1']['Precision']['bias'] = 'ap_fixed<32,16>' -config['LayerName']['multiply']['n_elem'] = 100 -config['LayerName']['output']['n_filt'] = 2 -# skip optimize_pointwise_conv -# config['SkipOptimizers'] = ['optimize_pointwise_conv'] -# for layer in config['LayerName'].keys(): -# config['LayerName'][layer]['Trace'] = True - -print("-----------------------------------") -print_dict(config) -print("-----------------------------------") -hls_model = hls4ml.converters.convert_from_keras_model(model, - hls_config=config, - io_type=io_type, - output_dir=output_dir, - part='xcvu13p-flga2577-2-e', - clock_period=5, - project_name='L1METML_v1', -) -hls_model.compile() - -hls4ml.utils.plot_model(hls_model, show_shapes=True, show_precision=True, to_file='{}/model_hls4ml.png'.format(output_dir)) - -if synth: - hls_model.build(synth=synth) - hls4ml.report.read_vivado_report(output_dir) - -f = h5py.File('data/test_data.h5') -# 1000 test events is good enough -X = f['X'][:1000] -y = -f['Y'][:1000] - -# preprocessing -X_pre = list(preProcessing(X, normFac=normFac)) -X_pre = [np.ascontiguousarray(x) for x in X_pre] - -y_pred = model.predict(X_pre) -y_hls = hls_model.predict(X_pre) - -met = np.hypot(y[:, 0], y[:, 1]) -met_pred = np.hypot(y_pred[:, 0], y_pred[:, 1]) * normFac -met_hls = np.hypot(y_hls[:, 0], y_hls[:, 1]) * normFac -met_pup_x = np.sum(X[:, :, 1], axis=-1) -met_pup_y = np.sum(X[:, :, 2], axis=-1) -met_pup = np.hypot(met_pup_x, met_pup_y) +def load_model(model_name): + if 'quantized' in model_name: + model = tensorflow.keras.models.load_model(f'models/baseline_DeepMET_quantized/{model_name}.h5', compile=False, custom_objects=co) + elif 'test' in model_name: + model = tensorflow.keras.models.load_model('test_12_36/model.h5', compile=False, 
custom_objects=co) + else: + model = tensorflow.keras.models.load_model(f'models/baseline_DeepMET/{model_name}.h5', compile=False) + return model -import seaborn -import pandas as pd -import matplotlib.pyplot as plt -df = pd.DataFrame.from_dict({'Gen MET': met, 'PUPPI MET': met_pup, 'QKeras MET': met_pred, 'hls4ml MET': met_hls}) -plt.figure() -seaborn.pairplot(df, corner=True) -plt.savefig(f'{output_dir}/profiling_MET.png', dpi=300) - -df = pd.DataFrame.from_dict({'Gen MET x': y[:, 0], 'PUPPI MET x': met_pup_x, 'QKeras MET x': y_pred[:, 0], 'hls4ml MET x': y_hls[:, 0]}) -plt.figure() -seaborn.pairplot(df, corner=True) -plt.savefig(f'{output_dir}/profiling_MET_x.png', dpi=300) - -df = pd.DataFrame.from_dict({'Gen MET y': y[:, 1], 'PUPPI MET y': met_pup_y, 'QKeras MET y': y_pred[:, 1], 'hls4ml MET y': y_hls[:, 1]}) -plt.figure() -seaborn.pairplot(df, corner=True) -plt.savefig(f'{output_dir}/profiling_MET_y.png', dpi=300) - -response_pup = met_pup / met -response_pred = met_pred / met -response_hls = met_hls / met -bins = np.linspace(0, 2, 25) -plt.figure(figsize=(12, 5)) -plt.subplot(1, 3, 1) -plt.hist(response_pup, bins=bins, label=f'PUPPI, median={np.median(response_pup):0.2f}, IQR={scipy.stats.iqr(response_pup):0.2f}') -plt.legend() -plt.xlabel("MET response $\hat{y}/y$") -plt.ylabel("Events") -plt.subplot(1, 3, 2) -plt.hist(response_pred, bins=bins, label=f'QKeras, median={np.median(response_pred):0.2f}, IQR={scipy.stats.iqr(response_pred):0.2f}') -plt.legend() -plt.xlabel("MET response $\hat{y}/y$") -plt.ylabel("Events") -plt.subplot(1, 3, 3) -plt.hist(response_hls, bins=bins, label=f'hls4ml, median={np.median(response_hls):0.2f}, IQR={scipy.stats.iqr(response_hls):0.2f}') -plt.legend() -plt.xlabel("MET response $\hat{y}/y$") -plt.ylabel("Events") -plt.tight_layout() -plt.savefig(f"{output_dir}/response_MET.png", dpi=300) - -y_hls, hls4ml_trace = hls_model.trace(X_pre) -keras_trace = hls4ml.model.profiling.get_ymodel_keras(model, X_pre) - -for layer in hls4ml_trace.keys(): +def configure_hls_model(model, config_params): + config = hls4ml.utils.config_from_keras_model( + model, + granularity='name', + default_reuse_factor=config_params['reuse-factor'], + default_precision=config_params['precision']) + config['Model']['Strategy'] = config_params['strategy'] + for name in config['LayerName'].keys(): + config['LayerName'][name]['Trace'] = config_params['trace'] + config['LayerName']['input_cat0']['Precision']['result'] = 'ap_uint<4>' + config['LayerName']['input_cat1']['Precision']['result'] = 'ap_uint<4>' + # config['LayerName']['input_cont']['Precision']['result'] = 'ap_fixed<20,10>' + # if 'q_dense' in config['LayerName']: + # config['LayerName']['q_dense']['Precision']['accum'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense']['Precision']['weight'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense']['Precision']['bias'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense_1']['Precision']['accum'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense_1']['Precision']['weight'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense_1']['Precision']['bias'] = 'ap_fixed<32,16>' + config['LayerName']['multiply']['n_elem'] = 100 + config['LayerName']['output']['n_filt'] = 2 + # skip optimize_pointwise_conv + # config['SkipOptimizers'] = ['optimize_pointwise_conv'] + # for layer in config['LayerName'].keys(): + # config['LayerName'][layer]['Trace'] = True + + print("-----------------------------------") + print_dict(config) + return config + + +def convert_to_hls_model(model, config, 
output_dir, io_type, part, clock_period, project_name): + print("-----------------------------------") + hls_model = hls4ml.converters.convert_from_keras_model(model, + hls_config=config, + io_type=io_type, + output_dir=output_dir, + part=part, + clock_period=clock_period, + project_name=project_name, + ) + hls_model.compile() + return hls_model + + +def preprocess_data(file_path, norm_factor): + with h5py.File(file_path, 'r') as f: + # 1000 test events is good enough + X = f['X'][:1000] + y = -f['Y'][:1000] + X_preprocessed = list(preProcessing(X, normFac=norm_factor)) + return [np.ascontiguousarray(x) for x in X_preprocessed], X, y + + +def plot_metrics(data_to_plot, hls_model, model, output_dir): + met = data_to_plot['met'] + met_pred = data_to_plot['met_pred'] + met_hls = data_to_plot['met_hls'] + met_pup = data_to_plot['met_pup'] + met_pup_x = data_to_plot['met_pup_x'] + met_pup_y = data_to_plot['met_pup_y'] + y_pred = data_to_plot['y_pred'] + y_hls = data_to_plot['y_hls'] + y = data_to_plot['y'] + X_pre = data_to_plot['x_pre'] + + df = pd.DataFrame.from_dict({ + 'Gen MET': met, + 'PUPPI MET': met_pup, + 'QKeras MET': met_pred, + 'hls4ml MET': met_hls, + }) plt.figure() - if layer not in keras_trace: continue - plt.scatter(hls4ml_trace[layer].flatten(), keras_trace[layer].flatten(), s=0.2) - min_x = min(np.amin(hls4ml_trace[layer]), np.amin(keras_trace[layer])) - max_x = max(np.amax(hls4ml_trace[layer]), np.amax(keras_trace[layer])) - plt.plot([min_x, max_x], [min_x, max_x], c='gray') - plt.xlabel(f'hls4ml {layer}') - plt.ylabel(f'QKeras {layer}') - plt.savefig(f'{output_dir}/profiling_{layer}.png', dpi=300) + seaborn.pairplot(df, corner=True) + plt.savefig(f'{output_dir}/profiling_MET.png', dpi=300) + plt.close() + + df = pd.DataFrame.from_dict( + {'Gen MET x': y[:, 0], + 'PUPPI MET x': met_pup_x, + 'QKeras MET x': y_pred[:, 0], + 'hls4ml MET x': y_hls[:, 0], + }) + plt.figure() + seaborn.pairplot(df, corner=True) + plt.savefig(f'{output_dir}/profiling_MET_x.png', dpi=300) + + df = pd.DataFrame.from_dict({ + 'Gen MET y': y[:, 1], + 'PUPPI MET y': met_pup_y, + 'QKeras MET y': y_pred[:, 1], + 'hls4ml MET y': y_hls[:, 1] + }) + plt.figure() + seaborn.pairplot(df, corner=True) + plt.savefig(f'{output_dir}/profiling_MET_y.png', dpi=300) + + response_pup = met_pup / met + response_pred = met_pred / met + response_hls = met_hls / met + bins = np.linspace(0, 2, 25) + plt.figure(figsize=(12, 5)) + plt.subplot(1, 3, 1) + plt.hist(response_pup, bins=bins, label=f'PUPPI, median={np.median(response_pup):0.2f}, IQR={scipy.stats.iqr(response_pup):0.2f}') + plt.legend() + plt.xlabel("MET response $\\hat{y}/y$") + plt.ylabel("Events") + plt.subplot(1, 3, 2) + plt.hist(response_pred, bins=bins, label=f'QKeras, median={np.median(response_pred):0.2f}, IQR={scipy.stats.iqr(response_pred):0.2f}') + plt.legend() + plt.xlabel("MET response $\\hat{y}/y$") + plt.ylabel("Events") + plt.subplot(1, 3, 3) + plt.hist(response_hls, bins=bins, label=f'hls4ml, median={np.median(response_hls):0.2f}, IQR={scipy.stats.iqr(response_hls):0.2f}') + plt.legend() + plt.xlabel("MET response $\\hat{y}/y$") + plt.ylabel("Events") + plt.tight_layout() + plt.savefig(f"{output_dir}/response_MET.png", dpi=300) + + y_hls, hls4ml_trace = hls_model.trace(X_pre) + keras_trace = hls4ml.model.profiling.get_ymodel_keras(model, X_pre) + + for layer in hls4ml_trace.keys(): + plt.figure() + if layer not in keras_trace: + continue + plt.scatter(hls4ml_trace[layer].flatten(), keras_trace[layer].flatten(), s=0.2) + min_x = 
min(np.amin(hls4ml_trace[layer]), np.amin(keras_trace[layer])) + max_x = max(np.amax(hls4ml_trace[layer]), np.amax(keras_trace[layer])) + plt.plot([min_x, max_x], [min_x, max_x], c='gray') + plt.xlabel(f'hls4ml {layer}') + plt.ylabel(f'QKeras {layer}') + plt.savefig(f'{output_dir}/profiling_{layer}.png', dpi=300) + + +def main(args): + model_name = args.model_name + + model = load_model(model_name) + + config_params = { + 'reuse-factor': 1, + 'strategy': 'Latency', + 'precision': 'ap_fixed<32,16>', + 'trace': True, + } + io_type = 'io_parallel' + output_dir = 'hls_output_{}_{}_{}_rf{}_{}'.format( + model_name, + io_type, + config_params['strategy'], + config_params['reuse-factor'], + config_params['precision'] + ) + batch_size = 1 + synth = False + trace = True + normFac = 1 # identify where NormFac is used (and how) and if it can be fed via argument + + # check everything works + model.summary() + model.save('{}/model.h5'.format(output_dir)) + + # create hls model + config = configure_hls_model(model, config_params) + hls_model = convert_to_hls_model(model, config, output_dir, io_type, 'xcvu13p-flga2577-2-e', 5, 'L1METML_v1') + + hls4ml.utils.plot_model(hls_model, show_shapes=True, show_precision=True, to_file='{}/model_hls4ml.png'.format(output_dir)) + + if synth: + hls_model.build(synth=synth) + hls4ml.report.read_vivado_report(output_dir) + + # load and preprocess data + X_pre, X, y = preprocess_data(args.data_path, norm_factor=normFac) + + y_pred = model.predict(X_pre) + y_hls = hls_model.predict(X_pre) + + met = np.hypot(y[:, 0], y[:, 1]) + met_pred = np.hypot(y_pred[:, 0], y_pred[:, 1]) * normFac + met_hls = np.hypot(y_hls[:, 0], y_hls[:, 1]) * normFac + met_pup_x = np.sum(X[:, :, 1], axis=-1) # does this need to be X_pre? previously X + met_pup_y = np.sum(X[:, :, 2], axis=-1) # does this need to be X_pre? 
previously X + met_pup = np.hypot(met_pup_x, met_pup_y) + + data_to_plot = { + 'met': met, + 'met_pred': met_pred, + 'met_hls': met_hls, + 'met_pup': met_pup, + 'met_pup_x': met_pup_x, + 'met_pup_y': met_pup_y, + 'y_pred': y_pred, + 'y_hls': y_hls, + 'y': y, + 'x_pre': X_pre, + } + + plot_metrics(data_to_plot, hls_model, model, output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + '--model-name', + type=str, + default='trained_DeepMET', + choices=[ + 'trained_DeepMET', + 'trained_quantized_DeepMET', + 'trained_quantized_DeepMET_normfac1000', + 'test_12_36'], + help='Model name') + parser.add_argument( + '--data-path', + type=str, + default='data/test_data.h5', + help='Location of data file (.h5 format)') + + args = parser.parse_args() + # TODO: figure what knobs are tuned here by the user and pass them as arguments + # TODO: refactor commented part of hls_config, potentially adding args or default values + main(args) diff --git a/hls_conversion_config.yaml b/hls_conversion_config.yaml new file mode 100644 index 00000000..203884c3 --- /dev/null +++ b/hls_conversion_config.yaml @@ -0,0 +1,19 @@ +model_name: "trained_DeepMET" # Choose from available models + +# not implemented yet + +# HLS config parameters +config_params: + reuse-factor: 1 + strategy: "Latency" + precision: "ap_fixed<32,16>" + trace: true + + +io_type: "io_parallel" +part: "xcvu13p-flga2577-2-e" +clock_period: 5 +project_name: "L1METML_v1" +batch_size: 1 +synth: false +normFac: 1 # Identify where NormFac is used and if it can be fed via argument diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>.tar.gz b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>.tar.gz new file mode 100644 index 00000000..68bcd4a2 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>.tar.gz differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_bridge.cpp b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_bridge.cpp new file mode 100644 index 00000000..ed18b460 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_bridge.cpp @@ -0,0 +1,104 @@ +#ifndef L1METML_V1_BRIDGE_H_ +#define L1METML_V1_BRIDGE_H_ + +#include "firmware/L1METML_v1.h" +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + nnet::trace_outputs->insert(std::pair("embedding0", (void *) malloc(N_LAYER_1_3*N_LAYER_2_3 * element_size))); + nnet::trace_outputs->insert(std::pair("embedding1", (void *) malloc(N_LAYER_1_4*N_LAYER_2_4 * element_size))); + nnet::trace_outputs->insert(std::pair("concatenate", (void *) malloc(OUT_CONCAT_0_6*OUT_CONCAT_1_6 * element_size))); + nnet::trace_outputs->insert(std::pair("concatenate_1", (void *) malloc(OUT_CONCAT_0_7*OUT_CONCAT_1_7 * element_size))); + nnet::trace_outputs->insert(std::pair("dense", (void *) malloc(N_OUTPUTS_22*N_FILT_22 * element_size))); + nnet::trace_outputs->insert(std::pair("activation", (void 
*) malloc(N_LAYER_1_8*N_LAYER_2_8 * element_size))); + nnet::trace_outputs->insert(std::pair("dense_1", (void *) malloc(N_OUTPUTS_23*N_FILT_23 * element_size))); + nnet::trace_outputs->insert(std::pair("activation_1", (void *) malloc(N_LAYER_1_12*N_LAYER_2_12 * element_size))); + nnet::trace_outputs->insert(std::pair("met_weight", (void *) malloc(N_OUTPUTS_24*N_FILT_24 * element_size))); + nnet::trace_outputs->insert(std::pair("multiply", (void *) malloc(N_INPUT_1_19*N_INPUT_2_19 * element_size))); + nnet::trace_outputs->insert(std::pair("output", (void *) malloc(N_FILT_21 * element_size))); +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void L1METML_v1_float( + float input_cont[N_INPUT_1_5*N_INPUT_2_5], float input_pxpy[N_INPUT_1_19*N_INPUT_2_19], float input_cat0[N_INPUT_1_1], float input_cat1[N_INPUT_1_2], + float layer21_out[N_FILT_21] +) { + + input5_t input_cont_ap[N_INPUT_1_5*N_INPUT_2_5]; + nnet::convert_data(input_cont, input_cont_ap); + input19_t input_pxpy_ap[N_INPUT_1_19*N_INPUT_2_19]; + nnet::convert_data(input_pxpy, input_pxpy_ap); + input_t input_cat0_ap[N_INPUT_1_1]; + nnet::convert_data(input_cat0, input_cat0_ap); + input2_t input_cat1_ap[N_INPUT_1_2]; + nnet::convert_data(input_cat1, input_cat1_ap); + + result_t layer21_out_ap[N_FILT_21]; + + L1METML_v1(input_cont_ap,input_pxpy_ap,input_cat0_ap,input_cat1_ap,layer21_out_ap); + + nnet::convert_data(layer21_out_ap, layer21_out); +} + +void L1METML_v1_double( + double input_cont[N_INPUT_1_5*N_INPUT_2_5], double input_pxpy[N_INPUT_1_19*N_INPUT_2_19], double input_cat0[N_INPUT_1_1], double input_cat1[N_INPUT_1_2], + double layer21_out[N_FILT_21] +) { + input5_t input_cont_ap[N_INPUT_1_5*N_INPUT_2_5]; + nnet::convert_data(input_cont, input_cont_ap); + input19_t input_pxpy_ap[N_INPUT_1_19*N_INPUT_2_19]; + nnet::convert_data(input_pxpy, input_pxpy_ap); + input_t input_cat0_ap[N_INPUT_1_1]; + nnet::convert_data(input_cat0, input_cat0_ap); + input2_t input_cat1_ap[N_INPUT_1_2]; + nnet::convert_data(input_cat1, input_cat1_ap); + + result_t layer21_out_ap[N_FILT_21]; + + L1METML_v1(input_cont_ap,input_pxpy_ap,input_cat0_ap,input_cat1_ap,layer21_out_ap); + + nnet::convert_data(layer21_out_ap, layer21_out); +} +} + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_test.cpp b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_test.cpp new file mode 100644 index 00000000..1c452f68 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_test.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/L1METML_v1.h" +#include "firmware/nnet_utils/nnet_helpers.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +int 
main(int argc, char **argv) { + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5]; + nnet::copy_data(in, input_cont); + input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19]; + nnet::copy_data(in, input_pxpy); + input_t input_cat0[N_INPUT_1_1]; + nnet::copy_data(in, input_cat0); + input2_t input_cat1[N_INPUT_1_2]; + nnet::copy_data(in, input_cat1); + result_t layer21_out[N_FILT_21]; + + // hls-fpga-machine-learning insert top-level-function + L1METML_v1(input_cont,input_pxpy,input_cat0,input_cat1,layer21_out); + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for(int i = 0; i < N_FILT_21; i++) { + std::cout << pr[i] << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + nnet::print_result(layer21_out, std::cout, true); + } + e++; + + // hls-fpga-machine-learning insert tb-output + nnet::print_result(layer21_out, fout); + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." 
<< std::endl; + + // hls-fpga-machine-learning insert zero + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5]; + nnet::fill_zero(input_cont); + input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19]; + nnet::fill_zero(input_pxpy); + input_t input_cat0[N_INPUT_1_1]; + nnet::fill_zero(input_cat0); + input2_t input_cat1[N_INPUT_1_2]; + nnet::fill_zero(input_cat1); + result_t layer21_out[N_FILT_21]; + + // hls-fpga-machine-learning insert top-level-function + L1METML_v1(input_cont,input_pxpy,input_cat0,input_cat1,layer21_out); + + // hls-fpga-machine-learning insert output + nnet::print_result(layer21_out, std::cout, true); + + // hls-fpga-machine-learning insert tb-output + nnet::print_result(layer21_out, fout); + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_lib.sh b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_lib.sh new file mode 100644 index 00000000..d60a2dd3 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_lib.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +LDFLAGS= +INCFLAGS="-Ifirmware/ap_types/" +PROJECT=L1METML_v1 +LIB_STAMP=95715E3e + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_prj.tcl b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_prj.tcl new file mode 100644 index 00000000..82b3c5a6 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_prj.tcl @@ -0,0 +1,250 @@ +################# +# HLS4ML +################# +array set opt { + reset 0 + csim 1 + synth 1 + cosim 1 + validation 1 + export 0 + vsynth 0 + fifo_opt 0 +} + +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +proc remove_recursive_log_wave {} { + set tcldir [file dirname [info script]] + source [file join $tcldir project.tcl] + + set filename ${project_name}_prj/solution1/sim/verilog/${project_name}.tcl + set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}] + set temp $filename.new.$timestamp + # set backup $filename.bak.$timestamp + + set in [open $filename r] + set out [open $temp w] + + # line-by-line, read the original file + while {[gets $in line] != -1} { + if {[string equal "$line" "log_wave -r /"]} { + set line { } + } + puts $out $line + } + + close $in + close $out + + # move the new data to the proper filename + file delete -force $filename + file rename -force $temp $filename +} + +proc add_vcd_instructions_tcl {} { + set tcldir [file dirname [info script]] + source [file join $tcldir project.tcl] + + set filename ${project_name}_prj/solution1/sim/verilog/${project_name}.tcl + set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}] + set temp $filename.new.$timestamp + # set backup $filename.bak.$timestamp + + set in [open $filename r] + set out [open $temp w] + + # line-by-line, read the original file + while {[gets $in line] != -1} { + if {[string equal "$line" "log_wave -r /"]} { + set line {source 
"../../../../project.tcl" + if {[string equal "$backend" "vivadoaccelerator"]} { + current_scope [get_scopes -regex "/apatb_${project_name}_axi_top/AESL_inst_${project_name}_axi/${project_name}_U0.*"] + set scopes [get_scopes -regexp {layer(\d*)_.*data_0_V_U.*}] + append scopes { } + current_scope "/apatb_${project_name}_axi_top/AESL_inst_${project_name}_axi" + append scopes [get_scopes -regexp {(in_local_V_data.*_0_.*)}] + append scopes { } + append scopes [get_scopes -regexp {(out_local_V_data.*_0_.*)}] + } else { + current_scope [get_scopes -regex "/apatb_${project_name}_top/AESL_inst_${project_name}"] + set scopes [get_scopes -regexp {layer(\d*)_.*data_0_V_U.*}] + } + open_vcd fifo_opt.vcd + foreach scope $scopes { + current_scope $scope + if {[catch [get_objects usedw]] == 0} { + puts "$scope skipped" + continue + } + set usedw [get_objects usedw] + set depth [get_objects DEPTH] + add_wave $usedw + log_vcd $usedw + log_wave $usedw + add_wave $depth + log_vcd $depth + log_wave $depth + } + } + } + + if {[string equal "$line" "quit"]} { + set line {flush_vcd + close_vcd + quit + } + } + # then write the transformed line + puts $out $line + } + + close $in + close $out + + # move the new data to the proper filename + file delete -force $filename + file rename -force $temp $filename +} + +foreach arg $::argv { + foreach o [lsort [array names opt]] { + regexp "$o=+(\\w+)" $arg unused opt($o) + } +} + +proc report_time { op_name time_start time_end } { + set time_taken [expr $time_end - $time_start] + set time_s [expr ($time_taken / 1000) % 60] + set time_m [expr ($time_taken / (1000*60)) % 60] + set time_h [expr ($time_taken / (1000*60*60)) % 24] + puts "***** ${op_name} COMPLETED IN ${time_h}h${time_m}m${time_s}s *****" +} + +# Compare file content: 1 = same, 0 = different +proc compare_files {file_1 file_2} { + # Check if files exist, error otherwise + if {! 
([file exists $file_1] && [file exists $file_2])} { + return 0 + } + # Files with different sizes are obviously different + if {[file size $file_1] != [file size $file_2]} { + return 0 + } + + # String compare the content of the files + set fh_1 [open $file_1 r] + set fh_2 [open $file_2 r] + set equal [string equal [read $fh_1] [read $fh_2]] + close $fh_1 + close $fh_2 + return $equal +} + +file mkdir tb_data +set CSIM_RESULTS "./tb_data/csim_results.log" +set RTL_COSIM_RESULTS "./tb_data/rtl_cosim_results.log" + +if {$opt(reset)} { + open_project -reset ${project_name}_prj +} else { + open_project ${project_name}_prj +} +set_top ${project_name} +add_files firmware/${project_name}.cpp -cflags "-std=c++0x" +add_files -tb ${project_name}_test.cpp -cflags "-std=c++0x" +add_files -tb firmware/weights +add_files -tb tb_data +if {$opt(reset)} { + open_solution -reset "solution1" +} else { + open_solution "solution1" +} +catch {config_array_partition -maximum_size 8192} +config_compile -name_max_length 80 +set_part $part +config_schedule -enable_dsp_full_reg=false +create_clock -period $clock_period -name default +set_clock_uncertainty $clock_uncertainty default + + +if {$opt(csim)} { + puts "***** C SIMULATION *****" + set time_start [clock clicks -milliseconds] + csim_design + set time_end [clock clicks -milliseconds] + report_time "C SIMULATION" $time_start $time_end +} + +if {$opt(synth)} { + puts "***** C/RTL SYNTHESIS *****" + set time_start [clock clicks -milliseconds] + csynth_design + set time_end [clock clicks -milliseconds] + report_time "C/RTL SYNTHESIS" $time_start $time_end +} + +if {$opt(cosim)} { + puts "***** C/RTL SIMULATION *****" + # TODO: This is a workaround (Xilinx defines __RTL_SIMULATION__ only for SystemC testbenches). + add_files -tb ${project_name}_test.cpp -cflags "-std=c++0x -DRTL_SIM" + set time_start [clock clicks -milliseconds] + + cosim_design -trace_level all -setup + + if {$opt(fifo_opt)} { + puts "\[hls4ml\] - FIFO optimization started" + add_vcd_instructions_tcl + } + + remove_recursive_log_wave + set old_pwd [pwd] + cd ${project_name}_prj/solution1/sim/verilog/ + source run_sim.tcl + cd $old_pwd + + set time_end [clock clicks -milliseconds] + puts "INFO:" + if {[string equal "$backend" "vivadoaccelerator"]} { + puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] + } else { + puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] + } + report_time "C/RTL SIMULATION" $time_start $time_end +} + +if {$opt(validation)} { + puts "***** C/RTL VALIDATION *****" + if {[compare_files $CSIM_RESULTS $RTL_COSIM_RESULTS]} { + puts "INFO: Test PASSED" + } else { + puts "ERROR: Test failed" + puts "ERROR: - csim log: $CSIM_RESULTS" + puts "ERROR: - RTL-cosim log: $RTL_COSIM_RESULTS" + exit 1 + } +} + +if {$opt(export)} { + puts "***** EXPORT IP *****" + set time_start [clock clicks -milliseconds] + export_design -format ip_catalog -version $version + set time_end [clock clicks -milliseconds] + report_time "EXPORT IP" $time_start $time_end +} + +if {$opt(vsynth)} { + puts "***** VIVADO SYNTHESIS *****" + if {[file exist ${project_name}_prj/solution1/syn/vhdl]} { + set time_start [clock clicks -milliseconds] + exec vivado -mode batch -source vivado_synth.tcl >@ stdout + set time_end [clock clicks -milliseconds] + report_time "VIVADO SYNTHESIS" $time_start $time_end + } else { + puts "ERROR: Cannot find generated VHDL files. Did you run C synthesis?" 
+ exit 1 + } +} + +exit diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-2Bd4CD9f.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-2Bd4CD9f.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-2Bd4CD9f.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-87B65ff2.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-87B65ff2.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-87B65ff2.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-8aEF503a.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-8aEF503a.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-8aEF503a.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-95715E3e.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-95715E3e.so new file mode 100755 index 00000000..7b00d3fa Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-95715E3e.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-B1BDE0dd.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-B1BDE0dd.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-B1BDE0dd.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-CEB54420.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-CEB54420.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-CEB54420.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-F1DF32D7.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-F1DF32D7.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-F1DF32D7.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-dDAfeD3b.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-dDAfeD3b.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-dDAfeD3b.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-ecB7D1bC.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-ecB7D1bC.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-ecB7D1bC.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.cpp b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.cpp new file mode 100644 index 00000000..18ef7438 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.cpp @@ -0,0 +1,117 @@ +#include + +#include "L1METML_v1.h" +#include "parameters.h" + +void L1METML_v1( + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5], input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19], input_t input_cat0[N_INPUT_1_1], input2_t input_cat1[N_INPUT_1_2], + result_t layer21_out[N_FILT_21] +) { + + // hls-fpga-machine-learning insert IO + #pragma HLS ARRAY_RESHAPE variable=input_cont complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=input_pxpy complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=input_cat0 complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=input_cat1 complete dim=0 + #pragma HLS ARRAY_PARTITION variable=layer21_out complete dim=0 + #pragma HLS INTERFACE ap_vld port=input_cont,input_pxpy,input_cat0,input_cat1,layer21_out + #pragma HLS DATAFLOW + +#ifndef __SYNTHESIS__ + static bool loaded_weights = false; + if (!loaded_weights) { + // hls-fpga-machine-learning insert load weights + nnet::load_weights_from_txt(e3, "e3.txt"); + nnet::load_weights_from_txt(e4, "e4.txt"); + nnet::load_weights_from_txt(w22, "w22.txt"); + nnet::load_weights_from_txt(b22, "b22.txt"); + nnet::load_weights_from_txt(w23, "w23.txt"); + nnet::load_weights_from_txt(b23, "b23.txt"); + nnet::load_weights_from_txt(w24, "w24.txt"); + nnet::load_weights_from_txt(b24, "b24.txt"); + loaded_weights = true; + } +#endif + + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning insert layers + + layer3_t layer3_out[N_LAYER_1_3*N_LAYER_2_3]; + #pragma HLS ARRAY_PARTITION variable=layer3_out complete dim=0 + nnet::embedding(input_cat0, layer3_out, e3); // embedding0 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer3_out, "embedding0", N_LAYER_1_3*N_LAYER_2_3); +#endif + + layer4_t layer4_out[N_LAYER_1_4*N_LAYER_2_4]; + #pragma HLS ARRAY_PARTITION variable=layer4_out complete dim=0 + nnet::embedding(input_cat1, layer4_out, e4); // embedding1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer4_out, "embedding1", N_LAYER_1_4*N_LAYER_2_4); +#endif + + layer6_t layer6_out[OUT_CONCAT_0_6*OUT_CONCAT_1_6]; + #pragma HLS ARRAY_PARTITION variable=layer6_out complete dim=0 + nnet::concatenate2d(layer3_out, layer4_out, layer6_out); // concatenate +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer6_out, "concatenate", OUT_CONCAT_0_6*OUT_CONCAT_1_6); +#endif + + layer7_t layer7_out[OUT_CONCAT_0_7*OUT_CONCAT_1_7]; + #pragma HLS ARRAY_PARTITION variable=layer7_out complete dim=0 + nnet::concatenate2d(input_cont, layer6_out, layer7_out); // concatenate_1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer7_out, "concatenate_1", OUT_CONCAT_0_7*OUT_CONCAT_1_7); +#endif + + layer22_t layer22_out[N_OUTPUTS_22*N_FILT_22]; + #pragma HLS ARRAY_PARTITION variable=layer22_out complete dim=0 + nnet::pointwise_conv_1d_cl(layer7_out, layer22_out, w22, b22); // dense +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer22_out, "dense", N_OUTPUTS_22*N_FILT_22); +#endif + + layer11_t layer11_out[N_LAYER_1_8*N_LAYER_2_8]; + #pragma HLS ARRAY_PARTITION variable=layer11_out 
complete dim=0 + nnet::tanh(layer22_out, layer11_out); // activation +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer11_out, "activation", N_LAYER_1_8*N_LAYER_2_8); +#endif + + layer23_t layer23_out[N_OUTPUTS_23*N_FILT_23]; + #pragma HLS ARRAY_PARTITION variable=layer23_out complete dim=0 + nnet::pointwise_conv_1d_cl(layer11_out, layer23_out, w23, b23); // dense_1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer23_out, "dense_1", N_OUTPUTS_23*N_FILT_23); +#endif + + layer15_t layer15_out[N_LAYER_1_12*N_LAYER_2_12]; + #pragma HLS ARRAY_PARTITION variable=layer15_out complete dim=0 + nnet::tanh(layer23_out, layer15_out); // activation_1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer15_out, "activation_1", N_LAYER_1_12*N_LAYER_2_12); +#endif + + layer24_t layer24_out[N_OUTPUTS_24*N_FILT_24]; + #pragma HLS ARRAY_PARTITION variable=layer24_out complete dim=0 + nnet::pointwise_conv_1d_cl(layer15_out, layer24_out, w24, b24); // met_weight +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer24_out, "met_weight", N_OUTPUTS_24*N_FILT_24); +#endif + + layer20_t layer20_out[N_INPUT_1_19*N_INPUT_2_19]; + #pragma HLS ARRAY_PARTITION variable=layer20_out complete dim=0 + nnet::multiply(layer24_out, input_pxpy, layer20_out); // multiply +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer20_out, "multiply", N_INPUT_1_19*N_INPUT_2_19); +#endif + + nnet::global_pooling1d_cl(layer20_out, layer21_out); // output +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer21_out, "output", N_FILT_21); +#endif + +} diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.h new file mode 100644 index 00000000..69dd92ca --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.h @@ -0,0 +1,16 @@ +#ifndef L1METML_V1_H_ +#define L1METML_V1_H_ + +#include "ap_fixed.h" +#include "ap_int.h" +#include "hls_stream.h" + +#include "defines.h" + +// Prototype of top level function for C-synthesis +void L1METML_v1( + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5], input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19], input_t input_cat0[N_INPUT_1_1], input2_t input_cat1[N_INPUT_1_2], + result_t layer21_out[N_FILT_21] +); + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_common.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_common.h new file mode 100644 index 00000000..4d2886cb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_common.h @@ -0,0 +1,376 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_COMMON_H__ +#define __AP_COMMON_H__ + +// ---------------------------------------------------------------------- + +// Forward declaration of all AP types. 
+#include + + +#ifdef __SYNTHESIS__ +#error "The open-source version of AP types does not support synthesis." +#endif // ifdef __SYNTHESIS__ +#define _AP_ENABLE_HALF_ 0 + + +#if _AP_ENABLE_HALF_ == 1 +// Before ap_private definition. +#ifdef __SYNTHESIS__ +#define _HLS_HALF_DEFINED_ +typedef __fp16 half; +#else +class half; +#endif // __SYNTHESIS__ +#endif // _AP_ENABLE_HALF_ + +// ---------------------------------------------------------------------- + +// Macro functions +#define AP_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define AP_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define AP_ABS(a) ((a) >= 0 ? (a) : -(a)) + +#ifndef AP_ASSERT +#ifndef __SYNTHESIS__ +#include +#define AP_ASSERT(cond, msg) assert((cond) && (msg)) +#else +#define AP_ASSERT(cond, msg) +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_ASSERT + +#ifndef __SYNTHESIS__ +// for fprintf messages. +#include +// for exit on error. +#include +#endif + +// same disable condition as assert. +#if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define _AP_DEBUG(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "DEBUG: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_WARNING(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "WARNING: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_ERROR(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "ERROR: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + abort(); \ + } \ + } while (0) + +#else // if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define __AP_VOID_CAST static_cast +#define _AP_DEBUG(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_WARNING(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_ERROR(cond, ...) (__AP_VOID_CAST(0)) + +#endif // if !defined(__SYNTHESIS__) && !defined(NDEBUG) else + +// ---------------------------------------------------------------------- + +// Attribute only for synthesis +#ifdef __SYNTHESIS__ +#define INLINE inline __attribute__((always_inline)) +//#define INLINE inline __attribute__((noinline)) +#else +#define INLINE inline +#endif + +#define AP_WEAK +// __attribute__((weak)) + +#ifndef AP_INT_MAX_W +#define AP_INT_MAX_W 1024 +#endif + +#define BIT_WIDTH_UPPER_LIMIT (1 << 15) +#if AP_INT_MAX_W > BIT_WIDTH_UPPER_LIMIT +#error "Bitwidth exceeds 32768 (1 << 15), the maximum allowed value" +#endif + +#define MAX_MODE(BITS) ((BITS + 1023) / 1024) + +// ---------------------------------------------------------------------- + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +// for overload operator<< +#include +#endif +#endif // ifndef AP_AUTOCC + +#ifndef __SYNTHESIS__ +// for string format. +#include +// for string. +#include +#endif + +// for detecting if char is signed. +enum { CHAR_IS_SIGNED = (char)-1 < 0 }; + +// TODO we have similar traits in x_hls_utils.h, should consider unify. 
+namespace _ap_type { +template +struct is_signed { + static const bool value = _Tp(-1) < _Tp(1); +}; + +template +struct is_integral { + static const bool value = false; +}; +#define DEF_IS_INTEGRAL(CTYPE) \ + template <> \ + struct is_integral { \ + static const bool value = true; \ + }; +DEF_IS_INTEGRAL(bool) +DEF_IS_INTEGRAL(char) +DEF_IS_INTEGRAL(signed char) +DEF_IS_INTEGRAL(unsigned char) +DEF_IS_INTEGRAL(short) +DEF_IS_INTEGRAL(unsigned short) +DEF_IS_INTEGRAL(int) +DEF_IS_INTEGRAL(unsigned int) +DEF_IS_INTEGRAL(long) +DEF_IS_INTEGRAL(unsigned long) +DEF_IS_INTEGRAL(ap_slong) +DEF_IS_INTEGRAL(ap_ulong) +#undef DEF_IS_INTEGRAL + +template +struct enable_if {}; +// partial specialization for true +template +struct enable_if { + typedef _Tp type; +}; + +template +struct remove_const { + typedef _Tp type; +}; + +template +struct remove_const<_Tp const> { + typedef _Tp type; +}; +} // namespace _ap_type + +// ---------------------------------------------------------------------- + +// Define ssdm_int and _ssdm_op. +// XXX deleted in open-source version + +#ifndef NON_C99STRING +#define _AP_C99 true +#else +#define _AP_C99 false +#endif + +static inline unsigned char guess_radix(const char* s) { + unsigned char rd = 10; ///< default radix + const char* p = s; + // skip neg sign if it exists + if (p[0] == '-' || p[0] == '+') ++p; + // guess based on following two bits. + if (p[0] == '0') { + if (p[1] == 'b' || p[1] == 'B') { + rd = 2; + } else if (p[1] == 'o' || p[1] == 'O') { + rd = 8; + } else if (p[1] == 'x' || p[1] == 'X') { + rd = 16; + } else if (p[1] == 'd' || p[1] == 'D') { + rd = 10; + } + } + return rd; +} + +// ---------------------------------------------------------------------- + +// Basic integral struct upon which ap_int and ap_fixed are defined. +#ifdef __SYNTHESIS__ +// Use ssdm_int, a compiler dependent, attribute constrained integeral type as +// basic data type. +#define _AP_ROOT_TYPE ssdm_int +// Basic ops. +#define _AP_ROOT_op_concat(Ret, X, Y) _ssdm_op_concat(Ret, X, Y) +#define _AP_ROOT_op_get_bit(Val, Bit) _ssdm_op_get_bit(Val, Bit) +#define _AP_ROOT_op_set_bit(Val, Bit, Repl) _ssdm_op_set_bit(Val, Bit, Repl) +#define _AP_ROOT_op_get_range(Val, Lo, Hi) _ssdm_op_get_range(Val, Lo, Hi) +#define _AP_ROOT_op_set_range(Val, Lo, Hi, Repl) \ + _ssdm_op_set_range(Val, Lo, Hi, Repl) +#define _AP_ROOT_op_reduce(Op, Val) _ssdm_op_reduce(Op, Val) +#else // ifdef __SYNTHESIS__ +// Use ap_private for compiler-independent basic data type +template +class ap_private; +/// model ssdm_int in standard C++ for simulation. +template +struct ssdm_int_sim { + /// integral type with template-specified width and signedness. + ap_private<_AP_W, _AP_S> V; + ssdm_int_sim() {} +}; +#define _AP_ROOT_TYPE ssdm_int_sim +// private's ref uses _AP_ROOT_TYPE. +#include +// XXX The C-sim model cannot use GCC-extension +// Basic ops. Ret and Val are ap_private. +template +inline _Tp1 _AP_ROOT_op_concat(const _Tp1& Ret, const _Tp2& X, const _Tp3& Y) { + _Tp1 r = (X).operator,(Y); + return r; +} +#define _AP_ROOT_op_get_bit(Val, Bit) (Val).get_bit((Bit)) +template +inline _Tp1& _AP_ROOT_op_set_bit(_Tp1& Val, const _Tp2& Bit, const _Tp3& Repl) { + (Val).set_bit((Bit), (Repl)); + return Val; +} +// notice the order of high and low index is different in ssdm call and +// ap_private.range()... 
+#define _AP_ROOT_op_get_range(Val, Lo, Hi) (Val).range((Hi), (Lo)) +template +inline _Tp1& _AP_ROOT_op_set_range(_Tp1& Val, const _Tp2& Lo, const _Tp3& Hi, + const _Tp4& Repl) { + (Val).range((Hi), (Lo)) = Repl; + return (Val); +} +#define _AP_ROOT_op_and_reduce(Val) (Val).and_reduce() +#define _AP_ROOT_op_nand_reduce(Val) (Val).nand_reduce() +#define _AP_ROOT_op_or_reduce(Val) (Val).or_reduce() +#define _AP_ROOT_op_xor_reduce(Val) (Val).xor_reduce() +// ## is the concatenation in preprocessor: +#define _AP_ROOT_op_reduce(Op, Val) _AP_ROOT_op_##Op##_reduce(Val) +#endif // ifdef __SYNTHESIS__ else + +// ---------------------------------------------------------------------- + +// Constants for half, single, double pricision floating points +#define HALF_MAN 10 +#define FLOAT_MAN 23 +#define DOUBLE_MAN 52 + +#define HALF_EXP 5 +#define FLOAT_EXP 8 +#define DOUBLE_EXP 11 + +#define BIAS(e) ((1L << (e - 1L)) - 1L) +#define HALF_BIAS BIAS(HALF_EXP) +#define FLOAT_BIAS BIAS(FLOAT_EXP) +#define DOUBLE_BIAS BIAS(DOUBLE_EXP) + +#define APFX_IEEE_DOUBLE_E_MAX DOUBLE_BIAS +#define APFX_IEEE_DOUBLE_E_MIN (-DOUBLE_BIAS + 1) + +INLINE ap_ulong doubleToRawBits(double pf) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +INLINE unsigned int floatToRawBits(float pf) { + union { + unsigned int __L; + float __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +#if _AP_ENABLE_HALF_ == 1 +INLINE unsigned short halfToRawBits(half pf) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__D = pf; + return LD.__L; +#else + return pf.get_bits(); +#endif +} +#endif + +// usigned long long is at least 64-bit +INLINE double rawBitsToDouble(ap_ulong pi) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +// long is at least 32-bit +INLINE float rawBitsToFloat(unsigned long pi) { + union { + unsigned int __L; + float __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +#if _AP_ENABLE_HALF_ == 1 +// short is at least 16-bit +INLINE half rawBitsToHalf(unsigned short pi) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__L = pi; + return LD.__D; +#else + // sim model of half has a non-trivial constructor + half __D; + __D.set_bits(pi); + return __D; +#endif +} +#endif + +#endif // ifndef __AP_COMMON_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_decl.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_decl.h new file mode 100644 index 00000000..ddd00f1c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_decl.h @@ -0,0 +1,212 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __AP_DECL_H__ +#define __AP_DECL_H__ + +// ---------------------------------------------------------------------- + +#if !defined(__AP_FIXED_H__) && !defined(__AP_INT_H__) && !defined(__AUTOPILOT_CBE_H__) && !defined(__HLS_HALF_H__) +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// Test __SYNTHESIS__ only for mode +#if !defined(__SYNTHESIS__) && (defined(AESL_SYN) || defined(__HLS_SYN__)) +//#pragma message "AESL_SYN and __HLS_SYN__ should be replaced by __SYNTHESIS__" +#define __SYNTHESIS__ +#endif + +/* for safety*/ +#if (defined(_AP_N) || defined(_AP_C)) +#error One or more of the following is defined: _AP_N, _AP_C. Definition conflicts with their usage as template parameters. +#endif + +/* for safety*/ +#if (defined(_AP_W) || defined(_AP_I) || defined(_AP_S) || defined(_AP_Q) || \ + defined(_AP_O) || defined(_AP_W2) || defined(_AP_I2) || \ + defined(_AP_S2) || defined(_AP_Q2) || defined(_AP_O2) || \ + defined(_AP_N) || defined(_AP_N2)) +#error \ + "One or more of the following is defined: _AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N, _AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2. Definition conflicts with their usage as template parameters." +#endif + +/*for safety*/ +#if (defined(_AP_W3) || defined(_AP_S3) || defined(_AP_W4) || defined(_AP_S4)) +#error \ + "One or more of the following is defined: _AP_W3, _AP_S3, _AP_W4,_AP_S4. Definition conflicts with their usage as template parameters." +#endif + +#if (defined(_AP_W1) || defined(_AP_S1) || defined(_AP_T) || \ + defined(_AP_T1) || defined(_AP_T2) || defined(_AP_T3) || defined(_AP_T4)) +#error \ + "One or more of the following is defined: _AP_W1, _AP_S1, _AP_T, _AP_T1, _AP_T2, _AP_T3, _AP_T4. Definition conflicts with their usage as template parameters." +#endif + +#ifndef __cplusplus +#error "AP data type can only be used in C++" +#endif + +// ---------------------------------------------------------------------- + +#ifndef __SC_COMPATIBLE__ +/// ap_fixed quantification mode +enum ap_q_mode { + AP_RND, //< rounding to plus infinity + AP_RND_ZERO, //< rounding to zero + AP_RND_MIN_INF, //< rounding to minus infinity + AP_RND_INF, //< rounding to infinity + AP_RND_CONV, //< convergent rounding + AP_TRN, //< truncation + AP_TRN_ZERO, //< truncation to zero +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_RND AP_RND +#define SC_RND_ZERO AP_RND_ZERO +#define SC_RND_MIN_INF AP_RND_MIN_INF +#define SC_RND_INF AP_RND_INF +#define SC_RND_CONV AP_RND_CONV +#define SC_TRN AP_TRN +#define SC_TRN_ZERO AP_TRN_ZERO +#endif // !defined(SYSTEMC_INCLUDED) + +/// ap_fixed saturation mode +enum ap_o_mode { + AP_SAT, //< saturation + AP_SAT_ZERO, //< saturation to zero + AP_SAT_SYM, //< symmetrical saturation + AP_WRAP, //< wrap-around (*) + AP_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_SAT AP_SAT +#define SC_SAT_ZERO AP_SAT_ZERO +#define SC_SAT_SYM AP_SAT_SYM +#define SC_WRAP AP_WRAP +#define SC_WRAP_SM AP_WRAP_SM +#endif // !defined(SYSTEMC_INCLUDED) + +#else // defined(__SC_COMPATIBLE__) + +// There will not be sc_fxdefs.h, and the emu should be defined by ap_fixed. 
+ +/// ap_fixed quantification mode +enum ap_q_mode { + SC_RND, //< rounding to plus infinity + SC_RND_ZERO, //< rounding to zero + SC_RND_MIN_INF, //< rounding to minus infinity + SC_RND_INF, //< rounding to infinity + SC_RND_CONV, //< convergent rounding + SC_TRN, //< truncation + SC_TRN_ZERO, //< truncation to zero +}; + +#define AP_RND SC_RND +#define AP_RND_ZERO SC_RND_ZERO +#define AP_RND_MIN_INF SC_RND_MIN_INF +#define AP_RND_INF SC_RND_INF +#define AP_RND_CONV SC_RND_CONV +#define AP_TRN SC_TRN +#define AP_TRN_ZERO SC_TRN_ZERO + +/// ap_fixed saturation mode +enum ap_o_mode { + SC_SAT, //< saturation + SC_SAT_ZERO, //< saturation to zero + SC_SAT_SYM, //< symmetrical saturation + SC_WRAP, //< wrap-around (*) + SC_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +#define AP_SAT SC_SAT +#define AP_SAT_ZERO SC_SAT_ZERO +#define AP_SAT_SYM SC_SAT_SYM +#define AP_WRAP SC_WRAP +#define AP_WRAP_SM SC_WRAP_SM + +#endif // defined(__SC_COMPATIBLE__) + +template +struct ap_int_base; + +template +struct ap_int; + +template +struct ap_uint; + +template +struct ap_range_ref; + +template +struct ap_bit_ref; + +template +struct ap_concat_ref; + +template +struct ap_fixed_base; + +template +struct ap_fixed; + +template +struct ap_ufixed; + +template +struct af_range_ref; + +template +struct af_bit_ref; + +/// string base mode +enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; + +#ifndef SYSTEMC_INCLUDED +#define SC_BIN 2 +#define SC_OCT 8 +#define SC_DEC 10 +#define SC_HEX 16 +#endif // !defined(SYSTEMC_INCLUDED) + +// Alias C data types +#ifdef _MSC_VER +typedef signed __int64 ap_slong; +typedef unsigned __int64 ap_ulong; +#else // !defined(_MSC_VER) +typedef signed long long ap_slong; +typedef unsigned long long ap_ulong; +#endif // !defined(_MSC_VER) + +enum { + _AP_SIZE_char = 8, + _AP_SIZE_short = sizeof(short) * 8, + _AP_SIZE_int = sizeof(int) * 8, + _AP_SIZE_long = sizeof(long) * 8, + _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 +}; + +#endif // !defined(__AP_DECL_H__) + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed.h new file mode 100644 index 00000000..cd0192bc --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed.h @@ -0,0 +1,360 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_H__ +#define __AP_FIXED_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Signed Arbitrary Precision Fixed-Point Type. 
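// NOTE (editorial annotation, not part of the vendored header): ap_fixed
// below and ap_ufixed further down differ only in the signedness flag they
// pass to ap_fixed_base.  For orientation, the value grids they describe:
//
//   ap_fixed<8, 3>   // step 2^-5, range [-4, 4 - 2^-5]
//   ap_ufixed<8, 3>  // step 2^-5, range [ 0, 8 - 2^-5]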
+// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_fixed() : Base() {} + + /// default copy ctor + INLINE ap_fixed(const ap_fixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base. + template + INLINE ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //// from ap_fixed + //template + //INLINE ap_fixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_fixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //// from ap_ufixed. + //template + //INLINE ap_fixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_fixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. + template + INLINE ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //// from ap_int. + //template + //INLINE ap_fixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //// from ap_uint. + //template + //INLINE ap_fixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + // from ap_bit_ref. + template + INLINE ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_range_ref. + template + INLINE ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_concat_ref. + template + INLINE ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + // from af_bit_ref. + template + INLINE ap_fixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + // from af_range_ref. + template + INLINE ap_fixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +// from c types. +#define CTOR(TYPE) \ + INLINE ap_fixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_fixed(const char* s) : Base(s) {} + + INLINE ap_fixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + // The assignment operator is technically inherited; however, it is always + // hidden by an explicitly or implicitly defined assignment operator for the + // derived class. + /* XXX ctor will be used when right is not of proper type. 
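     (Editorial note: for instance, `a = 5` on an ap_fixed<8, 3> first builds
     a temporary ap_fixed from 5 through the CTOR(int) converting constructor
     above, and only then runs the same-type operator= below.)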
*/ + INLINE ap_fixed& operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_fixed& operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_fixed. + +//------------------------------------------------------------------- + +// Unsigned Arbitrary Precision Fixed-Point Type. +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_ufixed : ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_ufixed() : Base() {} + + /// default copy ctor + INLINE ap_ufixed(const ap_ufixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //template + //INLINE ap_ufixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_ufixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. 
+ template + INLINE ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //template + //INLINE ap_ufixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + template + INLINE ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_ufixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_ufixed(const char* s) : Base(s) {} + + INLINE ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + INLINE ap_ufixed& operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_ufixed& operator=( + const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, + _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_ufixed + + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +// Specialization of std containers, so that std::complex can have its +// image part automatically zero-initialized when only real part is provided. +#include + +#endif // ifndef __AP_FIXED_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_base.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_base.h new file mode 100644 index 00000000..1d94b938 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_base.h @@ -0,0 +1,2354 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_BASE_H__ +#define __AP_FIXED_BASE_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// for ap_int_base and its reference types. +#include +#ifndef __SYNTHESIS__ +#if _AP_ENABLE_HALF_ == 1 +// for half type +#include +#endif +// for std io +#include +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" +#else // __cplusplus + +// for warning on unsupported rounding mode in conversion to float/double. +#if !defined(__SYNTHESIS__) && __cplusplus >= 201103L && \ + (defined(__gnu_linux__) || defined(_WIN32)) +#define AP_FIXED_ENABLE_CPP_FENV 1 +#include +#endif + +// ---------------------------------------------------------------------- + +/* Major TODO + long double support: constructor, assign and other operators. + binary operators with ap_fixed_base and const char*. + return ap_fixed/ap_ufixed when result signedness is known. +*/ + +// Helper function in conversion to floating point types. + +#ifdef __SYNTHESIS__ +#define _AP_ctype_op_get_bit(var, index) _AP_ROOT_op_get_bit(var, index) +#define _AP_ctype_op_set_bit(var, index, x) _AP_ROOT_op_set_bit(var, index, x) +#define _AP_ctype_op_get_range(var, low, high) \ + _AP_ROOT_op_get_range(var, low, high) +#define _AP_ctype_op_set_range(var, low, high, x) \ + _AP_ROOT_op_set_range(var, low, high, x) +#else // ifdef __SYNTHESIS__ +template +inline bool _AP_ctype_op_get_bit(_Tp1& var, const _Tp2& index) { + return !!(var & (1ull << (index))); +} +template +inline _Tp1 _AP_ctype_op_set_bit(_Tp1& var, const _Tp2& index, const _Tp3& x) { + var |= (((x) ? 1ull : 0ull) << (index)); + return var; +} +template +inline _Tp1 _AP_ctype_op_get_range(_Tp1& var, const _Tp2& low, + const _Tp3& high) { + _Tp1 r = var; + ap_ulong mask = -1ll; + mask >>= (sizeof(_Tp1) * 8 - ((high) - (low) + 1)); + r >>= (low); + r &= mask; + return r; +} +template +inline _Tp1 _AP_ctype_op_set_range(_Tp1& var, const _Tp2& low, const _Tp3& high, + const _Tp4& x) { + ap_ulong mask = -1ll; + mask >>= (_AP_SIZE_ap_slong - ((high) - (low) + 1)); + var &= ~(mask << (low)); + var |= ((mask & x) << (low)); + return var; +} +#endif // ifdef __SYNTHESIS__ + + +// trait for letting base class to return derived class. +// Notice that derived class template is incomplete, and we cannot use +// the member of the derived class. +template +struct _ap_fixed_factory; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, true> { + typedef ap_fixed<_AP_W2, _AP_I2> type; +}; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, false> { + typedef ap_ufixed<_AP_W2, _AP_I2> type; +}; + +/// ap_fixed_base: AutoPilot fixed point. +/** partial specialization of signed. + @tparam _AP_W width. + @tparam _AP_I integral part width. + @tparam _AP_S signed. + @tparam _AP_Q quantization mode. Default is AP_TRN. + @tparam _AP_O saturation mode. Default is AP_WRAP. + @tparam _AP_N saturation wrap value. Default is 0. 
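   Illustrative instantiation (editorial note, not in the original header):
   ap_fixed_base<8, 3, true, AP_RND, AP_SAT> is an 8-bit signed value with
   3 integer bits that rounds toward plus infinity on quantization and
   saturates on overflow.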
+ */ +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed_base : _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + static const int width = _AP_W; + static const int iwidth = _AP_I; + static const ap_q_mode qmode = _AP_Q; + static const ap_o_mode omode = _AP_O; + + /// Return type trait. + template + struct RType { + enum { + _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + mult_w = _AP_W + _AP_W2, + mult_i = _AP_I + _AP_I2, + mult_s = _AP_S || _AP_S2, + plus_w = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + + 1 + AP_MAX(_AP_F, F2), + plus_i = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1 + + AP_MAX(_AP_F, F2), + minus_i = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, +#ifndef __SC_COMPATIBLE__ + div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0), +#else + div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), +#endif + div_i = _AP_S2 + _AP_I + F2, + div_s = _AP_S || _AP_S2, + logic_w = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + + AP_MAX(_AP_F, F2), + logic_i = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs; + typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs; + + typedef ap_fixed_base mult_base; + typedef ap_fixed_base plus_base; + typedef ap_fixed_base minus_base; + typedef ap_fixed_base logic_base; + typedef ap_fixed_base div_base; + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base; + + typedef typename _ap_fixed_factory::type mult; + typedef typename _ap_fixed_factory::type plus; + typedef typename _ap_fixed_factory::type minus; + typedef typename _ap_fixed_factory::type logic; + typedef typename _ap_fixed_factory::type div; + typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1; + }; + + private: +#ifndef __SYNTHESIS__ + // This cannot handle hex float format string. + void fromString(const std::string& val, unsigned char radix) { + _AP_ERROR(!(radix == 2 || radix == 8 || radix == 10 || radix == 16), + "ap_fixed_base::fromString(%s, %d)", val.c_str(), radix); + + Base::V = 0; + int startPos = 0; + int endPos = val.length(); + int decPos = val.find("."); + if (decPos == -1) decPos = endPos; + + // handle sign + bool isNegative = false; + if (val[0] == '-') { + isNegative = true; + ++startPos; + } else if (val[0] == '+') + ++startPos; + + // If there are no integer bits, e.g.: + // .0000XXXX, then keep at least one bit. + // If the width is greater than the number of integer bits, e.g.: + // XXXX.XXXX, then we keep the integer bits + // if the number of integer bits is greater than the width, e.g.: + // XXX000 then we keep the integer bits. + // Always keep one bit. + ap_fixed_base + integer_bits = 0; + + // Figure out if we can shift instead of multiply + unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); + + //std::cout << "\n\n" << val << "\n"; + //std::cout << startPos << " " << decPos << " " << endPos << "\n"; + + bool sticky_int = false; + + // Traverse the integer digits from the MSD, multiplying by radix as we go. 
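    // NOTE (editorial): classic positional accumulation; for "25" in
    // radix 10 the loop computes ((0 * 10) + 2) * 10 + 5 = 25.  The
    // sticky_int flag below appears to record high-order bits that are
    // about to be shifted out, so overflow is still detectable afterwards.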
+ for (int i = startPos; i < decPos; i++) { + // Get a digit + char cdigit = val[i]; + if (cdigit == '\0') continue; + unsigned digit = ap_private_ops::decode_digit(cdigit, radix); + + sticky_int |= integer_bits[AP_MAX(_AP_I, 4) + 4 - 1] | + integer_bits[AP_MAX(_AP_I, 4) + 4 - 2] | + integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | + integer_bits[AP_MAX(_AP_I, 4) + 4 - 4]; + // Shift or multiply the value by the radix + if (shift) + integer_bits <<= shift; + else + integer_bits *= radix; + + // Add in the digit we just interpreted + integer_bits += digit; + //std::cout << "idigit = " << digit << " " << integer_bits.to_string() + // << " " << sticky_int << "\n"; + } + integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] = + integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | sticky_int; + + ap_fixed_base fractional_bits = 0; + bool sticky = false; + + // Traverse the fractional digits from the LSD, dividing by radix as we go. + for (int i = endPos - 1; i >= decPos + 1; i--) { + // Get a digit + char cdigit = val[i]; + if (cdigit == '\0') continue; + unsigned digit = ap_private_ops::decode_digit(cdigit, radix); + // Add in the digit we just interpreted + fractional_bits += digit; + + sticky |= fractional_bits[0] | fractional_bits[1] | fractional_bits[2] | + fractional_bits[3]; + // Shift or divide the value by the radix + if (shift) + fractional_bits >>= shift; + else + fractional_bits /= radix; + + //std::cout << "fdigit = " << digit << " " << fractional_bits.to_string() + // << " " << sticky << "\n"; + } + + //std::cout << "Int =" << integer_bits.to_string() << " " << + // fractional_bits.to_string() << "\n"; + + fractional_bits[0] = fractional_bits[0] | sticky; + + if (isNegative) + *this = -(integer_bits + fractional_bits); + else + *this = integer_bits + fractional_bits; + + //std::cout << "end = " << this->to_string(16) << "\n"; + } + + /// report invalid constrction of ap_fixed_base + INLINE void report() { + if (!_AP_S && _AP_O == AP_WRAP_SM) { + fprintf(stderr, "ap_ufxied<...> cannot support AP_WRAP_SM.\n"); + exit(1); + } + if (_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024) { + fprintf(stderr, + "[E] ap_%sfixed<%d, ...>: Bitwidth exceeds the " + "default max value %d. Please use macro " + "AP_INT_MAX_W to set a larger max value.\n", + _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024); + exit(1); + } + } +#else + INLINE void report() {} +#endif // ifdef __SYNTHESIS__ + + /// @name helper functions. 
+ // @{ + INLINE void overflow_adjust(bool underflow, bool overflow, bool lD, + bool sign) { + if (!underflow && !overflow) return; + if (_AP_O == AP_WRAP) { + if (_AP_N == 0) return; + if (_AP_S) { + // signed AP_WRAP + // n_bits == 1 + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + if (_AP_N > 1) { + // n_bits > 1 + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + } else { + // unsigned AP_WRAP + ap_int_base<_AP_W, false> mask(-1); + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 1, mask.V); + } + } else if (_AP_O == AP_SAT_ZERO) { + Base::V = 0; + } else if (_AP_O == AP_WRAP_SM && _AP_S) { + bool Ro = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_N == 0) { + if (lD != Ro) { + Base::V = ~Base::V; + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, lD); + } + } else { + if (_AP_N == 1 && sign != Ro) { + Base::V = ~Base::V; + } else if (_AP_N > 1) { + bool lNo = _AP_ROOT_op_get_bit(Base::V, _AP_W - _AP_N); + if (lNo == sign) Base::V = ~Base::V; + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + } + } else { + if (_AP_S) { + if (overflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + Base::V = ~Base::V; + } else if (underflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + if (_AP_O == AP_SAT_SYM) Base::V |= 1; + } + } else { + if (overflow) + Base::V = ~(ap_int_base<_AP_W, false>(0).V); + else if (underflow) + Base::V = 0; + } + } + } + + INLINE bool quantization_adjust(bool qb, bool r, bool s) { + bool carry = (bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_Q == AP_TRN) return false; + if (_AP_Q == AP_RND_ZERO) + qb &= s || r; + else if (_AP_Q == AP_RND_MIN_INF) + qb &= r; + else if (_AP_Q == AP_RND_INF) + qb &= !s || r; + else if (_AP_Q == AP_RND_CONV) + qb &= _AP_ROOT_op_get_bit(Base::V, 0) || r; + else if (_AP_Q == AP_TRN_ZERO) + qb = s && (qb || r); + Base::V += qb; + return carry && (!(bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } + // @} + + public: + /// @name constructors. + // @{ + /// default ctor. + INLINE ap_fixed_base() {} + + /// copy ctor. 
+ template + INLINE ap_fixed_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + INLINE ap_fixed_base( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + INLINE ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } + + template + INLINE ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } + +#ifndef __SYNTHESIS__ +#ifndef NON_C99STRING + INLINE ap_fixed_base(const char* s, signed char rd = 0) { + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(s, radix); // will guess rd, default 10 + _AP_ERROR(radix == 0, "ap_fixed_base(const char* \"%s\", %d), str=%s, radix = %d", + s, rd, str.c_str(), radix); // TODO remove this check + fromString(str, radix); + } +#else + INLINE ap_fixed_base(const char* s, signed char rd = 10) { + ap_int_base<_AP_W, _AP_S> t(s, rd); + Base::V = t.V; + } +#endif // ifndef NON_C99STRING +#else // ifndef __SYNTHESIS__ + // XXX _ssdm_string2bits only takes const string and const radix. + // It seems XFORM will do compile time processing of the string. + INLINE ap_fixed_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, _AP_C99); + Base::V = t; + } + INLINE ap_fixed_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, _AP_C99); + Base::V = t; + } +#endif // ifndef __SYNTHESIS__ else + + template + INLINE ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + *this = ((bool)op); + report(); + } + + template + INLINE ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + template + INLINE ap_fixed_base( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) { + *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op)); + report(); + } + + template + INLINE ap_fixed_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (bool(op)); + report(); + } + + template + INLINE ap_fixed_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + // ctors from c types. + // make a temp ap_fixed_base first, and use ap_fixed_base.operator= +#define CTOR_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_fixed_base(const C_TYPE x) { \ + ap_fixed_base<(_AP_W2), (_AP_W2), (_AP_S2)> tmp; \ + tmp.V = x; \ + *this = tmp; \ + } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT +/* + * TODO: + *Theere used to be several funtions which were AP_WEAK. 
+ *Now they're all INLINE expect ap_fixed_base(double d) + *Maybe we can use '#pragma HLS inline' instead of INLINE. + */ + AP_WEAK ap_fixed_base(double d) { + ap_int_base<64, false> ireg; + ireg.V = doubleToRawBits(d); + bool isneg = _AP_ROOT_op_get_bit(ireg.V, 63); + + ap_int_base exp; + ap_int_base exp_tmp; + exp_tmp.V = + _AP_ROOT_op_get_range(ireg.V, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1); + exp = exp_tmp - DOUBLE_BIAS; + ap_int_base man; + man.V = _AP_ROOT_op_get_range(ireg.V, 0, DOUBLE_MAN - 1); + // do not support NaN + _AP_WARNING(exp == APFX_IEEE_DOUBLE_E_MAX + 1 && man.V != 0, + "assign NaN to fixed point value"); + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + if (isneg) man = -man; + if ((ireg.V & 0x7fffffffffffffffLL) == 0) { + Base::V = 0; + } else { + int _AP_W2 = DOUBLE_MAN + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2; + bool _AP_S2 = true, + QUAN_INC = F2 > _AP_F && + !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + bool carry = false; + // handle quantization + unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2; + if (F2 == _AP_F) + Base::V = man.V; + else if (F2 > _AP_F) { + if (sh_amt < DOUBLE_MAN + 2) + Base::V = man.V >> sh_amt; + else { + Base::V = isneg ? -1 : 0; + } + if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) { + bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)_AP_ROOT_op_get_bit( + man.V, F2 - _AP_F - 1); + bool r = + (F2 > _AP_F + 1) + ? _AP_ROOT_op_get_range(man.V, 0, (F2 - _AP_F - 2 < _AP_W2) + ? (F2 - _AP_F - 2) + : (_AP_W2 - 1)) != 0 + : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { // no quantization + Base::V = man.V; + if (sh_amt < _AP_W) + Base::V = Base::V << sh_amt; + else + Base::V = 0; + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || + (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true; + bool neg_src = isneg; + bool lD = false; + int pos1 = F2 - _AP_F + _AP_W; + int pos2 = F2 - _AP_F + _AP_W + 1; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (pos1 < _AP_W2 && pos1 >= 0) + // lD = _AP_ROOT_op_get_bit(man.V, pos1); + lD = (man.V >> pos1) & 1; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base Range2; + ap_int_base all_ones(-1); + + if (pos2 >= 0 && pos2 < _AP_W2) { + // Range2.V = _AP_ROOT_op_get_range(man.V, + // pos2, _AP_W2 - 1); + Range2.V = man.V; + Range2.V >>= pos2; + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) + Range2_all_ones = false; + if (pos1 >= 0 && pos2 < _AP_W2) { + Range1_all_ones = Range2_all_ones && lD; + Range1_all_zeros = !Range2.V && !lD; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !man.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? 
_AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + overflow_adjust(underflow, overflow, lD, neg_src); + } + } + report(); + } + + // TODO more optimized implementation. + INLINE ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } + +#if _AP_ENABLE_HALF_ == 1 + // TODO more optimized implementation. + INLINE ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } +#endif + // @} + + /// @name assign operator + /// assign, using another ap_fixed_base of same template parameters. + /* + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + */ + + template + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + + const int _AP_F = _AP_W - _AP_I; + const int F2 = _AP_W2 - _AP_I2; + const int QUAN_INC = + F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + + if (!op) Base::V = 0; + bool carry = false; + bool signbit = _AP_ROOT_op_get_bit(op.V, _AP_W2 - 1); + bool isneg = signbit && _AP_S2; + if (F2 == _AP_F) + Base::V = op.V; + else if (F2 > _AP_F) { + unsigned int sh_amt = F2 - _AP_F; + // moves bits right, handle quantization. + if (sh_amt < _AP_W2) { + Base::V = op.V >> sh_amt; + } else { + Base::V = isneg ? -1 : 0; + } + if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { + bool qbit = _AP_ROOT_op_get_bit(op.V, F2 - _AP_F - 1); + // bit after LSB. + bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; + enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; + // bits after qb. + bool r = (F2 > _AP_F + 1) ? (_AP_ROOT_op_get_range(op.V, 0, hi) != 0) : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { + unsigned sh_amt = _AP_F - F2; + // moves bits left, no quantization + if (sh_amt < _AP_W) { + if (_AP_W > _AP_W2) { + // extend and then shift, avoid losing bits. + Base::V = op.V; + Base::V <<= sh_amt; + } else { + // shift and truncate. + Base::V = op.V << sh_amt; + } + } else { + Base::V = 0; + } + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry; + bool deleted_ones = true; + bool neg_src = isneg; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; + bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? _AP_ROOT_op_get_bit(op.V, pos1) + : false; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base<_AP_W2, false> all_ones(-1); + + if (pos2 < _AP_W2 && pos2 >= 0) { + ap_int_base<_AP_W2, false> Range2; + Range2.V = _AP_ROOT_op_get_range(op.V, pos2, _AP_W2 - 1); + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) { + Range2_all_ones = false; + } + + if (pos1 >= 0 && pos2 < _AP_W2) { + ap_int_base<_AP_W2, false> Range1; + Range1.V = _AP_ROOT_op_get_range(op.V, pos1, _AP_W2 - 1); + Range1_all_ones = Range1 == (all_ones >> pos1); + Range1_all_zeros = !Range1.V; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !op.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? 
Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + + overflow_adjust(underflow, overflow, lD, neg_src); + } + return *this; + } // operator= + + template + INLINE ap_fixed_base& operator=( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(const_cast&>(op)); + return *this; + } + + /// Set this ap_fixed_base with ULL. + INLINE ap_fixed_base& setBits(ap_ulong bv) { + // TODO when ull is not be long enough... + Base::V = bv; + return *this; + } + + /// Return a ap_fixed_base object whose this->V is assigned by bv. + static INLINE ap_fixed_base bitsToFixed(ap_ulong bv) { + // TODO fix when ull is not be long enough... + ap_fixed_base t; +#ifdef __SYNTHESIS__ + t.V = bv; +#else + t.V.set_bits(bv); +#endif + return t; + } + + // Explicit conversion functions to ap_int_base. + /** Captures all integer bits, in truncate mode. + * @param[in] Cnative follow conversion from double to int. + */ + INLINE ap_int_base to_ap_int_base( + bool Cnative = true) const { + ap_int_base ret; + if (_AP_I == 0) { + ret.V = 0; + } else if (_AP_I > 0 && _AP_I <= _AP_W) { + ret.V = _AP_ROOT_op_get_range(Base::V, _AP_W - _AP_I, _AP_W - 1); + } else if (_AP_I > _AP_W) { + ret.V = _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 1); + ret.V <<= (_AP_I - _AP_W); + } + /* Consider the following case + * float f = -7.5f; + * ap_fixed<8,4> t = f; // -8 0 0 0 . 0.5 + * int i = t.to_int(); + * the result should be -7 instead of -8. + * Therefore, after truncation, the value should be increated by 1. + * For (-1, 0), carry to MSB will happen, but result 0 is still correct. + */ + if (Cnative && _AP_I < _AP_W) { + // Follow C native data type, conversion from double to int + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1) && (_AP_I < _AP_W) && + (_AP_ROOT_op_get_range( + Base::V, 0, _AP_I < 0 ? _AP_W - 1 : _AP_W - _AP_I - 1) != 0)) + ++ret; + } else { + // Follow OSCI library, conversion from sc_fixed to sc_int + } + return ret; + }; + + public: + template + INLINE operator ap_int_base<_AP_W2, _AP_S2>() const { + return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base()); + } + + // Explicit conversion function to C built-in integral type. + INLINE char to_char() const { return to_ap_int_base().to_char(); } + + INLINE int to_int() const { return to_ap_int_base().to_int(); } + + INLINE unsigned to_uint() const { return to_ap_int_base().to_uint(); } + + INLINE ap_slong to_int64() const { return to_ap_int_base().to_int64(); } + + INLINE ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); } + + /// covert function to double. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE double to_double() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = DOUBLE_MAN + DOUBLE_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. 
+ else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + DOUBLE_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - DOUBLE_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ull is at least 64-bit + ap_ulong m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) + : (ap_ulong)(tmp.V << (1 - lsb_index)); + } else { + m = (ap_ulong)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + //std::cout << '\n' << std::hex << m << '\n'; // TODO delete this + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, DOUBLE_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + //std::cout << m << '\n'; // TODO delete this + m = _AP_ctype_op_set_range(m, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1, e); + //std::cout << std::hex << m << std::dec << std::endl; // TODO delete this + // cast to fp + return rawBitsToDouble(m); + } + + /// convert function to float. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE float to_float() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = FLOAT_MAN + FLOAT_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + FLOAT_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - FLOAT_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ul is at least 32-bit + unsigned long m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) + : (unsigned long)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned long)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, FLOAT_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + m = _AP_ctype_op_set_range(m, FLOAT_MAN, FLOAT_MAN + FLOAT_EXP - 1, e); + // cast to fp + return rawBitsToFloat(m); + } + +#if _AP_ENABLE_HALF_ == 1 + /// convert function to half. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE half to_half() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = HALF_MAN + HALF_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. 
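    // NOTE (editorial): with l leading zeros, the highest set bit sits at
    // position _AP_W - 1 - l and carries weight 2^(_AP_I - l - 1), which is
    // the unbiased exponent that HALF_BIAS is added to on the next line.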
+ int e = _AP_I - l - 1 + HALF_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - HALF_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // short is at least 16-bit + unsigned short m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1)) + : (unsigned short)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned short)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, HALF_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + m = _AP_ctype_op_set_range(m, HALF_MAN, HALF_MAN + HALF_EXP - 1, e); + // cast to fp + return rawBitsToHalf(m); + } +#endif + + // FIXME inherited from old code, this may loose precision! + INLINE operator long double() const { return (long double)to_double(); } + + INLINE operator double() const { return to_double(); } + + INLINE operator float() const { return to_float(); } + +#if _AP_ENABLE_HALF_ == 1 + INLINE operator half() const { return to_half(); } +#endif + + INLINE operator bool() const { return (bool)Base::V != 0; } + + INLINE operator char() const { return (char)to_int(); } + + INLINE operator signed char() const { return (signed char)to_int(); } + + INLINE operator unsigned char() const { return (unsigned char)to_uint(); } + + INLINE operator short() const { return (short)to_int(); } + + INLINE operator unsigned short() const { return (unsigned short)to_uint(); } + + INLINE operator int() const { return to_int(); } + + INLINE operator unsigned int() const { return to_uint(); } + +// FIXME don't assume data width... +#ifdef __x86_64__ + INLINE operator long() const { return (long)to_int64(); } + + INLINE operator unsigned long() const { return (unsigned long)to_uint64(); } +#else + INLINE operator long() const { return (long)to_int(); } + + INLINE operator unsigned long() const { return (unsigned long)to_uint(); } +#endif // ifdef __x86_64__ else + + INLINE operator ap_ulong() const { return to_uint64(); } + + INLINE operator ap_slong() const { return to_int64(); } + + INLINE int length() const { return _AP_W; }; + + // bits_to_int64 deleted. +#ifndef __SYNTHESIS__ + // Used in autowrap, when _AP_W < 64. + INLINE ap_ulong bits_to_uint64() const { + return (Base::V).to_uint64(); + } +#endif + + // Count the number of zeros from the most significant bit + // to the first one bit. Note this is only for ap_fixed_base whose + // _AP_W <= 64, otherwise will incur assertion. + INLINE int countLeadingZeros() { +#ifdef __SYNTHESIS__ + // TODO: used llvm.ctlz intrinsic ? + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1ULL); + t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); + return __builtin_ctz(t.V); + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); + return __builtin_ctzll(t.V); + } else { + enum {__N = (_AP_W + 63) / 64}; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 
0 : __builtin_clzll(t.V); + hitNonZero |= (t != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return Base::V.countLeadingZeros(); +#endif + } + + // Arithmetic : Binary + // ------------------------------------------------------------------------- + template + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r, t; + r.V = Base::V; + t.V = op2.V; + r.V *= op2.V; + return r; + } + + // multiply function deleted. + + template + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r; +#ifndef __SYNTHESIS__ + enum {F2 = _AP_W2-_AP_I2, + _W1=AP_MAX(_AP_W + AP_MAX(F2, 0) + ((_AP_S2 && !_AP_S) ? 1 : 0), _AP_W2 + ((_AP_S && !_AP_S2) ? 1 : 0))}; + ap_int_base<_W1,_AP_S||_AP_S2> dividend,divisior; + ap_int_base<_W1,_AP_S> tmp1; + ap_int_base<_W1,_AP_S2> tmp2; + tmp1.V = Base::V; + tmp1.V <<= AP_MAX(F2,0); + tmp2.V = op2.V; + dividend = tmp1; + divisior = tmp2; + r.V = ((_AP_S||_AP_S2) ? dividend.V.sdiv(divisior.V): dividend.V.udiv(divisior.V)); +#else + #ifndef __SC_COMPATIBLE__ + ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0),_AP_I, _AP_S> t(*this); + #else + ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0) + AP_MAX(_AP_I2, 0),_AP_I, _AP_S> t(*this); + #endif + r.V = t.V / op2.V; +#endif +/* + enum { + F2 = _AP_W2 - _AP_I2, + shl = AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), +#ifndef __SC_COMPATIBLE__ + shr = AP_MAX(_AP_I2, 0), +#else + shr = 0, +#endif + W3 = _AP_S2 + _AP_W + shl, + S3 = _AP_S || _AP_S2, + }; + ap_int_base dividend, t; + dividend.V = Base::V; + // multiply both by (1 << F2), and than do integer division. + dividend.V <<= (int) shl; +#ifdef __SYNTHESIS__ + // .V's have right signedness, and will have right extending. + t.V = dividend.V / op2.V; +#else + // XXX op2 may be wider than dividend, and sdiv and udiv takes the same with + // as left hand operand, so data might be truncated by mistake if not + // handled here. + t.V = S3 ? dividend.V.sdiv(op2.V) : dividend.V.udiv(op2.V); +#endif + r.V = t.V >> (int) shr; +*/ + return r; + } + +#define OP_BIN_AF(Sym, Rty) \ + template \ + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty operator Sym( \ + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ + op2) const { \ + typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty##_base ret, lhs(*this), \ + rhs(op2); \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + + OP_BIN_AF(+, plus) + OP_BIN_AF(-, minus) + OP_BIN_AF(&, logic) + OP_BIN_AF(|, logic) + OP_BIN_AF(^, logic) + +// Arithmetic : assign +// ------------------------------------------------------------------------- +#define OP_ASSIGN_AF(Sym) \ + template \ + INLINE ap_fixed_base& operator Sym##=( \ + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ + op2) { \ + *this = operator Sym(op2); \ + return *this; \ + } + + OP_ASSIGN_AF(*) + OP_ASSIGN_AF(/) + OP_ASSIGN_AF(+) + OP_ASSIGN_AF(-) + OP_ASSIGN_AF(&) + OP_ASSIGN_AF(|) + OP_ASSIGN_AF(^) + + // Prefix and postfix increment and decrement. 
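  // NOTE (editorial): the operators below step by an exact 1, not by one
  // LSB; the operand is a 1-integer-bit unsigned fixed-point value holding
  // 1, so for example:
  //
  //   ap_fixed<8, 3> v = 1.5;
  //   ++v;                        // v == 2.5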
+ // ------------------------------------------------------------------------- + + /// Prefix increment + INLINE ap_fixed_base& operator++() { + operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); + return *this; + } + + /// Prefix decrement. + INLINE ap_fixed_base& operator--() { + operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); + return *this; + } + + /// Postfix increment + INLINE const ap_fixed_base operator++(int) { + ap_fixed_base r(*this); + operator++(); + return r; + } + + /// Postfix decrement + INLINE const ap_fixed_base operator--(int) { + ap_fixed_base r(*this); + operator--(); + return r; + } + + // Unary arithmetic. + // ------------------------------------------------------------------------- + INLINE ap_fixed_base operator+() { return *this; } + + INLINE ap_fixed_base<_AP_W + 1, _AP_I + 1, true> operator-() const { + ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this); + r.V = -r.V; + return r; + } + + INLINE ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() { + ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this); + r.V = -r.V; + return r; + } + + // Not (!) + // ------------------------------------------------------------------------- + INLINE bool operator!() const { return Base::V == 0; } + + // Bitwise complement + // ------------------------------------------------------------------------- + // XXX different from Mentor's ac_fixed. + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const { + ap_fixed_base<_AP_W, _AP_I, _AP_S> r; + r.V = ~Base::V; + return r; + } + + // Shift + // ------------------------------------------------------------------------- + // left shift is the same as moving point right, i.e. increate I. + template + INLINE ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const { + ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r; + r.V = Base::V; + return r; + } + + template + INLINE ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const { + ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r; + r.V = Base::V; + return r; + } + + // Because the return type is the type of the the first operand, shift assign + // operators do not carry out any quantization or overflow + // While systemc, shift assigns for sc_fixed/sc_ufixed will result in + // quantization or overflow (depending on the mode of the first operand) + INLINE ap_fixed_base operator<<(unsigned int sh) const { + ap_fixed_base r; + r.V = Base::V << sh; +// TODO check shift overflow? +#ifdef __SC_COMPATIBLE__ + if (sh == 0) return r; + if (_AP_O != AP_WRAP || _AP_N != 0) { + bool neg_src = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + bool allones, allzeros; + ap_int_base<_AP_W, false> ones(-1); + if (sh <= _AP_W) { + ap_int_base<_AP_W, false> range1; + range1.V = _AP_ROOT_op_get_range( + const_cast(this)->Base::V, _AP_W - sh, _AP_W - 1); + allones = range1 == (ones >> (_AP_W - sh)); + allzeros = range1 == 0; + } else { + allones = false; + allzeros = Base::V == 0; + } + bool overflow = !allzeros && !neg_src; + bool underflow = !allones && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? _AP_ROOT_op_get_range(r.V, 0, _AP_W - 2) == 0 : true); + bool lD = false; + if (sh < _AP_W) lD = _AP_ROOT_op_get_bit(Base::V, _AP_W - sh - 1); + r.overflow_adjust(underflow, overflow, lD, neg_src); + } +#endif + return r; + } + + INLINE ap_fixed_base operator>>(unsigned int sh) const { + ap_fixed_base r; + r.V = Base::V >> sh; +// TODO check shift overflow? 
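    // NOTE (editorial): outside __SC_COMPATIBLE__ this is a plain
    // arithmetic shift; bits simply fall off the LSB end with no
    // quantization, e.g. for ap_fixed<8, 3> a = -0.75 (pattern 111.01000),
    // a >> 1 gives -0.375 (pattern 111.10100).  The block below restores
    // sc_fixed-style rounding only in SystemC-compatibility builds.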
+#ifdef __SC_COMPATIBLE__ + if (sh == 0) return r; + if (_AP_Q != AP_TRN) { + bool qb = false; + if (sh <= _AP_W) qb = _AP_ROOT_op_get_bit(Base::V, sh - 1); + bool rb = false; + if (sh > 1 && sh <= _AP_W) + rb = _AP_ROOT_op_get_range(const_cast(this)->Base::V, 0, + sh - 2) != 0; + else if (sh > _AP_W) + rb = Base::V != 0; + r.quantization_adjust(qb, rb, + _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } +#endif + return r; + } + + // left and right shift for int + INLINE ap_fixed_base operator<<(int sh) const { + ap_fixed_base r; + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator>>(ush); + } else { + return operator<<(ush); + } + } + + INLINE ap_fixed_base operator>>(int sh) const { + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator<<(ush); + } else { + return operator>>(ush); + } + } + + // left and right shift for ap_int. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { + // TODO the code seems not optimal. ap_fixed<8,8> << ap_int<2> needs only a + // small mux, but integer need a big one! + int sh = op2.to_int(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { + int sh = op2.to_int(); + return operator>>(sh); + } + + // left and right shift for ap_uint. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator>>(sh); + } + + // left and right shift for ap_fixed + template + INLINE ap_fixed_base operator<<( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator<<(op2.to_ap_int_base()); + } + + template + INLINE ap_fixed_base operator>>( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator>>(op2.to_ap_int_base()); + } + + // Shift assign. + // ------------------------------------------------------------------------- + + // left shift assign. + INLINE ap_fixed_base& operator<<=(const int sh) { + *this = operator<<(sh); + return *this; + } + + INLINE ap_fixed_base& operator<<=(const unsigned int sh) { + *this = operator<<(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + // right shift assign. + INLINE ap_fixed_base& operator>>=(const int sh) { + *this = operator>>(sh); + return *this; + } + + INLINE ap_fixed_base& operator>>=(const unsigned int sh) { + *this = operator>>(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator>>(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator>>(sh.to_int()); + return *this; + } + +// Comparisons. 
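// NOTE (editorial): the macro below first aligns both operands to the
// wider fractional format before comparing raw bits, so values compare by
// numeric value rather than by representation:
//
//   ap_fixed<8, 4>(1.5) == ap_fixed<8, 2>(1.5)   // true despite formats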
+// ------------------------------------------------------------------------- +#define OP_CMP_AF(Sym) \ + template \ + INLINE bool operator Sym(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, \ + _AP_O2, _AP_N2>& op2) const { \ + enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; \ + if (_AP_F == F2) \ + return Base::V Sym op2.V; \ + else if (_AP_F > F2) \ + return Base::V Sym ap_fixed_base(op2).V; \ + else \ + return ap_fixed_base(*this).V Sym op2.V; \ + return false; \ + } + + OP_CMP_AF(>) + OP_CMP_AF(<) + OP_CMP_AF(>=) + OP_CMP_AF(<=) + OP_CMP_AF(==) + OP_CMP_AF(!=) +// FIXME: Move compare with double out of struct ap_fixed_base defination +// and combine it with compare operator(double, ap_fixed_base) +#define DOUBLE_CMP_AF(Sym) \ + INLINE bool operator Sym(double d) const { return to_double() Sym d; } + + DOUBLE_CMP_AF(>) + DOUBLE_CMP_AF(<) + DOUBLE_CMP_AF(>=) + DOUBLE_CMP_AF(<=) + DOUBLE_CMP_AF(==) + DOUBLE_CMP_AF(!=) + + // Bit and Slice Select + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + unsigned index) { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + _AP_WARNING(index < 0, "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + INLINE bool operator[](unsigned index) const { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, index); + } + + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + unsigned index) { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + _AP_WARNING(index < 0, "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + INLINE bool bit(unsigned index) const { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + const ap_int_base<_AP_W2, true>& index) { + _AP_WARNING(index < _AP_I - _AP_W, + "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index.to_int() + _AP_W - _AP_I); + } + + INLINE bool get_bit(int index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index + _AP_W - _AP_I); + } +#if 0 + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + int index) { + _AP_WARNING(index < _AP_I - _AP_W, + "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index + _AP_W - _AP_I); + } +#endif + + template + 
INLINE bool get_bit(const ap_int_base<_AP_W2, true>& index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index.to_int() + _AP_W - _AP_I); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, + int Lo) { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + int Hi, int Lo) const { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + const_cast(this), Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE bool is_zero() const { return Base::V == 0; } + + INLINE bool is_neg() const { + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) return true; + return false; + } + + INLINE int wl() const { return _AP_W; } + + INLINE int iwl() const { return _AP_I; } + + INLINE ap_q_mode q_mode() const { return _AP_Q; } + + INLINE ap_o_mode o_mode() const { return _AP_O; } + + INLINE int n_bits() const { return _AP_N; } + + // print a string representation of this number in the given radix. + // Radix support is 2, 8, 10, or 16. + // The result will include a prefix indicating the radix, except for decimal, + // where no prefix is needed. The default is to output a signed representation + // of signed numbers, or an unsigned representation of unsigned numbers. For + // non-decimal formats, this can be changed by the 'sign' argument. 
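+  // Illustrative examples (added annotation, not from the original comment;
+  // values assume the default AP_TRN/AP_WRAP ap_fixed policies):
+  //   ap_fixed<8, 4>(2.75).to_string(10) is expected to yield "2.75", and
+  //   ap_fixed<8, 4>(2.75).to_string(16) something like "0x2.cp0" -- the
+  //   trailing "p0" exponent is appended below because C99 hexadecimal
+  //   constants require one.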
+#ifndef __SYNTHESIS__ + std::string to_string(unsigned char radix = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (radix == 2) sign = false; + + std::string str; + str.clear(); + char step = 0; + bool isNeg = sign && (Base::V < 0); + + // Extend to take care of the -MAX case. + ap_fixed_base<_AP_W + 1, _AP_I + 1> tmp(*this); + if (isNeg) { + tmp = -tmp; + str += '-'; + } + std::string prefix; + switch (radix) { + case 2: + prefix = "0b"; + step = 1; + break; + case 8: + prefix = "0o"; + step = 3; + break; + case 16: + prefix = "0x"; + step = 4; + break; + default: + break; + } + + if (_AP_I > 0) { + // Note we drop the quantization and rounding flags here. The + // integer part is always in range, and the fractional part we + // want to drop. Also, the number is always positive, because + // of the absolute value above. + ap_int_base int_part; + // [1] [ I ] d [ W - I ] + // | | | + // | W-I 0 + // W + int_part.V = _AP_ROOT_op_get_range( + tmp.V, _AP_W - _AP_I, _AP_W); + str += int_part.to_string(radix, false); + } else { + str += prefix; + str += '0'; + } + + ap_fixed_base frac_part = tmp; + + if (radix == 10) { + if (frac_part != 0) { + str += "."; + while (frac_part != 0) { + char digit = (frac_part * radix).to_char(); + str += static_cast(digit + '0'); + frac_part *= radix; + } + } + } else { + if (frac_part != 0) { + str += "."; + for (signed i = _AP_W - _AP_I - 1; i >= 0; i -= step) { + char digit = frac_part.range(i, AP_MAX(0, i - step + 1)).to_char(); + // If we have a partial bit pattern at the end, then we need + // to put it in the high-order bits of 'digit'. + int offset = AP_MIN(0, i - step + 1); + digit <<= -offset; + str += digit < 10 ? static_cast(digit + '0') + : static_cast(digit - 10 + 'a'); + } + if (radix == 16) + str += "p0"; // C99 Hex constants are required to have an exponent. + } + } + return str; + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(unsigned char radix = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_fixed_base. 
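+
+// Illustrative usage sketch (not part of the original header; assumes the
+// public ap_fixed wrapper from ap_fixed.h and the default AP_TRN/AP_WRAP
+// policies):
+//
+//   ap_fixed<8, 4> a = 2.25;              // stored as 0b0010.0100
+//   a = a << 1;                           // 4.5 -- the <8,4> format is kept
+//   a = a >> 3;                           // 0.5625 (= 0b0000.1001)
+//   bool lt = a < ap_fixed<16, 8>(0.75);  // true: OP_CMP_AF aligns the
+//                                         // fractional widths before comparing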
+ +template +INLINE void b_not( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + ret.V = ~op.V; +} + +template +INLINE void b_and( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V & op2.V; +} + +template +INLINE void b_or( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V | op2.V; +} + +template +INLINE void b_xor( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V ^ op2.V; +} + +template +INLINE void neg( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2, + _AP_N2> + t; + t.V = -op.V; + ret = t; +} + +template +INLINE void lshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F2 = _AP_W2 - _AP_I2, + _AP_I3 = AP_MAX(_AP_I, _AP_I2), + _AP_W3 = _AP_I3 + F2, + }; + // wide buffer + ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; + t.V = op.V; + t.V <<= i; // FIXME overflow? + // handle quantization and overflow + ret = t; +} + +template +INLINE void rshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + F3 = AP_MAX(F, F2), + _AP_W3 = _AP_I2 + F3, + sh = F - F2, + }; + // wide buffer + ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; + t.V = op.V; + if (sh >= 0) + t.V <<= (int) sh; + t.V >>= i; + // handle quantization and overflow + ret = t; +} + +//// FIXME +//// These partial specialization ctors allow code like +//// char c = 'a'; +//// ap_fixed_base<8, 8, true> x(c); +//// but what bout ap_fixed_base<9, 9, true> y(c) ? 
+// + +#ifndef __SYNTHESIS__ +INLINE std::string scientificFormat(std::string& input) { + if (input.length() == 0) return input; + + size_t decPosition = input.find('.'); + if (decPosition == std::string::npos) decPosition = input.length(); + + size_t firstNonZeroPos = 0; + for (; input[firstNonZeroPos] > '9' || input[firstNonZeroPos] < '1'; + firstNonZeroPos++) + ; + + int exp; + if (firstNonZeroPos > decPosition) + exp = decPosition - firstNonZeroPos; + else + exp = decPosition - firstNonZeroPos - 1; + std::string expString = ""; + if (exp == 0) + ; + else if (exp < 0) { + expString += "e-"; + exp = -exp; + } else + expString += "e+"; + + if (exp < 10 && exp > 0) { + expString += '0'; + expString += (char)('0' + exp); + } else if (exp != 0) { + std::string tmp; + + std::ostringstream oss; + oss << exp; + + tmp = oss.str(); + expString += tmp; + } + + int lastNonZeroPos = (int)(input.length() - 1); + for (; lastNonZeroPos >= 0; --lastNonZeroPos) + if (input[lastNonZeroPos] <= '9' && input[lastNonZeroPos] > '0') break; + + std::string ans = ""; + ans += input[firstNonZeroPos]; + if (firstNonZeroPos != (size_t)lastNonZeroPos) { + ans += '.'; + for (int i = firstNonZeroPos + 1; i <= lastNonZeroPos; i++) + if (input[i] != '.') ans += input[i]; + } + + ans += expString; + return ans; +} + +INLINE std::string reduceToPrecision(std::string& input, int precision) { + bool isZero = true; + size_t inputLen = input.length(); + for (size_t i = 0; i < inputLen && isZero; i++) + if (input[i] != '.' && input[i] != '0') isZero = false; + if (isZero) return "0"; + + // Find the first valid number, skip '-' + int FirstNonZeroPos = 0; + int LastNonZeroPos = (int)inputLen - 1; + int truncBitPosition = 0; + size_t decPosition = input.find('.'); + for (; input[FirstNonZeroPos] < '1' || input[FirstNonZeroPos] > '9'; + FirstNonZeroPos++) + ; + + for (; input[LastNonZeroPos] < '1' || input[LastNonZeroPos] > '9'; + LastNonZeroPos--) + ; + + if (decPosition == std::string::npos) decPosition = inputLen; + // Count the valid number, to decide whether we need to truncate + if ((int)decPosition > LastNonZeroPos) { + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision; + } else if ((int)decPosition < FirstNonZeroPos) { // This is pure decimal + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) { + if (FirstNonZeroPos - decPosition - 1 < 4) { + return input; + } else { + if (input[0] == '-') { + std::string tmp = input.substr(1, inputLen - 1); + return std::string("-") + scientificFormat(tmp); + } else + return scientificFormat(input); + } + } + truncBitPosition = FirstNonZeroPos + precision; + } else { + if (LastNonZeroPos - FirstNonZeroPos <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision + 1; + } + + // duplicate the input string, we want to add "0" before the valid numbers + // This is easy for quantization, since we may change 9999 to 10000 + std::string ans = ""; + std::string dupInput = "0"; + if (input[0] == '-') { + ans += '-'; + dupInput += input.substr(1, inputLen - 1); + } else { + dupInput += input.substr(0, inputLen); + ++truncBitPosition; + } + + // Add 'carry' after truncation, if necessary + bool carry = dupInput[truncBitPosition] > '4'; + for (int i = truncBitPosition - 1; i >= 0 && carry; i--) { + if (dupInput[i] == '.') continue; + if (dupInput[i] == '9') + dupInput[i] = '0'; + else { + ++dupInput[i]; + carry = false; + } + } + + // bits outside precision range should be set to 0 + if (dupInput[0] == '1') 
+    FirstNonZeroPos = 0;
+  else {
+    FirstNonZeroPos = 0;
+    while (dupInput[FirstNonZeroPos] < '1' || dupInput[FirstNonZeroPos] > '9')
+      ++FirstNonZeroPos;
+  }
+
+  unsigned it = FirstNonZeroPos;
+  int NValidNumber = 0;
+  while (it < dupInput.length()) {
+    if (dupInput[it] == '.') {
+      ++it;
+      continue;
+    }
+    ++NValidNumber;
+    if (NValidNumber > precision) dupInput[it] = '0';
+    ++it;
+  }
+
+  // Here we want to adjust the truncation position and the value
+  decPosition = dupInput.find('.');
+  if (decPosition == std::string::npos) // When this is integer
+    truncBitPosition = (int)dupInput.length();
+  else
+    for (truncBitPosition = (int)(dupInput.length() - 1); truncBitPosition >= 0;
+         --truncBitPosition) {
+      if (dupInput[truncBitPosition] == '.') break;
+      if (dupInput[truncBitPosition] != '0') {
+        truncBitPosition++;
+        break;
+      }
+    }
+
+  if (dupInput[0] == '1')
+    dupInput = dupInput.substr(0, truncBitPosition);
+  else
+    dupInput = dupInput.substr(1, truncBitPosition - 1);
+
+  decPosition = dupInput.find('.');
+  if (decPosition != std::string::npos) {
+    size_t it = 0;
+    for (it = decPosition + 1; dupInput[it] == '0'; it++)
+      ;
+    if (it - decPosition - 1 < 4) {
+      ans += dupInput;
+      return ans;
+    } else {
+      ans += scientificFormat(dupInput);
+      return ans;
+    }
+  } else if ((int)(dupInput.length()) <= precision) {
+    ans += dupInput;
+    return ans;
+  }
+
+  ans += scientificFormat(dupInput);
+  return ans;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void print(
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) {
+  if (_AP_I > 0) {
+    ap_int_base<_AP_I, _AP_S> p1;
+    p1.V = x.V >> (_AP_W - _AP_I);
+    print(p1.V); // print overload for .V should exist
+  } else {
+    printf("0");
+  }
+  printf(".");
+  if (_AP_I < _AP_W) {
+    ap_int_base<_AP_W - _AP_I, false> p2;
+    p2.V = _AP_ROOT_op_get_range(x.V, 0, _AP_W - _AP_I);
+    print(p2.V, false); // print overload for .V should exist
+  }
+}
+#endif // ifndef __SYNTHESIS__
+
+// XXX the following two functions have to exist in synthesis,
+// as some old HLS Video Library code uses the ostream overload,
+// although HLS will later delete I/O function calls.
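+// Illustrative behavior of the stream overloads below (assumed, not from the
+// original header): operator<< converts via to_string(10) and then applies
+// the stream's precision and width, so
+//   std::cout << std::setprecision(3) << ap_fixed<16, 8>(3.140625);
+// is expected to print "3.14".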
+ +/// Output streaming +//----------------------------------------------------------------------------- +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& out, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + // TODO support std::ios_base::fmtflags + unsigned width = out.width(); + unsigned precision = out.precision(); + char fill = out.fill(); + std::string str = x.to_string(10, _AP_S); + str = reduceToPrecision(str, precision); + if (width > str.length()) { + for (unsigned i = 0; i < width - str.length(); ++i) + out << fill; + } + out << str; + return out; +} +#endif // ifndef __SYNTHESIS__ + +/// Input streaming +// ----------------------------------------------------------------------------- +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>( + std::istream& in, + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + double d; + in >> d; + x = ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(d); + return in; +} +#endif +#endif // ifndef AP_AUTOCC + +/// Operators mixing Integers with ap_fixed_base +// ----------------------------------------------------------------------------- +#define AF_BIN_OP_WITH_INT_SF(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_BIN_OP_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } + +#define AF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_ASSIGN_OP_WITH_INT_SF(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define 
ALL_AF_OP_WITH_INT(C_TYPE, BITS, SIGN) \ + AF_BIN_OP_WITH_INT(+, C_TYPE, (BITS), (SIGN), plus) \ + AF_BIN_OP_WITH_INT(-, C_TYPE, (BITS), (SIGN), minus) \ + AF_BIN_OP_WITH_INT(*, C_TYPE, (BITS), (SIGN), mult) \ + AF_BIN_OP_WITH_INT(/, C_TYPE, (BITS), (SIGN), div) \ + AF_BIN_OP_WITH_INT(&, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(|, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(^, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT_SF(>>, C_TYPE, (BITS), (SIGN), lhs) \ + AF_BIN_OP_WITH_INT_SF(<<, C_TYPE, (BITS), (SIGN), lhs) \ + \ + AF_ASSIGN_OP_WITH_INT(+=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(-=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(*=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(/=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(&=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(|=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(^=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(>>=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(<<=, C_TYPE, (BITS), (SIGN)) \ + \ + AF_REL_OP_WITH_INT(>, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(>=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(==, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(!=, C_TYPE, (BITS), (SIGN)) + +ALL_AF_OP_WITH_INT(bool, 1, false) +ALL_AF_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_AF_OP_WITH_INT(signed char, 8, true) +ALL_AF_OP_WITH_INT(unsigned char, 8, false) +ALL_AF_OP_WITH_INT(short, _AP_SIZE_short, true) +ALL_AF_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_AF_OP_WITH_INT(int, _AP_SIZE_int, true) +ALL_AF_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_AF_OP_WITH_INT(long, _AP_SIZE_long, true) +ALL_AF_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_AF_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_AF_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ALL_AF_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT_SF +#undef AF_ASSIGN_OP_WITH_INT +#undef AF_ASSIGN_OP_WITH_INT_SF +#undef AF_REL_OP_WITH_INT + +/* + * ********************************************************************** + * TODO + * There is no operator defined with float/double/long double, so that + * code like + * ap_fixed<8,4> a = 1.5f; + * a += 0.5f; + * will fail in compilation. + * Operator with warning about conversion might be wanted. 
+ * ********************************************************************** + */ + +#define AF_BIN_OP_WITH_AP_INT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< \ + _AP_W, _AP_I, _AP_S>::RTYPE \ + operator BIN_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } \ + \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_AP_INT(ASSIGN_OP) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE ap_int_base<_AP_W2, _AP_S2>& operator ASSIGN_OP( \ + ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return i_op.operator ASSIGN_OP(op.to_ap_int_base()); \ + } + +AF_BIN_OP_WITH_AP_INT(+, plus) +AF_BIN_OP_WITH_AP_INT(-, minus) +AF_BIN_OP_WITH_AP_INT(*, mult) +AF_BIN_OP_WITH_AP_INT(/, div) +AF_BIN_OP_WITH_AP_INT(&, logic) +AF_BIN_OP_WITH_AP_INT(|, logic) +AF_BIN_OP_WITH_AP_INT(^, logic) + +#undef AF_BIN_OP_WITH_AP_INT + +AF_ASSIGN_OP_WITH_AP_INT(+=) +AF_ASSIGN_OP_WITH_AP_INT(-=) +AF_ASSIGN_OP_WITH_AP_INT(*=) +AF_ASSIGN_OP_WITH_AP_INT(/=) +AF_ASSIGN_OP_WITH_AP_INT(&=) +AF_ASSIGN_OP_WITH_AP_INT(|=) +AF_ASSIGN_OP_WITH_AP_INT(^=) + +#undef AF_ASSIGN_OP_WITH_AP_INT + +AF_REL_OP_WITH_AP_INT(==) +AF_REL_OP_WITH_AP_INT(!=) +AF_REL_OP_WITH_AP_INT(>) +AF_REL_OP_WITH_AP_INT(>=) +AF_REL_OP_WITH_AP_INT(<) +AF_REL_OP_WITH_AP_INT(<=) + +#undef AF_REL_OP_WITH_AP_INT + +// Relational Operators with double +template +INLINE bool operator==( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator==(op1); +} + +template +INLINE bool operator!=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator!=(op1); +} + +template +INLINE bool operator>( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<(op1); +} + +template +INLINE bool operator>=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<=(op1); +} + +template +INLINE bool operator<( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>(op1); +} + +template +INLINE bool operator<=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>=(op1); +} + +#endif // ifndef __cplusplus else + +#endif // ifndef __AP_FIXED_BASE_H__ else + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_ref.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_ref.h new file mode 100644 index 00000000..aefda0a6 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_ref.h @@ -0,0 +1,718 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_REF_H__ +#define __AP_FIXED_REF_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else +#ifndef __SYNTHESIS__ +#include +#endif +/// Proxy class, which allows bit selection to be used as both rvalue (for +/// reading) and lvalue (for writing) +template +struct af_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int d_index; + + public: + INLINE af_bit_ref( + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) { +#ifndef __SYNTHESIS__ + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.", + d_index); + _AP_WARNING(d_index >= _AP_W, "Index of bit vector (%d) out of range (%d).", + d_index, _AP_W); +#endif + } + + INLINE af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE af_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + /// convert operators. + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + /// @name assign operators + // @{ + INLINE af_bit_ref& operator=(bool val) { + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + return *this; + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. 
+ INLINE af_bit_ref& operator=(const af_bit_ref& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=(val != 0); + } + + template + INLINE af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); + } + // @} + + /// @name concatenate operators + // @{ + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + op); + } + + template + INLINE ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + /// @name comparison + // @{ + template + INLINE bool operator==( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() != op.get(); + } + // @} + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? 
"1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct af_bit_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/// Range (slice) reference. +template +struct af_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor + INLINE af_range_ref( + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// ctor from ap_fixed_base, higher and lower bound. + /** if h is less than l, the bits selected will be returned in reverse order. + */ + INLINE af_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + INLINE af_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + /// @name assign operators + // @{ + +#define ASSIGN_CTYPE_TO_AF_RANGE(DATA_TYPE) \ + INLINE af_range_ref& operator=(const DATA_TYPE val) { \ + ap_int_base<_AP_W, false> loc(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, loc.V); \ + return *this; \ + } + + ASSIGN_CTYPE_TO_AF_RANGE(bool) + ASSIGN_CTYPE_TO_AF_RANGE(char) + ASSIGN_CTYPE_TO_AF_RANGE(signed char) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned char) + ASSIGN_CTYPE_TO_AF_RANGE(short) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned short) + ASSIGN_CTYPE_TO_AF_RANGE(int) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned int) + ASSIGN_CTYPE_TO_AF_RANGE(long) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned long) + ASSIGN_CTYPE_TO_AF_RANGE(ap_slong) + ASSIGN_CTYPE_TO_AF_RANGE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_CTYPE_TO_AF_RANGE(half) +#endif + ASSIGN_CTYPE_TO_AF_RANGE(float) + ASSIGN_CTYPE_TO_AF_RANGE(double) +#undef ASSIGN_CTYPE_TO_AF_RANGE + + /// assgin using a string. XXX crucial for cosim. + INLINE af_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + // NOTE Base of other assgin operators. + template + INLINE af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + return *this; + } + + /// assign from range reference to ap_int_base. 
+  template <int _AP_W2, bool _AP_S2>
+  INLINE af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) {
+    const ap_int_base<_AP_W2, false> tmp(val);
+    return operator=(tmp);
+  }
+
+  /// assign from bit reference to ap_int_base.
+  template <int _AP_W2, bool _AP_S2>
+  INLINE af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) {
+    const ap_int_base<1, false> tmp((bool)val);
+    return operator=(tmp);
+  }
+
+  /// assign from ap_fixed_base.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE af_range_ref& operator=(
+      const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&
+          val) {
+    d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V);
+    return *this;
+  }
+
+  /// copy assign.
+  // XXX This has to be explicit, otherwise it will be deleted, as d_bv is
+  // of reference type.
+  INLINE af_range_ref& operator=(const af_range_ref& val) {
+    ap_int_base<_AP_W, false> tmp(val);
+    return operator=(tmp);
+  }
+
+  /// assign from range reference to ap_fixed_base.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE af_range_ref& operator=(
+      const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+    ap_int_base<_AP_W2, false> tmp(val);
+    return operator=(tmp);
+  }
+
+  /// assign from bit reference to ap_fixed_base.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE af_range_ref& operator=(
+      const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+    ap_int_base<1, false> tmp((bool)val);
+    return operator=(tmp);
+  }
+
+  /// assign from compound reference.
+  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+  INLINE af_range_ref& operator=(
+      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& val) {
+    const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val);
+    return operator=(tmp);
+  }
+  // @}
+
+  /// @name comparison operators with ap_range_ref.
+  // @{
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    ap_int_base<_AP_W, false> lop(*this);
+    ap_int_base<_AP_W2, false> rop(op2);
+    return lop == rop;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    return !(operator==(op2));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    ap_int_base<_AP_W, false> lop(*this);
+    ap_int_base<_AP_W2, false> rop(op2);
+    return lop < rop;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    ap_int_base<_AP_W, false> lop(*this);
+    ap_int_base<_AP_W2, false> rop(op2);
+    return lop > rop;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    return !(operator>(op2));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    return !(operator<(op2));
+  }
+  // @}
+
+  /// @name comparison operators with af_range_ref.
+ // @{ + template + INLINE bool operator==( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + INLINE bool operator!=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + INLINE bool operator>( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + INLINE bool operator<=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator>(op2)); + } + + template + INLINE bool operator>=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator<(op2)); + } + // @} + + /// @name concatenate operators. + /// @{ + /// concatenate with ap_int_base. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, op); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_concat_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(op)); + } + + /// concatenate with another af_range_ref. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &op) { + return ap_concat_ref< + _AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + /// concatenate with another af_bit_ref. 
+ template + INLINE + ap_concat_ref<_AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + _AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + INLINE operator ap_ulong() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_uint64(); + } + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE ap_int_base<_AP_W, false> to_ap_int_base() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + // used in ap_fixed_base::to_string() + INLINE char to_char() const { + return (char)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(rd); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char rd = 2) const { + return 0; + } +#endif +}; // struct af_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif +#endif // ifndef AP_AUTOCC + +#define AF_REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP bool(op); \ + } + +#define AF_REF_REL_OPS_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + AF_REF_REL_OP_WITH_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +AF_REF_REL_OPS_WITH_INT(bool, 1, false) +AF_REF_REL_OPS_WITH_INT(char, 8, CHAR_IS_SIGNED) +AF_REF_REL_OPS_WITH_INT(signed char, 8, true) +AF_REF_REL_OPS_WITH_INT(unsigned char, 8, false) +AF_REF_REL_OPS_WITH_INT(short, _AP_SIZE_short, true) +AF_REF_REL_OPS_WITH_INT(unsigned short, _AP_SIZE_short, false) +AF_REF_REL_OPS_WITH_INT(int, _AP_SIZE_int, true) +AF_REF_REL_OPS_WITH_INT(unsigned int, _AP_SIZE_int, false) +AF_REF_REL_OPS_WITH_INT(long, _AP_SIZE_long, true) +AF_REF_REL_OPS_WITH_INT(unsigned long, _AP_SIZE_long, false) +AF_REF_REL_OPS_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +AF_REF_REL_OPS_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef AF_REF_REL_OP_INT +#undef AF_REF_REL_OPS_WITH_INT + +#define AF_REF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S>& op2) { \ + return ap_int_base<_AP_W, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) {                                                   \
+    return op2 REL_OP ap_int_base<1, false>(op);                              \
+  }
+
+AF_REF_REL_OP_WITH_AP_INT(>)
+AF_REF_REL_OP_WITH_AP_INT(<)
+AF_REF_REL_OP_WITH_AP_INT(>=)
+AF_REF_REL_OP_WITH_AP_INT(<=)
+AF_REF_REL_OP_WITH_AP_INT(==)
+AF_REF_REL_OP_WITH_AP_INT(!=)
+
+#endif // ifndef __cplusplus
+
+#endif // ifndef __AP_FIXED_REF_H__
+
+// -*- cpp -*-
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_special.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_special.h
new file mode 100644
index 00000000..0f7a9f7e
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_special.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_FIXED_SPECIAL_H__
+#define __AP_FIXED_SPECIAL_H__
+
+#ifndef __AP_FIXED_H__
+#error "Only ap_fixed.h and ap_int.h can be included directly in user code."
+#endif
+
+#ifndef __SYNTHESIS__
+#include
+#include
+#endif
+// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of
+// include.
+// #include
+namespace std {
+template <typename _Tp> class complex;
+}
+
+/*
+  TODO: Modernize the code using C++11/C++14
+  1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html
+  2. move constructor
+*/
+
+namespace std {
+/*
+  Specialize std::complex so that it zero-initializes ap_fixed.
+
+  To reduce the area cost, ap_fixed is not zero initialized, just like the
+  basic types float or double. However, libstdc++ provides specializations for
+  float, double and long double that initialize the imaginary part to 0 when
+  it is not specified.
+
+  This has become a difficulty in switching legacy code from these C types to
+  ap_fixed. To ease the migration of legacy code, we have to implement
+  specialization of std::complex<> for our type.
+
+  As ap_fixed is a template, it is impossible to specialize only the methods
+  that cause default initialization of the value type in std::complex<>. An
+  explicit full specialization of the template class has to be done, covering
+  all the member functions and operators of std::complex<> as specified
+  in the standard, 26.2.4 and 26.2.5.
+*/
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+class complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> > {
+ public:
+  typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp;
+  typedef _Tp value_type;
+
+  // 26.2.4/1
+  // Constructor without argument
+  // Default initialize, so that in dataflow, the variable is only written once.
+  complex() : _M_real(_Tp()), _M_imag(_Tp()) {}
+  // Constructor with ap_fixed.
+  // Zero-initialize the imaginary part when it is not specified, so that
+  // `C(1) == C(1,0)`.
+  complex(const _Tp &__r, const _Tp &__i = _Tp(0))
+      : _M_real(__r), _M_imag(__i) {}
+
+  // Constructor with another complex number
+  template <typename _Up>
+  complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {}
+
+#if __cplusplus >= 201103L
+  const _Tp& real() const { return _M_real; }
+  const _Tp& imag() const { return _M_imag; }
+#else
+  _Tp& real() { return _M_real; }
+  const _Tp& real() const { return _M_real; }
+  _Tp& imag() { return _M_imag; }
+  const _Tp& imag() const { return _M_imag; }
+#endif
+
+  void real(_Tp __val) { _M_real = __val; }
+
+  void imag(_Tp __val) { _M_imag = __val; }
+
+  // Assign ap_fixed to this complex number.
+  // Zero-initialize the imaginary part, so that `C c; c = 1; c == C(1,0);`
+  complex<_Tp> &operator=(const _Tp __t) {
+    _M_real = __t;
+    _M_imag = _Tp(0);
+    return *this;
+  }
+
+  // 26.2.5/1
+  // Add ap_fixed to this complex number.
+  complex<_Tp> &operator+=(const _Tp &__t) {
+    _M_real += __t;
+    return *this;
+  }
+
+  // 26.2.5/3
+  // Subtract ap_fixed from this complex number.
+  complex<_Tp> &operator-=(const _Tp &__t) {
+    _M_real -= __t;
+    return *this;
+  }
+
+  // 26.2.5/5
+  // Multiply this complex number by ap_fixed.
+  complex<_Tp> &operator*=(const _Tp &__t) {
+    _M_real *= __t;
+    _M_imag *= __t;
+    return *this;
+  }
+
+  // 26.2.5/7
+  // Divide this complex number by ap_fixed.
+  complex<_Tp> &operator/=(const _Tp &__t) {
+    _M_real /= __t;
+    _M_imag /= __t;
+    return *this;
+  }
+
+  // Assign complex number to this complex number.
+  template <typename _Up>
+  complex<_Tp> &operator=(const complex<_Up> &__z) {
+    _M_real = __z.real();
+    _M_imag = __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/9
+  // Add complex number to this.
+  template <typename _Up>
+  complex<_Tp> &operator+=(const complex<_Up> &__z) {
+    _M_real += __z.real();
+    _M_imag += __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/11
+  // Subtract complex number from this.
+  template <typename _Up>
+  complex<_Tp> &operator-=(const complex<_Up> &__z) {
+    _M_real -= __z.real();
+    _M_imag -= __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/13
+  // Multiply this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator*=(const complex<_Up> &__z) {
+    const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag();
+    _M_imag = _M_real * __z.imag() + _M_imag * __z.real();
+    _M_real = __r;
+    return *this;
+  }
+
+  // 26.2.5/15
+  // Divide this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator/=(const complex<_Up> &__z) {
+    complex<_Tp> cj(__z.real(), -__z.imag());
+    complex<_Tp> a = (*this) * cj;
+    complex<_Tp> b = cj * __z;
+    _M_real = a.real() / b.real();
+    _M_imag = a.imag() / b.real();
+    return *this;
+  }
+
+ private:
+  _Tp _M_real;
+  _Tp _M_imag;
+
+}; // class complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> >
+
+/*
+  Non-member operations
+  These operations are not required by the standard in 26.2.6, but libstdc++
+  defines them for the float, double and long double specializations.
+*/
+// Compare complex number with ap_fixed.
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+inline bool operator==(
+    const complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> > &__x,
+    const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) {
+  return __x.real() == __y &&
+         __x.imag() == 0;
+}
+
+// Compare ap_fixed with complex number.
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+inline bool operator==(
+    const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x,
+    const complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> > &__y) {
+  return __x == __y.real() &&
+         0 == __y.imag();
+}
+
+// Compare complex number with ap_fixed.
+template +inline bool operator!=( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + +// Compare ap_fixed with complex number. +template +inline bool operator!=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} // namespace std + +#endif // ifndef __AP_FIXED_SPECIAL_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int.h new file mode 100644 index 00000000..db3044d4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int.h @@ -0,0 +1,330 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_H__ +#define __AP_INT_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Sign Arbitrary Precision Type. +template +struct ap_int : ap_int_base<_AP_W, true> { + typedef ap_int_base<_AP_W, true> Base; + // Constructor + INLINE ap_int() : Base() {} + + // Copy ctor + INLINE ap_int(const ap_int& op) { Base::V = op.V; } + + template + INLINE ap_int(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const 
ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_int(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_int(double val) : Base(val) {} + ap_int(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_int(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_int(const char* s) : Base(s) {} + + INLINE ap_int(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* ctor will be used when right is not of proper type. */ + + INLINE ap_int& operator=(const ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_int& operator=(const volatile ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_int<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_int<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_int. + +//--------------------------------------------------------------- + +/// Unsigned Arbitrary Precision Type. +template +struct ap_uint : ap_int_base<_AP_W, false> { + typedef ap_int_base<_AP_W, false> Base; + // Constructor + INLINE ap_uint() : Base() {} + + // Copy ctor + INLINE ap_uint(const ap_uint& op) { Base::V = op.V; } + + template + INLINE ap_uint(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_uint(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + 
CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_uint(double val) : Base(val) {} + ap_uint(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_uint(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_uint(const char* s) : Base(s) {} + + INLINE ap_uint(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* XXX ctor will be used when right is not of proper type. */ + + INLINE ap_uint& operator=(const ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_uint<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_uint<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_uint. + +#define ap_bigint ap_int +#define ap_biguint ap_uint + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_int<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_uint<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +#include + +#endif // ifndef __AP_INT_H__ else + +// FIXME user should include ap_fixed.h when using ap_fixed. +// to avoid circular inclusion, must check whether this is required by +// ap_fixed.h +#ifndef __AP_FIXED_H__ +#include +#endif + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_base.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_base.h new file mode 100644 index 00000000..091552a8 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_base.h @@ -0,0 +1,1885 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_BASE_H__ +#define __AP_INT_BASE_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" +#else + +#include +#ifndef __SYNTHESIS__ +#if _AP_ENABLE_HALF_ == 1 +#include +#endif +#include +#include +#endif + +/* ---------------------------------------------------------------- + * ap_int_base: AutoPilot integer/Arbitrary precision integer. + * ---------------------------------------------------------------- + */ + +/* helper trait. 
Selecting the smallest C type that can hold the value, + * return 64 bit C type if not possible. + */ +template +struct retval; + +// at least 64 bit +template +struct retval<_AP_N, true> { + typedef ap_slong Type; +}; + +template +struct retval<_AP_N, false> { + typedef ap_ulong Type; +}; + +// at least 8 bit +template <> +struct retval<1, true> { + typedef signed char Type; +}; + +template <> +struct retval<1, false> { + typedef unsigned char Type; +}; + +// at least 16 bit +template <> +struct retval<2, true> { + typedef short Type; +}; + +template <> +struct retval<2, false> { + typedef unsigned short Type; +}; + +// at least 32 bit +template <> +struct retval<3, true> { + typedef long Type; +}; + +template <> +struct retval<3, false> { + typedef unsigned long Type; +}; + +template <> +struct retval<4, true> { + typedef long Type; +}; + +template <> +struct retval<4, false> { + typedef unsigned long Type; +}; + +// trait for letting base class to return derived class. +// Notice that derived class template is incomplete, and we cannot use +// the member of the derived class. +template +struct _ap_int_factory; +template +struct _ap_int_factory<_AP_W2,true> { typedef ap_int<_AP_W2> type; }; +template +struct _ap_int_factory<_AP_W2,false> { typedef ap_uint<_AP_W2> type; }; + +template +struct ap_int_base : public _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + + /* ap_int_base<_AP_W, _AP_S, true> + * typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType; + * + * ap_int_base<_AP_W, _AP_S, false> + * typedef typename retval<8, _AP_S>::Type RetType; + */ + typedef typename retval::Type RetType; + + static const int width = _AP_W; + + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + + typedef ap_int_base mult_base; + typedef ap_int_base plus_base; + typedef ap_int_base minus_base; + typedef ap_int_base logic_base; + typedef ap_int_base div_base; + typedef ap_int_base mod_base; + typedef ap_int_base<_AP_W, _AP_S> arg1_base; + + typedef typename _ap_int_factory::type mult; + typedef typename _ap_int_factory::type plus; + typedef typename _ap_int_factory::type minus; + typedef typename _ap_int_factory::type logic; + typedef typename _ap_int_factory::type div; + typedef typename _ap_int_factory::type mod; + typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1; + typedef bool reduce; + }; + + /* Constructors. + * ---------------------------------------------------------------- + */ + /// default ctor + INLINE ap_int_base() { + /* + #ifdef __SC_COMPATIBLE__ + Base::V = 0; + #endif + */ + } + + /// copy ctor + template + INLINE ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + /// volatile copy ctor + template + INLINE ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + +// XXX C++11 feature. +// The explicit specifier specifies that a constructor or conversion function +// (since C++11) doesn't allow implicit conversions or copy-initialization. 
+// ap_int_base x = 1; +// ap_int_base foo() { return 1; } +// but allows +// ap_int_base x(1); +// ap_int_base y {1}; + +/// from all c types. +#define CTOR_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base(const Type op) { Base::V = op; } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT + +#if _AP_ENABLE_HALF_ == 1 + /// ctor from half. + // TODO optimize + INLINE ap_int_base(half op) { + ap_int_base<_AP_W, _AP_S> t((float)op); + Base::V = t.V; + } +#endif + + /// ctor from float. + INLINE ap_int_base(float op) { + const int BITS = FLOAT_MAN + FLOAT_EXP + 1; + ap_int_base reg; + reg.V = floatToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, FLOAT_MAN, BITS - 2); + exp = exp - FLOAT_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, FLOAT_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(FLOAT_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, FLOAT_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7ffffffful) == 0) { + Base::V = 0; + } else { + int sh_amt = FLOAT_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < FLOAT_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// ctor from double. + INLINE ap_int_base(double op) { + const int BITS = DOUBLE_MAN + DOUBLE_EXP + 1; + ap_int_base reg; + reg.V = doubleToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, DOUBLE_MAN, BITS - 2); + exp = exp - DOUBLE_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, DOUBLE_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(DOUBLE_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7fffffffffffffffull) == 0) { + Base::V = 0; + } else { + int sh_amt = DOUBLE_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < DOUBLE_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// from higer rank type. 
+ template + INLINE ap_int_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + } + + template + INLINE ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { + Base::V = (ref.get()).V; + } + + template + INLINE ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { + Base::V = ref.operator bool(); + } + + template + INLINE ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { + const ap_int_base::_AP_WR, + false> + tmp = ref.get(); + Base::V = tmp.V; + } + + /* radix has default value in set */ + +#ifndef __SYNTHESIS__ + INLINE ap_int_base(const char* s, signed char rd = 0) { + if (rd == 0) + rd = guess_radix(s); + unsigned int length = strlen(s); + Base::V.fromString(s, length, rd); + } +#else + // XXX __builtin_bit_from_string(...) requires const C string and radix. + INLINE ap_int_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } + INLINE ap_int_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } +#endif + + template + INLINE ap_int_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = (val.get()).V; + } + + template + INLINE ap_int_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = val.operator bool(); + } + + INLINE ap_int_base read() volatile { + /*AP_DEBUG(printf("call read %d\n", Base::V););*/ + ap_int_base ret; + ret.V = Base::V; + return ret; + } + + INLINE void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + /*AP_DEBUG(printf("call write %d\n", op2.V););*/ + Base::V = op2.V; + } + + /* Another form of "write".*/ + template + INLINE void operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=( + const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE ap_int_base& operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + +#define ASSIGN_OP_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base& operator=(Type op) { \ + Base::V = op; \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, _AP_SIZE_short, true) + ASSIGN_OP_FROM_INT(unsigned short, _AP_SIZE_short, false) + ASSIGN_OP_FROM_INT(int, _AP_SIZE_int, true) + ASSIGN_OP_FROM_INT(unsigned int, _AP_SIZE_int, false) + ASSIGN_OP_FROM_INT(long, _AP_SIZE_long, true) + ASSIGN_OP_FROM_INT(unsigned long, _AP_SIZE_long, false) + ASSIGN_OP_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + ASSIGN_OP_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) + 
+#undef ASSIGN_OP_FROM_INT + + template + INLINE ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (bool)op2; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (ap_int_base<_AP_W2, false>(op2)).V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { + Base::V = op2.get().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = (bool)op; + return *this; + } + + template + INLINE ap_int_base& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; + return *this; + } + + // FIXME: UG902 has clearly required user to use to_int() to convert to built-in + // types, but this implicit conversion is relied on in hls_cordic.h and hls_rsr.h. + // For example: + // int d_exp = fps_x.exp - fps_y.exp; + INLINE operator RetType() const { return (RetType)(Base::V); } + + /* Explicit conversions to C types. + * ---------------------------------------------------------------- + */ + INLINE bool to_bool() const { return (bool)(Base::V); } + INLINE char to_char() const { return (char)(Base::V); } + INLINE signed char to_schar() const { return (signed char)(Base::V); } + INLINE unsigned char to_uchar() const { return (unsigned char)(Base::V); } + INLINE short to_short() const { return (short)(Base::V); } + INLINE unsigned short to_ushort() const { return (unsigned short)(Base::V); } + INLINE int to_int() const { return (int)(Base::V); } + INLINE unsigned to_uint() const { return (unsigned)(Base::V); } + INLINE long to_long() const { return (long)(Base::V); } + INLINE unsigned long to_ulong() const { return (unsigned long)(Base::V); } + INLINE ap_slong to_int64() const { return (ap_slong)(Base::V); } + INLINE ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } + INLINE float to_float() const { return (float)(Base::V); } + INLINE double to_double() const { return (double)(Base::V); } + + // TODO decide if user-defined conversion should be provided. +#if 0 + INLINE operator char() const { return (char)(Base::V); } + INLINE operator signed char() const { return (signed char)(Base::V); } + INLINE operator unsigned char() const { return (unsigned char)(Base::V); } + INLINE operator short() const { return (short)(Base::V); } + INLINE operator unsigned short() const { return (unsigned short)(Base::V); } + INLINE operator int() const { return (int)(Base::V); } + INLINE operator unsigned int () const { return (unsigned)(Base::V); } + INLINE operator long () const { return (long)(Base::V); } + INLINE operator unsigned long () const { return (unsigned long)(Base::V); } + INLINE operator ap_slong () { return (ap_slong)(Base::V); } + INLINE operator ap_ulong () { return (ap_ulong)(Base::V); } +#endif + + /* Helper methods. + ---------------------------------------------------------------- + */ + /* we cannot call a non-volatile function on a volatile instance. + * but calling a volatile function is ok. + * XXX deleted non-volatile version. 
+ */ + INLINE int length() const volatile { return _AP_W; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool iszero() const { return Base::V == 0; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool is_zero() const { return Base::V == 0; } + + /* x < 0 */ + INLINE bool sign() const { + if (_AP_S && + _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) + return true; + else + return false; + } + + /* x[i] = 0 */ + INLINE void clear(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + } + + /* x[i] = !x[i]*/ + INLINE void invert(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + bool val = _AP_ROOT_op_get_bit(Base::V, i); + if (val) + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + else + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + INLINE bool test(int i) const { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + return _AP_ROOT_op_get_bit(Base::V, i); + } + + // Get self. For ap_concat_ref expansion. + INLINE ap_int_base& get() { return *this; } + + // Set the ith bit into 1 + INLINE void set(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + // Set the ith bit into v + INLINE void set(int i, bool v) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the left + INLINE ap_int_base& lrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << n; + typeof(Base::V) r_p = Base::V >> (_AP_W - n); + Base::V = l_p | r_p; +#else + Base::V.lrotate(n); +#endif + return *this; + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the right + INLINE ap_int_base& rrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << (_AP_W - n); + typeof(Base::V) r_p = Base::V >> n; + Base::V = l_p | r_p; +#else + Base::V.rrotate(n); +#endif + return *this; + } + + // Reverse the contents of ap_int_base instance. + // I.e. LSB becomes MSB and vise versa. + INLINE ap_int_base& reverse() { + Base::V = _AP_ROOT_op_get_range(Base::V, _AP_W - 1, 0); + return *this; + } + + // Set the ith bit into v + INLINE void set_bit(int i, bool v) { + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + return (bool)_AP_ROOT_op_get_bit(Base::V, i); + } + + // complements every bit + INLINE void b_not() { Base::V = ~Base::V; } + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + Base::V Sym op2.V; \ + return *this; \ + } + + /* Arithmetic assign. + * ---------------------------------------------------------------- + */ + OP_ASSIGN_AP(*=) + OP_ASSIGN_AP(+=) + OP_ASSIGN_AP(-=) + OP_ASSIGN_AP(/=) + OP_ASSIGN_AP(%=) +#undef OP_ASSIGN_AP + + /* Bitwise assign: and, or, xor. 
+ * ---------------------------------------------------------------- + */ +#define OP_ASSIGN_AP_CHK(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + _AP_WARNING((_AP_W != _AP_W2), \ + "Bitsize mismatch for ap_[u]int" #Sym "ap_[u]int."); \ + Base::V Sym op2.V; \ + return *this; \ + } + OP_ASSIGN_AP_CHK(&=) + OP_ASSIGN_AP_CHK(|=) + OP_ASSIGN_AP_CHK(^=) +#undef OP_ASSIGN_AP_CHK + + /* Prefix increment, decrement. + * ---------------------------------------------------------------- + */ + INLINE ap_int_base& operator++() { + operator+=((ap_int_base<1, false>)1); + return *this; + } + INLINE ap_int_base& operator--() { + operator-=((ap_int_base<1, false>)1); + return *this; + } + + /* Postfix increment, decrement + * ---------------------------------------------------------------- + */ + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { + ap_int_base t = *this; + operator+=((ap_int_base<1, false>)1); + return t; + } + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { + ap_int_base t = *this; + operator-=((ap_int_base<1, false>)1); + return t; + } + + /* Unary arithmetic. + * ---------------------------------------------------------------- + */ + INLINE typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } + + // TODO used to be W>64 only... need check. + INLINE typename RType<1, false>::minus operator-() const { + return ap_int_base<1, false>(0) - *this; + } + + /* Not (!) + * ---------------------------------------------------------------- + */ + INLINE bool operator!() const { return Base::V == 0; } + + /* Bitwise (arithmetic) unary: complement + ---------------------------------------------------------------- + */ + // XXX different from Mentor's ac_int! + INLINE typename RType<_AP_W,_AP_S>::arg1 operator~() const { + ap_int_base<_AP_W, _AP_S> r; + r.V = ~Base::V; + return r; + } + + /* Shift (result constrained by left operand). + * ---------------------------------------------------------------- + */ + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>(sh); + } else + return operator<<(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V << op2.to_uint(); + return r; + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<(sh); + } + return operator>>(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V >> op2.to_uint(); + return r; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. 
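+
+// Usage sketch (illustrative only; variable names and values are
+// hypothetical): the shift operators above keep the width of the left
+// operand, and a negative signed shift amount reverses the direction.
+//
+//   ap_uint<8> a = 0x0F;
+//   ap_int<4> n = -2;
+//   ap_uint<8> b = a << n;  // negative amount shifts right: b == 0x03
+//   a <<= 4;                // width stays 8, high bits drop: a == 0xF0
+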
+#if 0 + template + INLINE ap_int_base operator<<(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this << (op2.operator ap_int_base<_AP_W2, false>()); + } + + template + INLINE ap_int_base operator>>(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this >> (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Shift assign + * ---------------------------------------------------------------- + */ + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>=(sh); + } else + return operator<<=(sh); + } + + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { + Base::V <<= op2.to_uint(); + return *this; + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<=(sh); + } + return operator>>=(sh); + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { + Base::V >>= op2.to_uint(); + return *this; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. +#if 0 + template + INLINE ap_int_base& operator<<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this <<= (op2.operator ap_int_base<_AP_W2, false>()); + } + template + INLINE ap_int_base& operator>>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this >>= (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Equality and Relational. + * ---------------------------------------------------------------- + */ + template + INLINE bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V == op2.V; + } + template + INLINE bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return !(Base::V == op2.V); + } + template + INLINE bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V < op2.V; + } + template + INLINE bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V >= op2.V; + } + template + INLINE bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V > op2.V; + } + template + INLINE bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V <= op2.V; + } + + /* Bit and Part Select + * ---------------------------------------------------------------- + */ + INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. 
+ INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + +#if 0 + template + INLINE ap_int_base slice() const { + AP_ASSERT(Hi >= Lo && Hi < _AP_W && Lo < _AP_W, "Out of bounds in slice()"); + ap_int_base tmp ; + tmp.V = _AP_ROOT_op_get_range(Base::V, Lo, Hi); + return tmp; + } + + INLINE ap_bit_ref<_AP_W,_AP_S> operator [] ( unsigned int uindex) { + AP_ASSERT(uindex < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> bvh( this, uindex ); + return bvh; + } +#endif + + INLINE ap_bit_ref<_AP_W, _AP_S> operator[](int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + + template + INLINE ap_bit_ref<_AP_W, _AP_S> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool operator[](int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + template + INLINE bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); + return br.to_bool(); + } + + INLINE ap_bit_ref<_AP_W, _AP_S> bit(int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + template + INLINE ap_bit_ref<_AP_W, _AP_S> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + 
AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool bit(int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + + template + INLINE bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { + return bit(index.to_int()); + } + +#if 0 + template + INLINE bool operator[](_AP_T index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> br = operator[](index); + return br.to_bool(); + } +#endif + + // Count the number of zeros from the most significant bit + // to the first one bit. + INLINE int countLeadingZeros() { +#ifdef __SYNTHESIS__ + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1UL), x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctz(t.V); // count trailing zeros. + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctzll(t.V); // count trailing zeros. + } else { + enum { __N = (_AP_W + 63) / 64 }; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.V = _AP_ROOT_op_get_range(this->V, _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 0 : __builtin_clzll(t.V); // count leading zeros. + hitNonZero |= (t.V != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + enum { REST = (_AP_W - 1) % 64 }; + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, 0, REST); + t.V = _AP_ROOT_op_set_range(t.V, 63 - REST, 63, x.V); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return (Base::V).countLeadingZeros(); +#endif + } // countLeadingZeros + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(ap_int_base<_AP_W2, _AP_S2>& a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { + return 
ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + a2); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast&>( + a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); + } + + template + INLINE ap_int_base operator&( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this & a2.get(); + } + + template + INLINE ap_int_base operator|( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, 
_AP_T3>& a2) { + return *this | a2.get(); + } + + template + INLINE ap_int_base operator^( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this ^ a2.get(); + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + Base::V = val.V; + } + + /* Reduce operations. + * ---------------------------------------------------------------- + */ + // XXX non-const version deleted. + INLINE bool and_reduce() const { return _AP_ROOT_op_reduce(and, Base::V); } + INLINE bool nand_reduce() const { return _AP_ROOT_op_reduce(nand, Base::V); } + INLINE bool or_reduce() const { return _AP_ROOT_op_reduce(or, Base::V); } + INLINE bool nor_reduce() const { return !(_AP_ROOT_op_reduce(or, Base::V)); } + INLINE bool xor_reduce() const { return _AP_ROOT_op_reduce (xor, Base::V); } + INLINE bool xnor_reduce() const { + return !(_AP_ROOT_op_reduce (xor, Base::V)); + } + + /* Output as a string. + * ---------------------------------------------------------------- + */ +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (rd == 2) sign = false; + return (Base::V).to_string(rd, sign); + } +#else + INLINE char* to_string(signed char rd = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_int_base + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_int_base<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_int_base<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + const std::ios_base::fmtflags basefield = in.flags() & std::ios_base::basefield; + unsigned radix = (basefield == std::ios_base::dec) ? 0 : ( + (basefield == std::ios_base::oct) ? 8 : ( + (basefield == std::ios_base::hex) ? 16 : 0)); + op = ap_int_base<_AP_W, _AP_S>(str.c_str(), radix); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Operators with another ap_int_base. 
+ * ---------------------------------------------------------------- + */ +#define OP_BIN_AP(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base lhs(op); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base rhs(op2); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + +OP_BIN_AP(*, mult) +OP_BIN_AP(+, plus) +OP_BIN_AP(-, minus) +OP_BIN_AP(&, logic) +OP_BIN_AP(|, logic) +OP_BIN_AP(^, logic) + +#define OP_BIN_AP2(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = op.V Sym op2.V; \ + return ret; \ + } + +OP_BIN_AP2(/, div) +OP_BIN_AP2(%, mod) + +// shift operators are defined inside class. +// compound assignment operators are defined inside class. + +/* Operators with a pointer type. + * ---------------------------------------------------------------- + * char a[100]; + * char* ptr = a; + * ap_int<2> n = 3; + * char* ptr2 = ptr + n*2; + * avoid ambiguous errors. + */ +#define OP_BIN_WITH_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_WITH_PTR(+) +OP_BIN_WITH_PTR(-) + +/* Operators with a native floating point types. + * ---------------------------------------------------------------- + */ +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_WITH_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define ALL_OP_WITH_FLOAT(C_TYPE) \ + OP_BIN_WITH_FLOAT(*, C_TYPE) \ + OP_BIN_WITH_FLOAT(/, C_TYPE) \ + OP_BIN_WITH_FLOAT(+, C_TYPE) \ + OP_BIN_WITH_FLOAT(-, C_TYPE) + +#if _AP_ENABLE_HALF_ == 1 +ALL_OP_WITH_FLOAT(half) +#endif +ALL_OP_WITH_FLOAT(float) +ALL_OP_WITH_FLOAT(double) + +// TODO no shift? + +/* Operators with a native integral types. + * ---------------------------------------------------------------- + */ +// arithmetic and bitwise operators. 
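+
+// Usage sketch (illustrative; names and values are hypothetical): the
+// overloads below first wrap the native C operand in an ap_int_base of its
+// natural width, so result widths follow the RType promotion rules.
+//
+//   ap_int<8> a = 100;
+//   auto p = a * 3;    // ap_int<8> * int: 8 + 32 = 40-bit product, p == 300
+//   auto s = a + 100;  // max(8, 32) + 1 = 33-bit sum, s == 200
+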
+#define OP_BIN_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_int_base<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(i_op) BIN_OP(op); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op BIN_OP ap_int_base<_AP_W2, _AP_S2>(i_op); \ + } + +#define ALL_OP_BIN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_WITH_INT(*, C_TYPE, _AP_W2, _AP_S2, mult) \ + OP_BIN_WITH_INT(+, C_TYPE, _AP_W2, _AP_S2, plus) \ + OP_BIN_WITH_INT(-, C_TYPE, _AP_W2, _AP_S2, minus) \ + OP_BIN_WITH_INT(/, C_TYPE, _AP_W2, _AP_S2, div) \ + OP_BIN_WITH_INT(%, C_TYPE, _AP_W2, _AP_S2, mod) \ + OP_BIN_WITH_INT(&, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(|, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(^, C_TYPE, _AP_W2, _AP_S2, logic) + +ALL_OP_BIN_WITH_INT(bool, 1, false) +ALL_OP_BIN_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_BIN_WITH_INT(signed char, 8, true) +ALL_OP_BIN_WITH_INT(unsigned char, 8, false) +ALL_OP_BIN_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_BIN_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_BIN_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_BIN_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_BIN_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_BIN_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_BIN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_BIN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_BIN_WITH_INT +#undef ALL_OP_BIN_WITH_INT + +// shift operators. +#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); \ + else \ + r.V = op.V << op2; \ + return r; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); \
+    else \
+      r.V = op.V >> op2; \
+    return r; \
+  }
+
+ALL_OP_SHIFT_WITH_INT(char, 8, CHAR_IS_SIGNED)
+ALL_OP_SHIFT_WITH_INT(signed char, 8, true)
+ALL_OP_SHIFT_WITH_INT(short, _AP_SIZE_short, true)
+ALL_OP_SHIFT_WITH_INT(int, _AP_SIZE_int, true)
+ALL_OP_SHIFT_WITH_INT(long, _AP_SIZE_long, true)
+ALL_OP_SHIFT_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true)
+
+#undef ALL_OP_SHIFT_WITH_INT
+
+#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W, _AP_S>::arg1 operator<<( \
+      const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \
+    ap_int_base<_AP_W, _AP_S> r; \
+    r.V = op.V << op2; \
+    return r; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W, _AP_S>::arg1 operator>>( \
+      const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \
+    ap_int_base<_AP_W, _AP_S> r; \
+    r.V = op.V >> op2; \
+    return r; \
+  }
+ALL_OP_SHIFT_WITH_INT(bool, 1, false)
+ALL_OP_SHIFT_WITH_INT(unsigned char, 8, false)
+ALL_OP_SHIFT_WITH_INT(unsigned short, _AP_SIZE_short, false)
+ALL_OP_SHIFT_WITH_INT(unsigned int, _AP_SIZE_int, false)
+ALL_OP_SHIFT_WITH_INT(unsigned long, _AP_SIZE_long, false)
+ALL_OP_SHIFT_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+
+#undef ALL_OP_SHIFT_WITH_INT
+
+// compound assign operators.
+#define OP_ASSIGN_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE ap_int_base<_AP_W, _AP_S>& operator ASSIGN_OP( \
+      ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \
+    return op ASSIGN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \
+  }
+
+// TODO int a; ap_int<16> b; a += b;
+
+#define ALL_OP_ASSIGN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(+=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(-=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(*=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(/=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(%=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(&=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(|=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(^=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(>>=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(<<=, C_TYPE, _AP_W2, _AP_S2)
+
+ALL_OP_ASSIGN_WITH_INT(bool, 1, false)
+ALL_OP_ASSIGN_WITH_INT(char, 8, CHAR_IS_SIGNED)
+ALL_OP_ASSIGN_WITH_INT(signed char, 8, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned char, 8, false)
+ALL_OP_ASSIGN_WITH_INT(short, _AP_SIZE_short, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned short, _AP_SIZE_short, false)
+ALL_OP_ASSIGN_WITH_INT(int, _AP_SIZE_int, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned int, _AP_SIZE_int, false)
+ALL_OP_ASSIGN_WITH_INT(long, _AP_SIZE_long, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned long, _AP_SIZE_long, false)
+ALL_OP_ASSIGN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true)
+ALL_OP_ASSIGN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+
+#undef OP_ASSIGN_WITH_INT
+#undef ALL_OP_ASSIGN_WITH_INT
+
+// equality and relational operators.
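+
+// Usage sketch (illustrative; names and values are hypothetical): the
+// relational overloads below promote the C operand to its natural width and
+// compare by value, independent of the stored bit pattern.
+//
+//   ap_int<4> x = -1;     // stored as 0xF
+//   bool eq = (x == -1);  // true: compared by value
+//   bool gt = (x > 0);    // false
+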
+#define OP_REL_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator REL_OP(C_TYPE i_op, \
+                              const ap_int_base<_AP_W, _AP_S>& op) { \
+    return ap_int_base<_AP_W2, _AP_S2>(i_op) REL_OP op; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator REL_OP(const ap_int_base<_AP_W, _AP_S>& op, \
+                              C_TYPE op2) { \
+    return op REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \
+  }
+
+#define ALL_OP_REL_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(>, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(<, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(>=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(<=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(==, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(!=, C_TYPE, _AP_W2, _AP_S2)
+
+ALL_OP_REL_WITH_INT(bool, 1, false)
+ALL_OP_REL_WITH_INT(char, 8, CHAR_IS_SIGNED)
+ALL_OP_REL_WITH_INT(signed char, 8, true)
+ALL_OP_REL_WITH_INT(unsigned char, 8, false)
+ALL_OP_REL_WITH_INT(short, _AP_SIZE_short, true)
+ALL_OP_REL_WITH_INT(unsigned short, _AP_SIZE_short, false)
+ALL_OP_REL_WITH_INT(int, _AP_SIZE_int, true)
+ALL_OP_REL_WITH_INT(unsigned int, _AP_SIZE_int, false)
+ALL_OP_REL_WITH_INT(long, _AP_SIZE_long, true)
+ALL_OP_REL_WITH_INT(unsigned long, _AP_SIZE_long, false)
+ALL_OP_REL_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true)
+ALL_OP_REL_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+
+#undef OP_REL_WITH_INT
+#undef ALL_OP_REL_WITH_INT
+
+#define OP_REL_WITH_DOUBLE_OR_FLOAT(Sym) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \
+                           double op2) { \
+    return op1.to_double() Sym op2; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(double op1, \
+                           const ap_int_base<_AP_W, _AP_S>& op2) { \
+    return op1 Sym op2.to_double(); \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \
+                           float op2) { \
+    return op1.to_double() Sym op2; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(float op1, \
+                           const ap_int_base<_AP_W, _AP_S>& op2) { \
+    return op1 Sym op2.to_double(); \
+  }
+ OP_REL_WITH_DOUBLE_OR_FLOAT(>)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(<)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(>=)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(<=)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(==)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(!=)
+
+#undef OP_REL_WITH_DOUBLE_OR_FLOAT
+
+
+/* Operators with ap_range_ref.
+ * ------------------------------------------------------------
+ */
+// arithmetic, bitwise and shift operators.
+#define OP_BIN_WITH_RANGE(BIN_OP, RTYPE) \
+  template <int _AP_W1, bool _AP_S1, int _AP_W2, bool _AP_S2> \
+  INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \
+                                                              _AP_S2>::RTYPE \
+  operator BIN_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \
+                  const ap_int_base<_AP_W2, _AP_S2>& op2) { \
+    return ap_int_base<_AP_W1, false>(op1) BIN_OP op2; \
+  } \
+  template <int _AP_W1, bool _AP_S1, int _AP_W2, bool _AP_S2> \
+  INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \
+                                                              _AP_S2>::RTYPE \
+  operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \
+                  const ap_range_ref<_AP_W2, _AP_S2>& op2) { \
+    return op1 BIN_OP ap_int_base<_AP_W2, false>(op2); \
+  }
+
+OP_BIN_WITH_RANGE(+, plus)
+OP_BIN_WITH_RANGE(-, minus)
+OP_BIN_WITH_RANGE(*, mult)
+OP_BIN_WITH_RANGE(/, div)
+OP_BIN_WITH_RANGE(%, mod)
+OP_BIN_WITH_RANGE(&, logic)
+OP_BIN_WITH_RANGE(|, logic)
+OP_BIN_WITH_RANGE(^, logic)
+OP_BIN_WITH_RANGE(>>, arg1)
+OP_BIN_WITH_RANGE(<<, arg1)
+
+#undef OP_BIN_WITH_RANGE
+
+// compound assignment operators.
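+
+// Usage sketch (illustrative; names and values are hypothetical): the
+// overloads below allow a part-select on either side of a compound
+// assignment; the effect is a read-modify-write of the selected bits:
+//
+//   ap_uint<16> r = 0x1234;
+//   ap_uint<8> lo = r(7, 0);  // read the low byte
+//   lo += 1;                  // update it
+//   r(7, 0) = lo;             // write it back: r == 0x1235
+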
+#define OP_ASSIGN_WITH_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<_AP_W2, false>(op2); \ + } \ + template \ + INLINE ap_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_range_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<_AP_W1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_RANGE(+=) +OP_ASSIGN_WITH_RANGE(-=) +OP_ASSIGN_WITH_RANGE(*=) +OP_ASSIGN_WITH_RANGE(/=) +OP_ASSIGN_WITH_RANGE(%=) +OP_ASSIGN_WITH_RANGE(&=) +OP_ASSIGN_WITH_RANGE(|=) +OP_ASSIGN_WITH_RANGE(^=) +OP_ASSIGN_WITH_RANGE(>>=) +OP_ASSIGN_WITH_RANGE(<<=) + +#undef OP_ASSIGN_WITH_RANGE + +// equality and relational operators +#define OP_REL_WITH_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_int_base<_AP_W2, false>()); \ + } + +OP_REL_WITH_RANGE(==) +OP_REL_WITH_RANGE(!=) +OP_REL_WITH_RANGE(>) +OP_REL_WITH_RANGE(>=) +OP_REL_WITH_RANGE(<) +OP_REL_WITH_RANGE(<=) + +#undef OP_REL_WITH_RANGE + +/* Operators with ap_bit_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +#define OP_BIN_WITH_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 BIN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) BIN_OP op2; \ + } + +OP_BIN_WITH_BIT(+, plus) +OP_BIN_WITH_BIT(-, minus) +OP_BIN_WITH_BIT(*, mult) +OP_BIN_WITH_BIT(/, div) +OP_BIN_WITH_BIT(%, mod) +OP_BIN_WITH_BIT(&, logic) +OP_BIN_WITH_BIT(|, logic) +OP_BIN_WITH_BIT(^, logic) +OP_BIN_WITH_BIT(>>, arg1) +OP_BIN_WITH_BIT(<<, arg1) + +#undef OP_BIN_WITH_BIT + +// compound assignment operators. +#define OP_ASSIGN_WITH_BIT(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE ap_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_bit_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_BIT(+=) +OP_ASSIGN_WITH_BIT(-=) +OP_ASSIGN_WITH_BIT(*=) +OP_ASSIGN_WITH_BIT(/=) +OP_ASSIGN_WITH_BIT(%=) +OP_ASSIGN_WITH_BIT(&=) +OP_ASSIGN_WITH_BIT(|=) +OP_ASSIGN_WITH_BIT(^=) +OP_ASSIGN_WITH_BIT(>>=) +OP_ASSIGN_WITH_BIT(<<=) + +#undef OP_ASSIGN_WITH_BIT + +// equality and relational operators. 
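+
+// Usage sketch (illustrative; names and values are hypothetical): a
+// single-bit reference from operator[] can be compared against an ap_int
+// value through the overloads below.
+//
+//   ap_uint<8> f = 0x81;
+//   ap_uint<1> one = 1;
+//   bool msb_set = (f[7] == one);  // true
+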
+#define OP_REL_WITH_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 REL_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) REL_OP op2; \ + } + +OP_REL_WITH_BIT(==) +OP_REL_WITH_BIT(!=) +OP_REL_WITH_BIT(>) +OP_REL_WITH_BIT(>=) +OP_REL_WITH_BIT(<) +OP_REL_WITH_BIT(<=) + +#undef OP_REL_WITH_BIT + + +/* Operators with ap_concat_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +// bitwise operators are defined in struct. +// TODO specify whether to define arithmetic and bitwise operators. +#if 0 +#define OP_BIN_WITH_CONCAT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 BIN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator BIN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() BIN_OP op2; \ + } + +OP_BIN_WITH_CONCAT(+, plus) +OP_BIN_WITH_CONCAT(-, minus) +OP_BIN_WITH_CONCAT(*, mult) +OP_BIN_WITH_CONCAT(/, div) +OP_BIN_WITH_CONCAT(%, mod) +OP_BIN_WITH_CONCAT(&, logic) +OP_BIN_WITH_CONCAT(|, logic) +OP_BIN_WITH_CONCAT(^, logic) +OP_BIN_WITH_CONCAT(>>, arg1) +OP_BIN_WITH_CONCAT(<<, arg1) + +#undef OP_BIN_WITH_CONCAT + +// compound assignment operators. +#define OP_ASSIGN_WITH_CONCAT(ASSIGN_OP) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator ASSIGN_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 ASSIGN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator ASSIGN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + ap_int_base<_AP_W1 + _AP_W2, false> tmp = op1.get(); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_CONCAT(+=) +OP_ASSIGN_WITH_CONCAT(-=) +OP_ASSIGN_WITH_CONCAT(*=) +OP_ASSIGN_WITH_CONCAT(/=) +OP_ASSIGN_WITH_CONCAT(%=) +OP_ASSIGN_WITH_CONCAT(&=) +OP_ASSIGN_WITH_CONCAT(|=) +OP_ASSIGN_WITH_CONCAT(^=) +OP_ASSIGN_WITH_CONCAT(>>=) +OP_ASSIGN_WITH_CONCAT(<<=) + +#undef OP_ASSIGN_WITH_CONCAT +#endif + +// equality and relational operators. 
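+
+// Usage sketch (illustrative; names and values are hypothetical): a
+// concatenation built with operator, compares directly against an ap_int
+// value through the overloads below.
+//
+//   ap_uint<4> hi = 0xA, lo = 0x5;
+//   bool match = ((hi, lo) == ap_uint<8>(0xA5));  // true
+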
+#define OP_REL_WITH_CONCAT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 REL_OP op2.get(); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() REL_OP op2; \ + } + +OP_REL_WITH_CONCAT(==) +OP_REL_WITH_CONCAT(!=) +OP_REL_WITH_CONCAT(>) +OP_REL_WITH_CONCAT(>=) +OP_REL_WITH_CONCAT(<) +OP_REL_WITH_CONCAT(<=) + +#undef OP_REL_WITH_CONCAT + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_BASE_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_ref.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_ref.h new file mode 100644 index 00000000..421f09fd --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_ref.h @@ -0,0 +1,1346 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_REF_H__ +#define __AP_INT_REF_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else + +#ifndef __SYNTHESIS__ +#include +#endif + +/* Concatination reference. + ---------------------------------------------------------------- +*/ +template +struct ap_concat_ref { + enum { + _AP_WR = _AP_W1 + _AP_W2, + }; + + _AP_T1& mbv1; + _AP_T2& mbv2; + + INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) + : mbv1(ref.mbv1), mbv2(ref.mbv2) {} + + INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} + + template + INLINE ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> Part1; + Part1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(Part1); + ap_int_base<_AP_W2, false> Part2; + Part2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(Part2); + return *this; + } + + // assign op from hls supported C integral types. 
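+  // (Hedged sketch of the assignment behaviour implemented below; names are
+  //  user-side assumptions: given `ap_uint<4> a, b;`, the statement
+  //  `(a, b) = 0xAB;` writes the high nibble 0xA into a and the low nibble
+  //  0xB into b, split according to each side's length().)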
+ // FIXME disabled to support legacy code directly assign from sc_signal + //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_concat_ref&>::type + //operator=(T val) { + // ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + // return operator=(tmpVal); + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_concat_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); \ + return operator=(tmpVal); \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + template + INLINE ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((const ap_int_base<_AP_W3, false>)(val)); + } + + template + INLINE ap_concat_ref& operator=( + const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& + val) { + return operator=(val.to_ap_int_base()); + } + + template + INLINE ap_concat_ref& operator=( + const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + INLINE operator ap_int_base<_AP_WR, false>() const { return get(); } + + INLINE operator ap_ulong() const { return get().to_uint64(); } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> > + operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + 
ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + // FIXME op's life does not seem long enough + ap_int_base<_AP_W3, _AP_S3> op(a2); + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > + operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > + operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast< + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> + &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE ap_int_base operator&( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() & a2; + } + + template + INLINE ap_int_base operator|( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() | a2; + } + + template + INLINE ap_int_base operator^( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() ^ a2; + } + +#if 0 + template + INLINE ap_int_base slice() { + ap_int_base<_AP_WR, false> bv = get(); + return bv.slice(); + } +#endif + + INLINE ap_int_base<_AP_WR, false> get() const { + ap_int_base<_AP_WR, false> tmpVal(0); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W2, false> v2(mbv2); + ap_int_base<_AP_W1, false> v1(mbv1); + tmpVal.V = _AP_ROOT_op_set_range(tmpVal.V, 0, W_ref2 - 1, v2.V); + tmpVal.V = + _AP_ROOT_op_set_range(tmpVal.V, W_ref2, W_ref1 + W_ref2 - 1, v1.V); + return tmpVal; + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> tmpVal1; + tmpVal1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(tmpVal1); + ap_int_base<_AP_W2, false> tmpVal2; + tmpVal2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(tmpVal2); + } + + INLINE int length() const { return mbv1.length() + mbv2.length(); } +}; // struct ap_concat_ref + +/* Range (slice) reference. + ---------------------------------------------------------------- +*/ +template +struct ap_range_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_range_ref. 
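+  // Hedged usage sketch: this proxy is what ap_int's range()/operator()
+  // return, and writes go through to the referenced word (names below are
+  // user-side assumptions):
+  //
+  //   ap_uint<8> x = 0xF0;
+  //   x.range(3, 0) = 0x5;            // write through the proxy, x == 0xF5
+  //   ap_uint<4> n = x.range(7, 4);   // read converts to ap_int_base
+  //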
+ typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + INLINE ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + INLINE ap_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) {} + + INLINE ap_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE operator ap_ulong() const { return to_uint64(); } + + /// @name assign operators + // @{ + + // FIXME disabled to work-around lagacy code assigning from sc_signal, + // which dependes on implicit type conversion. + // + // /// assign from hls supported C integral types. + // template + // INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_range_ref&>::type + // operator=(T val) { + // ap_int_base<_AP_W, false> tmp(val); + // d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + // return *this; + // } +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_range_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W, false> tmp(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + /// assign using string. XXX crucial for cosim. + INLINE ap_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + ap_int_base<_AP_W, false> tmp(val); + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// copy assign operator + // XXX Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_range_ref& operator=(const ap_range_ref& val) { + return operator=((const ap_int_base<_AP_W, false>)val); + } + + /// assign from range reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + return operator=(val.to_ap_int_base()); + } + + /// assign from range reference to ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_fixed_base. 
+ template + INLINE ap_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from compound reference. + template + INLINE ap_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); + } + // @} + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > + operator,(ap_int_base<_AP_W, _AP_S>& a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, + ap_int_base<_AP_W, _AP_S> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop == hop; + } + + template + INLINE bool 
operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop < hop; + } + + template + INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop <= hop; + } + + template + INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<=(op2)); + } + + template + INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= op2.V; + return *this; + }; + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE bool and_reduce() const { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret &= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool or_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret |= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool xor_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? 
h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret ^= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } +#ifndef __SYNTHESIS__ + std::string to_string(signed char radix = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(radix); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char radix = 2) const { + return 0; + } +#endif +}; // struct ap_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_range_ref<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_range_ref<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + op = ap_int_base<_AP_W, _AP_S>(str.c_str()); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Bit reference. + ---------------------------------------------------------------- +*/ +template +struct ap_bit_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_bit_ref. + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int d_index; + + public: + // copy ctor + INLINE ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + INLINE ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE ap_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + INLINE bool to_bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + // assign op from hls supported C integral types. + // FIXME disabled to support sc_signal. + // NOTE this used to be unsigned long long. 
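+  // (Hedged sketch, names assumed: the ASSIGN_WITH_CTYPE overloads below let
+  //  user code set one bit straight from a C value, e.g.
+  //      ap_uint<8> x = 0;
+  //      x[7] = 1;   // ap_bit_ref& operator=(int), sets the MSB
+  //  while the generic template version stays disabled as noted above.)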
+ //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_bit_ref&>::type + //operator=(T val) { + // d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + // return *this; + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) + +#undef ASSIGN_WITH_CTYPE + +#define ASSIGN_WITH_CTYPE_FP(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + bool tmp_val = val; \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index,tmp_val); \ + return *this; \ + } + +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE_FP(half) +#endif + ASSIGN_WITH_CTYPE_FP(float) + ASSIGN_WITH_CTYPE_FP(double) + +#undef ASSIGN_WITH_CTYPE_FP + + + template + INLINE ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(val.V != 0)); + } + + template + INLINE ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_int_base<_AP_W2, false>)val); + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_bit_ref& operator=(const ap_bit_ref& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, 
const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() != op.get(); + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE bool get() { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + operator=(val); + } + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? "1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct ap_bit_ref + +/* ap_range_ref with int. + * ------------------------------------------------------------ + */ +// equality and relational operators. 
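+// Hedged sketch of the comparisons generated below (user-side names):
+//
+//   ap_uint<8> x = 0x42;
+//   bool hi_ok  = (x.range(7, 4) == 4);   // range proxy vs. int
+//   bool bit_ok = (x[1] == true);         // bit proxy vs. bool
+//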
+#define REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP bool(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W + _AP_W1, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_1(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>=, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_1(bool, 1, false) +REF_REL_WITH_INT_1(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_1(signed char, 8, true) +REF_REL_WITH_INT_1(unsigned char, 8, false) +REF_REL_WITH_INT_1(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_1(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_1(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_1(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_1(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_1(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_1(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_1(ap_ulong, _AP_SIZE_ap_slong, false) + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_2(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(==, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(!=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_2(bool, 1, false) +REF_REL_WITH_INT_2(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_2(signed char, 8, true) +REF_REL_WITH_INT_2(unsigned char, 8, false) +REF_REL_WITH_INT_2(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_2(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_2(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_2(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_2(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_2(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_2(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_2(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_REL_OP_WITH_INT +#undef REF_REL_WITH_INT_1 +#undef REF_REL_WITH_INT_2 + +#define REF_BIN_OP_WITH_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, false>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + BIN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W2, _AP_S2>::template RType<_AP_W, \ + false>::RTYPE \ + operator BIN_OP(C_TYPE op2, const ap_range_ref<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + BIN_OP ap_int_base<_AP_W, false>(op); \ + } + +// arithmetic operators. 
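+// Hedged sketch: results follow ap_int_base's RType promotion rules, so
+// mixed arithmetic keeps full precision (names assumed):
+//
+//   ap_uint<8> x = 200;
+//   ap_int<10> s = x.range(7, 0) + 100;   // wide enough to hold 300
+//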
+#define REF_BIN_OP_WITH_INT_ARITH(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_ARITH(bool, 1, false) +REF_BIN_OP_WITH_INT_ARITH(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_ARITH(signed char, 8, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_ARITH(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_ARITH(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_ARITH(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_ARITH(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_ARITH(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_ARITH + +// bitwise and shift operators +#define REF_BIN_OP_WITH_INT_BITS(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_BITS(bool, 1, false) +REF_BIN_OP_WITH_INT_BITS(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_BITS(signed char, 8, true) +REF_BIN_OP_WITH_INT_BITS(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_BITS(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_BITS(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_BITS(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_BITS(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_BITS(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_BITS(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_BITS(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_BITS(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_BITS + +/* ap_range_ref with ap_range_ref + * ------------------------------------------------------------ + */ +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& lhs, \ + const ap_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return (lhs.operator ap_int_base<_AP_W, false>())BIN_OP( \ + rhs.operator ap_int_base<_AP_W2, false>()); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) + +/* ap_concat_ref with ap_concat_ref. + * ------------------------------------------------------------ + */ + +//************************************************************************ +// Implement +// ap_int_base = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). 
+// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. +//************************************************************************ + +#undef SYN_CONCAT_REF_BIN_OP + +#define SYN_CONCAT_REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< \ + _AP_RW1 + _AP_RW2, false>::RTYPE \ + operator BIN_OP( \ + const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, \ + const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { \ + return lhs.get() BIN_OP rhs.get(); \ + } + +SYN_CONCAT_REF_BIN_OP(+, plus) +SYN_CONCAT_REF_BIN_OP(-, minus) +SYN_CONCAT_REF_BIN_OP(*, mult) +SYN_CONCAT_REF_BIN_OP(/, div) +SYN_CONCAT_REF_BIN_OP(%, mod) +SYN_CONCAT_REF_BIN_OP(&, logic) +SYN_CONCAT_REF_BIN_OP(|, logic) +SYN_CONCAT_REF_BIN_OP(^, logic) +SYN_CONCAT_REF_BIN_OP(>>, arg1) +SYN_CONCAT_REF_BIN_OP(<<, arg1) + +#undef SYN_CONCAT_REF_BIN_OP + +#define CONCAT_OP_WITH_INT(C_TYPE, _AP_WI, _AP_SI) \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_int_base<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_int_base<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + if (_AP_S) { \ + ret <<= _AP_WI; \ + ret >>= _AP_WI; \ + } \ + ret |= val << _AP_W; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_range_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_range_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + const ap_bit_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, false> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + C_TYPE op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + 1, false> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op2); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W + _AP_W2; \ + val >>= _AP_W + _AP_W2; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + C_TYPE op1, const 
ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op1); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } + +CONCAT_OP_WITH_INT(bool, 1, false) +CONCAT_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +CONCAT_OP_WITH_INT(signed char, 8, true) +CONCAT_OP_WITH_INT(unsigned char, 8, false) +CONCAT_OP_WITH_INT(short, _AP_SIZE_short, true) +CONCAT_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +CONCAT_OP_WITH_INT(int, _AP_SIZE_int, true) +CONCAT_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +CONCAT_OP_WITH_INT(long, _AP_SIZE_long, true) +CONCAT_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +CONCAT_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +CONCAT_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef CONCAT_OP_WITH_INT + +#define CONCAT_SHIFT_WITH_INT(C_TYPE, OP) \ + template \ + INLINE ap_uint<_AP_W + _AP_W1> operator OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, C_TYPE rhs) { \ + return ap_uint<_AP_W + _AP_W1>(lhs).get() OP int(rhs); \ + } + +// FIXME int(rhs) may loose precision. + +CONCAT_SHIFT_WITH_INT(int, <<) +CONCAT_SHIFT_WITH_INT(unsigned int, <<) +CONCAT_SHIFT_WITH_INT(long, <<) +CONCAT_SHIFT_WITH_INT(unsigned long, <<) +CONCAT_SHIFT_WITH_INT(ap_slong, <<) +CONCAT_SHIFT_WITH_INT(ap_ulong, <<) + +CONCAT_SHIFT_WITH_INT(int, >>) +CONCAT_SHIFT_WITH_INT(unsigned int, >>) +CONCAT_SHIFT_WITH_INT(long, >>) +CONCAT_SHIFT_WITH_INT(unsigned long, >>) +CONCAT_SHIFT_WITH_INT(ap_slong, >>) +CONCAT_SHIFT_WITH_INT(ap_ulong, >>) + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_REF_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_special.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_special.h new file mode 100644 index 00000000..3afc6192 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_special.h @@ -0,0 +1,223 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_INT_SPECIAL_H__
+#define __AP_INT_SPECIAL_H__
+
+#ifndef __AP_INT_H__
+#error "Only ap_fixed.h and ap_int.h can be included directly in user code."
+#endif
+
+#ifndef __SYNTHESIS__
+#include
+#include
+#endif
+// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of
+// include.
+// #include <complex>
+namespace std {
+template <typename _Tp> class complex;
+}
+
+/*
+  TODO: Modernize the code using C++11/C++14
+  1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html
+  2. move constructor
+*/
+
+namespace std {
+/*
+   Specialize std::complex to zero-initialize ap_int.
+
+   To reduce the area cost, ap_int is not zero initialized, just like the
+   basic types float or double. However, libstdc++ provides specializations
+   for float, double and long double, initializing the imaginary part to 0
+   when not specified.
+
+   This has become a difficulty in switching legacy code from these C types to
+   ap_int. To ease the transition of legacy code, we have to implement a
+   specialization of std::complex<> for our type.
+
+   As ap_int is a template, it is impossible to specialize only the methods
+   that cause default initialization of the value type in std::complex<>. An
+   explicit full specialization of the template class has to be done, covering
+   all the member functions and operators of std::complex<> as specified
+   in standard sections 26.2.4 and 26.2.5.
+*/
+template <int _AP_W>
+class complex<ap_int<_AP_W> > {
+ public:
+  typedef ap_int<_AP_W> _Tp;
+  typedef _Tp value_type;
+
+  // 26.2.4/1
+  // Constructor without argument
+  // Default initialize, so that in dataflow, the variable is only written once.
+  complex() : _M_real(_Tp()), _M_imag(_Tp()) {}
+  // Constructor with ap_int.
+  // Zero initialize the imaginary part when not specified, so that
+  // `C(1) == C(1,0)`
+  complex(const _Tp &__r, const _Tp &__i = _Tp(0))
+      : _M_real(__r), _M_imag(__i) {}
+
+  // Constructor with another complex number
+  template <typename _Up>
+  complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {}
+
+#if __cplusplus >= 201103L
+  const _Tp& real() const { return _M_real; }
+  const _Tp& imag() const { return _M_imag; }
+#else
+  _Tp& real() { return _M_real; }
+  const _Tp& real() const { return _M_real; }
+  _Tp& imag() { return _M_imag; }
+  const _Tp& imag() const { return _M_imag; }
+#endif
+
+  void real(_Tp __val) { _M_real = __val; }
+
+  void imag(_Tp __val) { _M_imag = __val; }
+
+  // Assign this complex number with ap_int.
+  // Zero initialize the imaginary part, so that `C c; c = 1; c == C(1,0);`
+  complex<_Tp> &operator=(const _Tp __t) {
+    _M_real = __t;
+    _M_imag = _Tp(0);
+    return *this;
+  }
+
+  // 26.2.5/1
+  // Add ap_int to this complex number.
+  complex<_Tp> &operator+=(const _Tp &__t) {
+    _M_real += __t;
+    return *this;
+  }
+
+  // 26.2.5/3
+  // Subtract ap_int from this complex number.
+  complex<_Tp> &operator-=(const _Tp &__t) {
+    _M_real -= __t;
+    return *this;
+  }
+
+  // 26.2.5/5
+  // Multiply this complex number by ap_int.
+  complex<_Tp> &operator*=(const _Tp &__t) {
+    _M_real *= __t;
+    _M_imag *= __t;
+    return *this;
+  }
+
+  // 26.2.5/7
+  // Divide this complex number by ap_int.
+  complex<_Tp> &operator/=(const _Tp &__t) {
+    _M_real /= __t;
+    _M_imag /= __t;
+    return *this;
+  }
+
+  // Assign complex number to this complex number.
+  template <typename _Up>
+  complex<_Tp> &operator=(const complex<_Up> &__z) {
+    _M_real = __z.real();
+    _M_imag = __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/9
+  // Add complex number to this.
+  template <typename _Up>
+  complex<_Tp> &operator+=(const complex<_Up> &__z) {
+    _M_real += __z.real();
+    _M_imag += __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/11
+  // Subtract complex number from this.
+  template <typename _Up>
+  complex<_Tp> &operator-=(const complex<_Up> &__z) {
+    _M_real -= __z.real();
+    _M_imag -= __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/13
+  // Multiply this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator*=(const complex<_Up> &__z) {
+    const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag();
+    _M_imag = _M_real * __z.imag() + _M_imag * __z.real();
+    _M_real = __r;
+    return *this;
+  }
+
+  // 26.2.5/15
+  // Divide this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator/=(const complex<_Up> &__z) {
+    complex<_Tp> cj (__z.real(), -__z.imag());
+    complex<_Tp> a = (*this) * cj;
+    complex<_Tp> b = cj * __z;
+    _M_real = a.real() / b.real();
+    _M_imag = a.imag() / b.real();
+    return *this;
+  }
+
+ private:
+  _Tp _M_real;
+  _Tp _M_imag;
+
+}; // class complex<ap_int<_AP_W> >
+
+
+/*
+   Non-member operations
+   These operations are not required by the standard in 26.2.6, but libstdc++
+   defines them for the float, double and long double specializations.
+*/
+// Compare complex number with ap_int.
+template <int _AP_W>
+inline bool operator==(const complex<ap_int<_AP_W> > &__x, const ap_int<_AP_W> &__y) {
+  return __x.real() == __y &&
+         __x.imag() == 0;
+}
+
+// Compare ap_int with complex number.
+template <int _AP_W>
+inline bool operator==(const ap_int<_AP_W> &__x, const complex<ap_int<_AP_W> > &__y) {
+  return __x == __y.real() &&
+         0 == __y.imag();
+}
+
+// Compare complex number with ap_int.
+template <int _AP_W>
+inline bool operator!=(const complex<ap_int<_AP_W> > &__x, const ap_int<_AP_W> &__y) {
+  return __x.real() != __y ||
+         __x.imag() != 0;
+}
+
+// Compare ap_int with complex number.
+template <int _AP_W>
+inline bool operator!=(const ap_int<_AP_W> &__x, const complex<ap_int<_AP_W> > &__y) {
+  return __x != __y.real() ||
+         0 != __y.imag();
+}
+
+} // namespace std
+
+#endif // ifndef __AP_INT_SPECIAL_H__
+
+// -*- cpp -*-
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_shift_reg.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_shift_reg.h
new file mode 100644
index 00000000..94dba51e
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_shift_reg.h
@@ -0,0 +1,138 @@
+/*
+#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved.
+#-
+#- This file contains confidential and proprietary information
+#- of Xilinx, Inc. and is protected under U.S. and
+#- international copyright and other intellectual property
+#- laws.
+#-
+#- DISCLAIMER
+#- This disclaimer is not a license and does not grant any
+#- rights to the materials distributed herewith.
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __SIM_AP_SHIFT_REG_H__ +#define __SIM_AP_SHIFT_REG_H__ + + +/* + * This file contains a C++ model of shift register. + * It defines C level simulation model. 
+ */
+#ifndef __cplusplus
+#error C++ is required to include this header file
+#else
+
+#include <assert.h>
+
+//////////////////////////////////////////////
+// C level simulation model for ap_shift_reg
+//////////////////////////////////////////////
+template <typename __SHIFT_T__, unsigned int __SHIFT_DEPTH__ = 32>
+class ap_shift_reg
+{
+  public:
+    /// Constructors
+    ap_shift_reg() { }
+    ap_shift_reg(const char* name) { }
+    /// Destructor
+    virtual ~ap_shift_reg() { }
+
+  private:
+    /// Make copy constructor and assignment operator private
+    ap_shift_reg(const ap_shift_reg< __SHIFT_T__, __SHIFT_DEPTH__ >& shreg)
+    {
+        for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i)
+            Array[i] = shreg.Array[i];
+    }
+
+    ap_shift_reg& operator = (const ap_shift_reg< __SHIFT_T__,
+                                                  __SHIFT_DEPTH__ >& shreg)
+    {
+        for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i)
+            Array[i] = shreg.Array[i];
+        return *this;
+    }
+
+  public:
+    // Shift the queue, push to back and read from a given address.
+    __SHIFT_T__ shift(__SHIFT_T__ DataIn,
+                      unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true)
+    {
+        assert(Addr < __SHIFT_DEPTH__ &&
+               "Out-of-bound shift is found in ap_shift_reg.");
+        __SHIFT_T__ ret = Array[Addr];
+        if (Enable) {
+            for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i)
+                Array[i] = Array[i-1];
+            Array[0] = DataIn;
+        }
+        return ret;
+    }
+
+    // Read from a given address.
+    __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const
+    {
+        assert(Addr < __SHIFT_DEPTH__ &&
+               "Out-of-bound read is found in ap_shift_reg.");
+        return Array[Addr];
+    }
+
+  protected:
+    __SHIFT_T__ Array[__SHIFT_DEPTH__];
+};
+
+#endif //__cplusplus
+
+#endif //__SIM_AP_SHIFT_REG_H__
+
+
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/etc/ap_private.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/etc/ap_private.h
new file mode 100644
index 00000000..0c29a0ac
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/etc/ap_private.h
@@ -0,0 +1,7199 @@
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_PRIVATE_H__
+#define __AP_PRIVATE_H__
+
+// common macros and type declarations are now defined in ap_common.h, and
+// ap_private becomes part of it.
+#ifndef __AP_COMMON_H__
+#error "etc/ap_private.h cannot be included directly."
+#endif
+
+// forward declarations
+//template
+//class ap_private; // moved to ap_common.h
+template <int _AP_W, bool _AP_S>
+struct _private_range_ref;
+template <int _AP_W, bool _AP_S>
+struct _private_bit_ref;
+
+// TODO clean up this part.
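+// Hedged note: the helpers below (Hi_32/Lo_32, CountLeadingZeros_32/_64,
+// CountTrailingZeros_64, CountPopulation_64) back ap_private's multi-word
+// arithmetic and radix conversion. Expected semantics, for example:
+//
+//   CountLeadingZeros_32(0x00F000FF) == 8
+//   CountTrailingZeros_64(0x8ULL)    == 3
+//   CountPopulation_64(0xFFULL)      == 8
+//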
+#ifndef LLVM_SUPPORT_MATHEXTRAS_H +#define LLVM_SUPPORT_MATHEXTRAS_H + +#ifdef _MSC_VER +#if _MSC_VER <= 1500 +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif +#else +#include +#endif + +#ifndef INLINE +#define INLINE inline +// Enable to debug ap_int/ap_fixed +// #define INLINE __attribute__((weak)) +#endif + +// NOTE: The following support functions use the _32/_64 extensions instead of +// type overloading so that signed and unsigned integers can be used without +// ambiguity. +namespace AESL_std { +template +DataType INLINE min(DataType a, DataType b) { + return (a >= b) ? b : a; +} + +template +DataType INLINE max(DataType a, DataType b) { + return (a >= b) ? a : b; +} +} // namespace AESL_std + +// TODO clean up included headers. +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ap_private_ops { +/// Hi_32 - This function returns the high 32 bits of a 64 bit value. +static INLINE uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); +} + +/// Lo_32 - This function returns the low 32 bits of a 64 bit value. +static INLINE uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); +} + +template +INLINE bool isNegative(const ap_private<_AP_W, false>& a) { + return false; +} + +template +INLINE bool isNegative(const ap_private<_AP_W, true>& a) { + enum { + APINT_BITS_PER_WORD = 64, + _AP_N = (_AP_W + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD + }; + static const uint64_t sign_mask = 1ULL << ((_AP_W - 1) % APINT_BITS_PER_WORD); + return (sign_mask & a.get_pVal(_AP_N - 1)) != 0; +} + +/// CountLeadingZeros_32 - this function performs the platform optimal form of +/// counting the number of zeros from the most significant bit to the first one +/// bit. Ex. CountLeadingZeros_32(0x00F000FF) == 8. +/// Returns 32 if the word is zero. +static INLINE unsigned CountLeadingZeros_32(uint32_t Value) { + unsigned Count; // result +#if __GNUC__ >= 4 +// PowerPC is defined for __builtin_clz(0) +#if !defined(__ppc__) && !defined(__ppc64__) + if (Value == 0) return 32; +#endif + Count = __builtin_clz(Value); +#else + if (Value == 0) return 32; + Count = 0; + // bisecton method for count leading zeros + for (unsigned Shift = 32 >> 1; Shift; Shift >>= 1) { + uint32_t Tmp = (Value) >> (Shift); + if (Tmp) { + Value = Tmp; + } else { + Count |= Shift; + } + } +#endif + return Count; +} + +/// CountLeadingZeros_64 - This function performs the platform optimal form +/// of counting the number of zeros from the most significant bit to the first +/// one bit (64 bit edition.) +/// Returns 64 if the word is zero. 
+static INLINE unsigned CountLeadingZeros_64(uint64_t Value) {
+  unsigned Count; // result
+#if __GNUC__ >= 4
+// PowerPC is defined for __builtin_clzll(0)
+#if !defined(__ppc__) && !defined(__ppc64__)
+  if (!Value) return 64;
+#endif
+  Count = __builtin_clzll(Value);
+#else
+  if (sizeof(long) == sizeof(int64_t)) {
+    if (!Value) return 64;
+    Count = 0;
+    // bisection method for counting leading zeros
+    for (unsigned Shift = 64 >> 1; Shift; Shift >>= 1) {
+      uint64_t Tmp = (Value) >> (Shift);
+      if (Tmp) {
+        Value = Tmp;
+      } else {
+        Count |= Shift;
+      }
+    }
+  } else {
+    // get hi portion
+    uint32_t Hi = Hi_32(Value);
+
+    // if some bits in hi portion
+    if (Hi) {
+      // leading zeros in hi portion plus all bits in lo portion
+      Count = CountLeadingZeros_32(Hi);
+    } else {
+      // get lo portion
+      uint32_t Lo = Lo_32(Value);
+      // same as 32 bit value
+      Count = CountLeadingZeros_32(Lo) + 32;
+    }
+  }
+#endif
+  return Count;
+}
+
+/// CountTrailingZeros_64 - This function performs the platform optimal form
+/// of counting the number of zeros from the least significant bit to the first
+/// one bit (64 bit edition.)
+/// Returns 64 if the word is zero.
+static INLINE unsigned CountTrailingZeros_64(uint64_t Value) {
+#if __GNUC__ >= 4
+  return (Value != 0) ? __builtin_ctzll(Value) : 64;
+#else
+  static const unsigned Mod67Position[] = {
+      64, 0,  1,  39, 2,  15, 40, 23, 3,  12, 16, 59, 41, 19, 24, 54, 4,
+      64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5,  32,
+      65, 38, 14, 22, 11, 58, 18, 53, 63, 9,  61, 27, 29, 50, 43, 46, 31,
+      37, 21, 57, 52, 8,  26, 49, 45, 36, 56, 7,  48, 35, 6,  34, 33, 0};
+  return Mod67Position[(uint64_t)(-(int64_t)Value & (int64_t)Value) % 67];
+#endif
+}
+
+/// CountPopulation_64 - this function counts the number of set bits in a value
+/// (64 bit edition.)
+static INLINE unsigned CountPopulation_64(uint64_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value - (((Value) >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + (((v) >> 2) & 0x3333333333333333ULL); + v = (v + ((v) >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif +} + +static INLINE uint32_t countLeadingOnes_64(uint64_t __V, uint32_t skip) { + uint32_t Count = 0; + if (skip) (__V) <<= (skip); + while (__V && (__V & (1ULL << 63))) { + Count++; + (__V) <<= 1; + } + return Count; +} + +static INLINE std::string oct2Bin(char oct) { + switch (oct) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "000"; + } + case '1': { + return "001"; + } + case '2': { + return "010"; + } + case '3': { + return "011"; + } + case '4': { + return "100"; + } + case '5': { + return "101"; + } + case '6': { + return "110"; + } + case '7': { + return "111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE std::string hex2Bin(char hex) { + switch (hex) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "0000"; + } + case '1': { + return "0001"; + } + case '2': { + return "0010"; + } + case '3': { + return "0011"; + } + case '4': { + return "0100"; + } + case '5': { + return "0101"; + } + case '6': { + return "0110"; + } + case '7': { + return "0111"; + } + case '8': { + return "1000"; + } + case '9': { + return "1001"; + } + case 'A': + case 'a': { + return "1010"; + } + case 'B': + case 'b': { + return "1011"; + } + case 'C': + case 'c': { + return "1100"; + } + case 'D': + case 'd': { + return "1101"; + } + case 'E': + case 'e': { + return "1110"; + } + case 'F': + case 'f': { + return "1111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE uint32_t decode_digit(char cdigit, int radix) { + uint32_t digit = 0; + if (radix == 16) { +#define isxdigit(c) \ + (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); + if (isdigit(cdigit)) + digit = cdigit - '0'; + else if (cdigit >= 'a') + digit = cdigit - 'a' + 10; + else if (cdigit >= 'A') + digit = cdigit - 'A' + 10; + else + assert(0 && "huh? we shouldn't get here"); + } else if (isdigit(cdigit)) { + digit = cdigit - '0'; + } else { + assert(0 && "Invalid character in digit string"); + } +#undef isxdigit +#undef isdigit + return digit; +} + +// Determine the radix of "val". +static INLINE std::string parseString(const std::string& input, unsigned char& radix) { + size_t len = input.length(); + if (len == 0) { + if (radix == 0) radix = 10; + return input; + } + + size_t startPos = 0; + // Trim whitespace + while (input[startPos] == ' ' && startPos < len) startPos++; + while (input[len - 1] == ' ' && startPos < len) len--; + + std::string val = input.substr(startPos, len - startPos); + // std::cout << "val = " << val << "\n"; + len = val.length(); + startPos = 0; + + // If the length of the string is less than 2, then radix + // is decimal and there is no exponent. 
+ if (len < 2) { + if (radix == 0) radix = 10; + return val; + } + + bool isNegative = false; + std::string ans; + + // First check to see if we start with a sign indicator + if (val[0] == '-') { + ans = "-"; + ++startPos; + isNegative = true; + } else if (val[0] == '+') + ++startPos; + + if (len - startPos < 2) { + if (radix == 0) radix = 10; + return val; + } + + if (val.substr(startPos, 2) == "0x" || val.substr(startPos, 2) == "0X") { + // If we start with "0x", then the radix is hex. + radix = 16; + startPos += 2; + } else if (val.substr(startPos, 2) == "0b" || + val.substr(startPos, 2) == "0B") { + // If we start with "0b", then the radix is binary. + radix = 2; + startPos += 2; + } else if (val.substr(startPos, 2) == "0o" || + val.substr(startPos, 2) == "0O") { + // If we start with "0o", then the radix is octal. + radix = 8; + startPos += 2; + } else if (radix == 0) { + radix = 10; + } + + int exp = 0; + if (radix == 10) { + // If radix is decimal, then see if there is an + // exponent indicator. + size_t expPos = val.find('e'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('E'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + ans += val.substr(startPos, expPos - startPos); + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } else { + // Check for a binary exponent indicator. + size_t expPos = val.find('p'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('P'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + assert(startPos <= expPos); + // Convert to binary as we go. + for (size_t i = startPos; i < expPos; ++i) { + if (radix == 16) { + ans += hex2Bin(val[i]); + } else if (radix == 8) { + ans += oct2Bin(val[i]); + } else { // radix == 2 + ans += val[i]; + } + } + // End in binary + radix = 2; + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } + if (exp == 0) return ans; + + size_t decPos = ans.find('.'); + if (decPos == std::string::npos) decPos = ans.length(); + if ((int)decPos + exp >= (int)ans.length()) { + int i = decPos; + for (; i < (int)ans.length() - 1; ++i) ans[i] = ans[i + 1]; + for (; i < (int)ans.length(); ++i) ans[i] = '0'; + for (; i < (int)decPos + exp; ++i) ans += '0'; + return ans; + } else if ((int)decPos + exp < (int)isNegative) { + std::string dupAns = "0."; + if (ans[0] == '-') dupAns = "-0."; + for (int i = 0; i < isNegative - (int)decPos - exp; ++i) dupAns += '0'; + for (size_t i = isNegative; i < ans.length(); ++i) + if (ans[i] != '.') dupAns += ans[i]; + return dupAns; + } + + if (exp > 0) + for (size_t i = decPos; i < decPos + exp; ++i) ans[i] = ans[i + 1]; + else { + if (decPos == ans.length()) ans += ' '; + for (int i = decPos; i > (int)decPos + exp; --i) ans[i] = ans[i - 1]; + } + ans[decPos + exp] = '.'; + return ans; +} + +/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from +/// the multi-digit integer array, x[], propagating the borrowed 1 value until +/// no further borrowing is neeeded or it runs out of "digits" in x. 
The result +/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. +/// In other words, if y > x then this function returns 1, otherwise 0. +/// @returns the borrow out of the subtraction +static INLINE bool sub_1(uint64_t x[], uint32_t len, uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + uint64_t __X = x[i]; + x[i] -= y; + if (y > __X) + y = 1; // We have to "borrow 1" from next "digit" + else { + y = 0; // No need to borrow + break; // Remaining digits are unchanged so exit early + } + } + return (y != 0); +} + +/// add_1 - This function adds a single "digit" integer, y, to the multiple +/// "digit" integer array, x[]. x[] is modified to reflect the addition and +/// 1 is returned if there is a carry out, otherwise 0 is returned. +/// @returns the carry of the addition. +static INLINE bool add_1(uint64_t dest[], uint64_t x[], uint32_t len, + uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + dest[i] = y + x[i]; + if (dest[i] < y) + y = 1; // Carry one to next digit. + else { + y = 0; // No need to carry so exit early + break; + } + } + return (y != 0); +} + +/// add - This function adds the integer array x to the integer array Y and +/// places the result in dest. +/// @returns the carry out from the addition +/// @brief General addition of 64-bit integer arrays +static INLINE bool add(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool carry = false; + uint32_t len = AESL_std::min(xlen, ylen); + uint32_t i; + for (i = 0; i < len && i < destlen; ++i) { + uint64_t limit = + AESL_std::min(x[i], y[i]); // must come first in case dest == x + dest[i] = x[i] + y[i] + carry; + carry = dest[i] < limit || (carry && dest[i] == limit); + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t limit = AESL_std::min(x[i], yext); + dest[i] = x[i] + yext + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t limit = AESL_std::min(xext, y[i]); + dest[i] = xext + y[i] + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } + return carry; +} + +/// @returns returns the borrow out. +/// @brief Generalized subtraction of 64-bit integer arrays. +static INLINE bool sub(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool borrow = false; + uint32_t i; + uint32_t len = AESL_std::min(xlen, ylen); + for (i = 0; i < len && i < destlen; ++i) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = y[i] > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - y[i]; + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = yext > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - yext; + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t x_tmp = borrow ? 
xext - 1 : xext; + borrow = y[i] > x_tmp || (borrow && xext == 0); + dest[i] = x_tmp - y[i]; + } + } + return borrow; +} + +/// Subtracts the RHS ap_private from this ap_private +/// @returns this, after subtraction +/// @brief Subtraction assignment operator. + +/// Multiplies an integer array, x by a a uint64_t integer and places the result +/// into dest. +/// @returns the carry out of the multiplication. +/// @brief Multiply a multi-digit ap_private by a single digit (64-bit) integer. +static INLINE uint64_t mul_1(uint64_t dest[], const uint64_t x[], uint32_t len, + uint64_t y) { + // Split y into high 32-bit part (hy) and low 32-bit part (ly) + uint64_t ly = y & 0xffffffffULL, hy = (y) >> 32; + uint64_t carry = 0; + static const uint64_t two_power_32 = 1ULL << 32; + // For each digit of x. + for (uint32_t i = 0; i < len; ++i) { + // Split x into high and low words + uint64_t lx = x[i] & 0xffffffffULL; + uint64_t hx = (x[i]) >> 32; + // hasCarry - A flag to indicate if there is a carry to the next digit. + // hasCarry == 0, no carry + // hasCarry == 1, has carry + // hasCarry == 2, no carry and the calculation result == 0. + uint8_t hasCarry = 0; + dest[i] = carry + lx * ly; + // Determine if the add above introduces carry. + hasCarry = (dest[i] < carry) ? 1 : 0; + carry = hx * ly + ((dest[i]) >> 32) + (hasCarry ? two_power_32 : 0); + // The upper limit of carry can be (2^32 - 1)(2^32 - 1) + + // (2^32 - 1) + 2^32 = 2^64. + hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); + + carry += (lx * hy) & 0xffffffffULL; + dest[i] = ((carry) << 32) | (dest[i] & 0xffffffffULL); + carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? two_power_32 : 0) + + ((carry) >> 32) + ((lx * hy) >> 32) + hx * hy; + } + return carry; +} + +/// Multiplies integer array x by integer array y and stores the result into +/// the integer array dest. Note that dest's size must be >= xlen + ylen in +/// order to +/// do a full precision computation. If it is not, then only the low-order words +/// are returned. +/// @brief Generalized multiplicate of integer arrays. +static INLINE void mul(uint64_t dest[], const uint64_t x[], uint32_t xlen, + const uint64_t y[], uint32_t ylen, uint32_t destlen) { + assert(xlen > 0); + assert(ylen > 0); + assert(destlen >= xlen + ylen); + if (xlen < destlen) dest[xlen] = mul_1(dest, x, xlen, y[0]); + for (uint32_t i = 1; i < ylen; ++i) { + uint64_t ly = y[i] & 0xffffffffULL, hy = (y[i]) >> 32; + uint64_t carry = 0, lx = 0, hx = 0; + for (uint32_t j = 0; j < xlen; ++j) { + lx = x[j] & 0xffffffffULL; + hx = (x[j]) >> 32; + // hasCarry - A flag to indicate if has carry. + // hasCarry == 0, no carry + // hasCarry == 1, has carry + // hasCarry == 2, no carry and the calculation result == 0. + uint8_t hasCarry = 0; + uint64_t resul = carry + lx * ly; + hasCarry = (resul < carry) ? 1 : 0; + carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + ((resul) >> 32); + hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); + carry += (lx * hy) & 0xffffffffULL; + resul = ((carry) << 32) | (resul & 0xffffffffULL); + if (i + j < destlen) dest[i + j] += resul; + carry = + (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) + + ((carry) >> 32) + (dest[i + j] < resul ? 1 : 0) + ((lx * hy) >> 32) + + hx * hy; + } + if (i + xlen < destlen) dest[i + xlen] = carry; + } +} + +/// Implementation of Knuth's Algorithm D (Division of nonnegative integers) +/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The +/// variables here have the same names as in the algorithm. 
Comments explain +/// the algorithm and any deviation from it. +static INLINE void KnuthDiv(uint32_t* u, uint32_t* v, uint32_t* q, uint32_t* r, + uint32_t m, uint32_t n) { + assert(u && "Must provide dividend"); + assert(v && "Must provide divisor"); + assert(q && "Must provide quotient"); + assert(u != v && u != q && v != q && "Must us different memory"); + assert(n > 1 && "n must be > 1"); + + // Knuth uses the value b as the base of the number system. In our case b + // is 2^31 so we just set it to -1u. + uint64_t b = uint64_t(1) << 32; + + // DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n'); + // DEBUG(cerr << "KnuthDiv: original:"); + // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << + // u[i]); + // DEBUG(cerr << " by"); + // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << + // v[i-1]); + // DEBUG(cerr << '\n'); + // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of + // u and v by d. Note that we have taken Knuth's advice here to use a power + // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of + // 2 allows us to shift instead of multiply and it is easy to determine the + // shift amount from the leading zeros. We are basically normalizing the u + // and v so that its high bits are shifted to the top of v's range without + // overflow. Note that this can require an extra word in u so that u must + // be of length m+n+1. + uint32_t shift = CountLeadingZeros_32(v[n - 1]); + uint32_t v_carry = 0; + uint32_t u_carry = 0; + if (shift) { + for (uint32_t i = 0; i < m + n; ++i) { + uint32_t u_tmp = (u[i]) >> (32 - shift); + u[i] = ((u[i]) << (shift)) | u_carry; + u_carry = u_tmp; + } + for (uint32_t i = 0; i < n; ++i) { + uint32_t v_tmp = (v[i]) >> (32 - shift); + v[i] = ((v[i]) << (shift)) | v_carry; + v_carry = v_tmp; + } + } + u[m + n] = u_carry; + // DEBUG(cerr << "KnuthDiv: normal:"); + // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << + // u[i]); + // DEBUG(cerr << " by"); + // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << + // v[i-1]); + // DEBUG(cerr << '\n'); + + // D2. [Initialize j.] Set j to m. This is the loop counter over the places. + int j = m; + do { + // DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n'); + // D3. [Calculate q'.]. + // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') + // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') + // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease + // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test + // on v[n-2] determines at high speed most of the cases in which the trial + // value qp is one too large, and it eliminates all cases where qp is two + // too large. + uint64_t dividend = ((uint64_t(u[j + n]) << 32) + u[j + n - 1]); + // DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n'); + uint64_t qp = dividend / v[n - 1]; + uint64_t rp = dividend % v[n - 1]; + if (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2]) { + qp--; + rp += v[n - 1]; + if (rp < b && (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2])) qp--; + } + // DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n'); + + // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with + // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation + // consists of a simple multiplication by a one-place number, combined with + // a subtraction. 
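    // Illustrative sketch (not in the original source): a base-10 analogue of
    // the D3 estimate. Dividing u = 178 by v = 29 (b = 10, v[1] = 2, v[0] = 9,
    // u[2] = 1, u[1] = 7, u[0] = 8, n = 2, j = 0):
    //   qp = (1*10 + 7) / 2 = 8,  rp = 17 % 2 = 1
    //   test:   qp*v[0] = 72 > b*rp + u[0] = 18  ->  qp = 7, rp = 3
    //   retest: 7*9 = 63 > 10*3 + 8 = 38         ->  qp = 6
    // which is the true digit (178 = 29*6 + 4): the v[n-2] test repairs an
    // estimate that started out two too large.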
+ bool isNeg = false; + for (uint32_t i = 0; i < n; ++i) { + uint64_t u_tmp = uint64_t(u[j + i]) | ((uint64_t(u[j + i + 1])) << 32); + uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]); + bool borrow = subtrahend > u_tmp; + /*DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp + << ", subtrahend == " << subtrahend + << ", borrow = " << borrow << '\n');*/ + + uint64_t result = u_tmp - subtrahend; + uint32_t k = j + i; + u[k++] = (uint32_t)(result & (b - 1)); // subtract low word + u[k++] = (uint32_t)((result) >> 32); // subtract high word + while (borrow && k <= m + n) { // deal with borrow to the left + borrow = u[k] == 0; + u[k]--; + k++; + } + isNeg |= borrow; + /*DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " << + u[j+i+1] << '\n');*/ + } + /*DEBUG(cerr << "KnuthDiv: after subtraction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + // The digits (u[j+n]...u[j]) should be kept positive; if the result of + // this step is actually negative, (u[j+n]...u[j]) should be left as the + // true value plus b**(n+1), namely as the b's complement of + // the true value, and a "borrow" to the left should be remembered. + // + if (isNeg) { + bool carry = true; // true because b's complement is "complement + 1" + for (uint32_t i = 0; i <= m + n; ++i) { + u[i] = ~u[i] + carry; // b's complement + carry = carry && u[i] == 0; + } + } + /*DEBUG(cerr << "KnuthDiv: after complement:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + + // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was + // negative, go to step D6; otherwise go on to step D7. + q[j] = (uint32_t)qp; + if (isNeg) { + // D6. [Add back]. The probability that this step is necessary is very + // small, on the order of only 2/b. Make sure that test data accounts for + // this possibility. Decrease q[j] by 1 + q[j]--; + // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]). + // A carry will occur to the left of u[j+n], and it should be ignored + // since it cancels with the borrow that occurred in D4. + bool carry = false; + for (uint32_t i = 0; i < n; i++) { + uint32_t limit = AESL_std::min(u[j + i], v[i]); + u[j + i] += v[i] + carry; + carry = u[j + i] < limit || (carry && u[j + i] == limit); + } + u[j + n] += carry; + } + /*DEBUG(cerr << "KnuthDiv: after correction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]); + DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');*/ + + // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3. + } while (--j >= 0); + + /*DEBUG(cerr << "KnuthDiv: quotient:"); + DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]); + DEBUG(cerr << '\n');*/ + + // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired + // remainder may be obtained by dividing u[...] by d. If r is non-null we + // compute the remainder (urem uses this). + if (r) { + // The value d is expressed by the "shift" value above since we avoided + // multiplication by d by using a shift left. So, all we have to do is + // shift right here. 
In order to make the remainder exact, we undo the normalization shift here.
+    if (shift) {
+      uint32_t carry = 0;
+      // DEBUG(cerr << "KnuthDiv: remainder:");
+      for (int i = n - 1; i >= 0; i--) {
+        r[i] = ((u[i]) >> (shift)) | carry;
+        carry = (u[i]) << (32 - shift);
+        // DEBUG(cerr << " " << r[i]);
+      }
+    } else {
+      for (int i = n - 1; i >= 0; i--) {
+        r[i] = u[i];
+        // DEBUG(cerr << " " << r[i]);
+      }
+    }
+    // DEBUG(cerr << '\n');
+  }
+  // DEBUG(cerr << std::setbase(10) << '\n');
+}
+
+template <int _AP_W, bool _AP_S>
+void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords,
+            const ap_private<_AP_W, _AP_S>& RHS, uint32_t rhsWords,
+            ap_private<_AP_W, _AP_S>* Quotient,
+            ap_private<_AP_W, _AP_S>* Remainder) {
+  assert(lhsWords >= rhsWords && "Fractional result");
+  enum { APINT_BITS_PER_WORD = 64 };
+  // First, compose the values into an array of 32-bit words instead of
+  // 64-bit words. This is a necessity of both the "short division" algorithm
+  // and the Knuth "classical algorithm", which requires there to be native
+  // operations for +, -, and * on an m bit value with an m*2 bit result. We
+  // can't use 64-bit operands here because we don't have native results of
+  // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
+  // work on big-endian machines.
+  uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8);
+  uint32_t n = rhsWords * 2;
+  uint32_t m = (lhsWords * 2) - n;
+
+  // Allocate space for the temporary values we need either on the stack, if
+  // it will fit, or on the heap if it won't.
+  uint32_t SPACE[128];
+  uint32_t* __U = 0;
+  uint32_t* __V = 0;
+  uint32_t* __Q = 0;
+  uint32_t* __R = 0;
+  if ((Remainder ? 4 : 3) * n + 2 * m + 1 <= 128) {
+    __U = &SPACE[0];
+    __V = &SPACE[m + n + 1];
+    __Q = &SPACE[(m + n + 1) + n];
+    if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)];
+  } else {
+    __U = new uint32_t[m + n + 1];
+    __V = new uint32_t[n];
+    __Q = new uint32_t[m + n];
+    if (Remainder) __R = new uint32_t[n];
+  }
+
+  // Initialize the dividend
+  memset(__U, 0, (m + n + 1) * sizeof(uint32_t));
+  for (unsigned i = 0; i < lhsWords; ++i) {
+    uint64_t tmp = LHS.get_pVal(i);
+    __U[i * 2] = (uint32_t)(tmp & mask);
+    __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
+  }
+  __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm.
+
+  // Initialize the divisor
+  memset(__V, 0, (n) * sizeof(uint32_t));
+  for (unsigned i = 0; i < rhsWords; ++i) {
+    uint64_t tmp = RHS.get_pVal(i);
+    __V[i * 2] = (uint32_t)(tmp & mask);
+    __V[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
+  }
+
+  // initialize the quotient and remainder
+  memset(__Q, 0, (m + n) * sizeof(uint32_t));
+  if (Remainder) memset(__R, 0, n * sizeof(uint32_t));
+
+  // Now, adjust m and n for the Knuth division. n is the number of words in
+  // the divisor. m is the number of words by which the dividend exceeds the
+  // divisor (i.e. m+n is the length of the dividend). These sizes must not
+  // contain any zero words or the Knuth algorithm fails.
+  for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) {
+    n--;
+    m++;
+  }
+  for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--;
+
+  // If we're left with only a single word for the divisor, Knuth doesn't work
+  // so we implement the short division algorithm here. This is much simpler
+  // and faster because we are certain that we can divide a 64-bit quantity
+  // by a 32-bit quantity at hardware speed and short division is simply a
+  // series of such operations. This is just like doing short division but we
+  // are using base 2^32 instead of base 10.
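  // Illustrative sketch (not in the original source): base-2^32 short division
  // as implemented below, dividing the two-word value 2^32 + 5
  // (__U[1] = 1, __U[0] = 5) by divisor = 3:
  //   i = 1: partial_dividend = 1                   -> __Q[1] = 0, remainder = 1
  //   i = 0: partial_dividend = (1 << 32) | 5 = 4294967301
  //                                                 -> __Q[0] = 0x55555557, remainder = 0
  // exactly like decimal long division, one 32-bit "digit" at a time.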
+ assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = (uint32_t)partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = (uint32_t)(partial_dividend / divisor); + remainder = (uint32_t)(partial_dividend - (__Q[i] * divisor)); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != RHS.BitWidth) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in R. Reconstitute the remainder into Remainder's low + // order words. + if (rhsWords == 1) { + uint64_t tmp = + uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); + Remainder->set_VAL(tmp); + } else { + assert(!Remainder->isSingleWord() && + "Remainder ap_private not large enough"); + for (unsigned i = 0; i < rhsWords; ++i) + Remainder->set_pVal( + i, uint64_t(__R[i * 2]) | + ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Remainder->clearUnusedBits(); + } + + // Clean up the memory we allocated. + if (__U != &SPACE[0]) { + delete[] __U; + delete[] __V; + delete[] __Q; + delete[] __R; + } +} + +template +void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, + uint64_t RHS, ap_private<_AP_W, _AP_S>* Quotient, + ap_private<_AP_W, _AP_S>* Remainder) { + uint32_t rhsWords = 1; + assert(lhsWords >= rhsWords && "Fractional result"); + enum { APINT_BITS_PER_WORD = 64 }; + // First, compose the values into an array of 32-bit words instead of + // 64-bit words. This is a necessity of both the "short division" algorithm + // and the the Knuth "classical algorithm" which requires there to be native + // operations for +, -, and * on an m bit value with an m*2 bit result. We + // can't use 64-bit operands here because we don't have native results of + // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't + // work on large-endian machines. + uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); + uint32_t n = 2; + uint32_t m = (lhsWords * 2) - n; + + // Allocate space for the temporary values we need either on the stack, if + // it will fit, or on the heap if it won't. + uint32_t SPACE[128]; + uint32_t* __U = 0; + uint32_t* __V = 0; + uint32_t* __Q = 0; + uint32_t* __R = 0; + if ((Remainder ? 
4 : 3) * n + 2 * m + 1 <= 128) { + __U = &SPACE[0]; + __V = &SPACE[m + n + 1]; + __Q = &SPACE[(m + n + 1) + n]; + if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; + } else { + __U = new uint32_t[m + n + 1]; + __V = new uint32_t[n]; + __Q = new uint32_t[m + n]; + if (Remainder) __R = new uint32_t[n]; + } + + // Initialize the dividend + memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); + for (unsigned i = 0; i < lhsWords; ++i) { + uint64_t tmp = LHS.get_pVal(i); + __U[i * 2] = tmp & mask; + __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); + } + __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. + + // Initialize the divisor + memset(__V, 0, (n) * sizeof(uint32_t)); + __V[0] = RHS & mask; + __V[1] = (RHS) >> (sizeof(uint32_t) * 8); + + // initialize the quotient and remainder + memset(__Q, 0, (m + n) * sizeof(uint32_t)); + if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); + + // Now, adjust m and n for the Knuth division. n is the number of words in + // the divisor. m is the number of words by which the dividend exceeds the + // divisor (i.e. m+n is the length of the dividend). These sizes must not + // contain any zero words or the Knuth algorithm fails. + for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { + n--; + m++; + } + for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; + + // If we're left with only a single word for the divisor, Knuth doesn't work + // so we implement the short division algorithm here. This is much simpler + // and faster because we are certain that we can divide a 64-bit quantity + // by a 32-bit quantity at hardware speed and short division is simply a + // series of such operations. This is just like doing short division but we + // are using base 2^32 instead of base 10. + assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = partial_dividend / divisor; + remainder = partial_dividend - (__Q[i] * divisor); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != 64 /* RHS.BitWidth */) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in __R. 
Reconstitute the remainder into Remainder's low
+    // order words.
+    if (rhsWords == 1) {
+      uint64_t tmp =
+          uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2));
+      Remainder->set_VAL(tmp);
+    } else {
+      assert(!Remainder->isSingleWord() &&
+             "Remainder ap_private not large enough");
+      for (unsigned i = 0; i < rhsWords; ++i)
+        Remainder->set_pVal(
+            i, uint64_t(__R[i * 2]) |
+                   ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2)));
+    }
+    Remainder->clearUnusedBits();
+  }
+
+  // Clean up the memory we allocated.
+  if (__U != &SPACE[0]) {
+    delete[] __U;
+    delete[] __V;
+    delete[] __Q;
+    delete[] __R;
+  }
+}
+
+/// @brief Logical right-shift function.
+template <int _AP_W, bool _AP_S, bool _AP_C>
+INLINE ap_private<_AP_W, _AP_S, _AP_C> lshr(
+    const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) {
+  return LHS.lshr(shiftAmt);
+}
+
+/// Left-shift the ap_private by shiftAmt.
+/// @brief Left-shift function.
+template <int _AP_W, bool _AP_S, bool _AP_C>
+INLINE ap_private<_AP_W, _AP_S, _AP_C> shl(
+    const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) {
+  return LHS.shl(shiftAmt);
+}
+
+} // namespace ap_private_ops
+
+#endif // LLVM_SUPPORT_MATHEXTRAS_H
+
+/// This enumeration just provides for internal constants used in this
+/// translation unit.
+enum {
+  MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified
+  ///< Note that this must remain synchronized with IntegerType::MIN_INT_BITS
+  MAX_INT_BITS = (1 << 23) - 1 ///< Maximum number of bits that can be specified
+  ///< Note that this must remain synchronized with IntegerType::MAX_INT_BITS
+};
+
+//===----------------------------------------------------------------------===//
+// ap_private Class
+//===----------------------------------------------------------------------===//
+
+/// ap_private - This class represents arbitrary precision constant integral
+/// values. It is a functional replacement for common case unsigned integer
+/// types like "unsigned", "unsigned long" or "uint64_t", but also allows
+/// non-byte-width integer sizes and large integer value types such as 3-bit,
+/// 15-bit, or more than 64 bits of precision. ap_private provides a variety
+/// of arithmetic operators and methods to manipulate integer values of any
+/// bit-width. It supports both the typical integer arithmetic and comparison
+/// operations as well as bitwise manipulation.
+///
+/// The class has several invariants worth noting:
+///   * All bit, byte, and word positions are zero-based.
+///   * Once the bit width is set, it doesn't change except by the Truncate,
+///     SignExtend, or ZeroExtend operations.
+///   * All binary operators must be on ap_private instances of the same bit
+///     width. Attempting to use these operators on instances with different
+///     bit widths will yield an assertion.
+///   * The value is stored canonically as an unsigned value. For operations
+///     where it makes a difference, there are both signed and unsigned variants
+///     of the operation. For example, sdiv and udiv. However, because the bit
+///     widths must be the same, operations such as Mul and Add produce the same
+///     results regardless of whether the values are interpreted as signed or
+///     not.
+///   * In general, the class tries to follow the style of computation that LLVM
+///     uses in its IR. This simplifies its use for LLVM.
+///
+/// @brief Class for arbitrary precision integers.
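// Illustrative sketch (not in the original source): the "stored canonically
// as unsigned" invariant above means one bit pattern serves both signednesses,
// and only operations like division need distinct variants:
//
//   ap_private<4, false> u = 12;          // bit pattern 0b1100
//   ap_private<4, true>  s = -4;          // the same bit pattern, signed view
//   u.udiv(ap_private<4, false>(2));      // 12 / 2 = 6   (0b0110)
//   s.sdiv(ap_private<4, true>(2));       // -4 / 2 = -2  (0b1110)
//
// whereas Add and Mul at equal widths yield the same bits either way.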
+
+#if defined(_MSC_VER)
+#if _MSC_VER < 1400 && !defined(for)
+#define for if (0); else for
+#endif
+typedef unsigned __int64 ap_ulong;
+typedef signed __int64 ap_slong;
+#else
+typedef unsigned long long ap_ulong;
+typedef signed long long ap_slong;
+#endif
+template <int _AP_N8, bool _AP_S>
+struct valtype;
+
+template <int _AP_N8>
+struct valtype<_AP_N8, false> {
+  typedef uint64_t Type;
+};
+
+template <int _AP_N8>
+struct valtype<_AP_N8, true> {
+  typedef int64_t Type;
+};
+
+template <>
+struct valtype<1, false> {
+  typedef unsigned char Type;
+};
+template <>
+struct valtype<2, false> {
+  typedef unsigned short Type;
+};
+template <>
+struct valtype<3, false> {
+  typedef unsigned int Type;
+};
+template <>
+struct valtype<4, false> {
+  typedef unsigned int Type;
+};
+template <>
+struct valtype<1, true> {
+  typedef signed char Type;
+};
+template <>
+struct valtype<2, true> {
+  typedef short Type;
+};
+template <>
+struct valtype<3, true> {
+  typedef int Type;
+};
+template <>
+struct valtype<4, true> {
+  typedef int Type;
+};
+
+template <bool C>
+struct ap_private_enable_if {};
+template <>
+struct ap_private_enable_if<true> {
+  static const bool isValid = true;
+};
+
+// When bitwidth < 64
+template <int _AP_W, bool _AP_S>
+class ap_private<_AP_W, _AP_S, true> {
+  // SFINAE pattern. Only consider this class when _AP_W <= 64
+  const static bool valid = ap_private_enable_if<_AP_W <= 64>::isValid;
+
+#ifdef _MSC_VER
+#pragma warning(disable : 4521 4522)
+#endif
+ public:
+  typedef typename valtype<(_AP_W + 7) / 8, _AP_S>::Type ValType;
+  typedef ap_private<_AP_W, _AP_S> Type;
+  template <int _AP_W2, bool _AP_S2>
+  struct RType {
+    enum {
+      mult_w = _AP_W + _AP_W2,
+      mult_s = _AP_S || _AP_S2,
+      plus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      plus_s = _AP_S || _AP_S2,
+      minus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      minus_s = true,
+      div_w = _AP_W + _AP_S2,
+      div_s = _AP_S || _AP_S2,
+      mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)),
+      mod_s = _AP_S,
+      logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)),
+      logic_s = _AP_S || _AP_S2
+    };
+    typedef ap_private<mult_w, mult_s> mult;
+    typedef ap_private<plus_w, plus_s> plus;
+    typedef ap_private<minus_w, minus_s> minus;
+    typedef ap_private<logic_w, logic_s> logic;
+    typedef ap_private<div_w, div_s> div;
+    typedef ap_private<mod_w, mod_s> mod;
+    typedef ap_private<_AP_W, _AP_S> arg1;
+    typedef bool reduce;
+  };
+  enum { APINT_BITS_PER_WORD = sizeof(uint64_t) * 8 };
+  enum {
+    excess_bits = (_AP_W % APINT_BITS_PER_WORD)
+                      ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD)
+                      : 0
+  };
+  static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits));
+  static const uint64_t not_mask = ~mask;
+  static const uint64_t sign_bit_mask = 1ULL << (APINT_BITS_PER_WORD - 1);
+  template <int _AP_W1>
+  struct sign_ext_mask {
+    static const uint64_t mask = ~0ULL << _AP_W1;
+  };
+  static const int width = _AP_W;
+
+  enum {
+    BitWidth = _AP_W,
+    _AP_N = 1,
+  };
+  ValType VAL; ///< Used to store the <= 64-bit integer value.
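  // Illustrative sketch (not in the original source): for _AP_W = 20 the
  // constants above give excess_bits = 44, mask = 0xFFFFF (the low 20 bits)
  // and not_mask = ~0xFFFFF. clearUnusedBits() keeps VAL canonical in those
  // 20 bits: assigning 0x80000 to an ap_private<20, true> sign-extends, so it
  // reads back as -524288, while an ap_private<20, false> keeps 524288.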
+#ifdef AP_CANARY + ValType CANARY; + void check_canary() { assert(CANARY == (ValType)0xDEADBEEFDEADBEEF); } + void set_canary() { CANARY = (ValType)0xDEADBEEFDEADBEEF; } +#else + void check_canary() {} + void set_canary() {} +#endif + + INLINE ValType& get_VAL(void) { return VAL; } + INLINE ValType get_VAL(void) const { return VAL; } + INLINE ValType get_VAL(void) const volatile { return VAL; } + INLINE void set_VAL(uint64_t value) { VAL = (ValType)value; } + INLINE ValType& get_pVal(int i) { return VAL; } + INLINE ValType get_pVal(int i) const { return VAL; } + INLINE const uint64_t* get_pVal() const { + assert(0 && "invalid usage"); + return 0; + } + INLINE ValType get_pVal(int i) const volatile { return VAL; } + INLINE uint64_t* get_pVal() const volatile { + assert(0 && "invalid usage"); + return 0; + } + INLINE void set_pVal(int i, uint64_t value) { VAL = (ValType)value; } + + INLINE uint32_t getBitWidth() const { return BitWidth; } + + template + ap_private<_AP_W, _AP_S>& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + ap_private<_AP_W, _AP_S>& operator=( + const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); // TODO check here about ap_private + clearUnusedBits(); + return *this; + } + + void operator=(const ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + void operator=(const volatile ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const volatile ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#define ASSIGN_OP_FROM_INT(C_TYPE) \ + INLINE ap_private& operator=(const C_TYPE v) { \ + set_canary(); \ + this->VAL = (ValType)v; \ + clearUnusedBits(); \ + check_canary(); \ + return *this; \ + } + +ASSIGN_OP_FROM_INT(bool) +ASSIGN_OP_FROM_INT(char) +ASSIGN_OP_FROM_INT(signed char) +ASSIGN_OP_FROM_INT(unsigned char) +ASSIGN_OP_FROM_INT(short) +ASSIGN_OP_FROM_INT(unsigned short) +ASSIGN_OP_FROM_INT(int) +ASSIGN_OP_FROM_INT(unsigned int) +ASSIGN_OP_FROM_INT(long) +ASSIGN_OP_FROM_INT(unsigned long) +ASSIGN_OP_FROM_INT(ap_slong) +ASSIGN_OP_FROM_INT(ap_ulong) +#if 0 +ASSIGN_OP_FROM_INT(half) +ASSIGN_OP_FROM_INT(float) +ASSIGN_OP_FROM_INT(double) +#endif +#undef ASSIGN_OP_FROM_INT + + // XXX This is a must to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct-initialization, as ctor is explicit. 
+ operator=(tmp); + return *this; + } + + private: + explicit INLINE ap_private(uint64_t* val) : VAL(val[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + INLINE bool isSingleWord() const { return true; } + + public: + INLINE void fromString(const char* strStart, uint32_t slen, uint8_t radix) { + bool isNeg = strStart[0] == '-'; + if (isNeg) { + strStart++; + slen--; + } + + if (strStart[0] == '0' && (strStart[1] == 'b' || strStart[1] == 'B')) { + //if(radix == 0) radix = 2; + _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", strStart, 2, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'o' || strStart[1] == 'O')) { + //if (radix == 0) radix = 8; + _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", strStart, 8, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'x' || strStart[1] == 'X')) { + //if (radix == 0) radix = 16; + _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", strStart, 16, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'd' || strStart[1] == 'D')) { + //if (radix == 0) radix = 10; + _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", strStart, 10, radix); + strStart += 2; + slen -=2; + } else if (radix == 0) { + //radix = 2; // XXX default value + } + + // Check our assumptions here + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + assert(strStart && "String is null?"); + + // Clear bits. + uint64_t tmpVAL = VAL = 0; + + switch (radix) { + case 2: + // sscanf(strStart,"%b",&VAL); + // tmpVAL = *strStart =='1' ? ~0ULL : 0; + for (; *strStart; ++strStart) { + assert((*strStart == '0' || *strStart == '1') && + ("Wrong binary number")); + tmpVAL <<= 1; + tmpVAL |= (*strStart - '0'); + } + break; + case 8: +#ifdef _MSC_VER + sscanf_s(strStart, "%llo", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lo", &tmpVAL); +#else + sscanf(strStart, "%llo", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + case 10: +#ifdef _MSC_VER + sscanf_s(strStart, "%llu", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lu", &tmpVAL); +#else + sscanf(strStart, "%llu", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + case 16: +#ifdef _MSC_VER + sscanf_s(strStart, "%llx", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lx", &tmpVAL); +#else + sscanf(strStart, "%llx", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + default: + assert(true && "Unknown radix"); + // error + } + VAL = isNeg ? 
(ValType)(-tmpVAL) : (ValType)(tmpVAL); + + clearUnusedBits(); + } + + private: + INLINE ap_private(const std::string& val, uint8_t radix = 2) : VAL(0) { + assert(!val.empty() && "String empty?"); + set_canary(); + fromString(val.c_str(), val.size(), radix); + check_canary(); + } + + INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) + : VAL(0) { + set_canary(); + fromString(strStart, slen, radix); + check_canary(); + } + + INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) + : VAL(bigVal[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + public: + INLINE ap_private() { + set_canary(); + clearUnusedBits(); + check_canary(); + } + +#define CTOR(TYPE) \ + INLINE ap_private(TYPE v) : VAL((ValType)v) { \ + set_canary(); \ + clearUnusedBits(); \ + check_canary(); \ + } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if 0 + CTOR(half) + CTOR(float) + CTOR(double) +#endif +#undef CTOR + + template + INLINE ap_private(const ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + template + INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + explicit INLINE ap_private(const char* val) { + set_canary(); + unsigned char radix = 10; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ap_private(const char* val, signed char rd) { + set_canary(); + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ~ap_private() { check_canary(); } + + INLINE bool isNegative() const { + static const uint64_t sign_mask = 1ULL << (_AP_W - 1); + return _AP_S && (sign_mask & VAL); + } + + INLINE bool isPositive() const { return !isNegative(); } + + INLINE bool isStrictlyPositive() const { return !isNegative() && VAL != 0; } + + INLINE bool isAllOnesValue() const { return (mask & VAL) == mask; } + + INLINE bool operator==(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL == RHS.get_VAL(); + } + INLINE bool operator==(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL == (uint64_t)RHS.get_VAL(); + } + + INLINE bool operator==(uint64_t Val) const { return ((uint64_t)VAL == Val); } + INLINE bool operator!=(uint64_t Val) const { return ((uint64_t)VAL != Val); } + INLINE bool operator!=(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL != RHS.get_VAL(); + } + INLINE bool operator!=(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL != (uint64_t)RHS.get_VAL(); + } + + /// postfix increment. + const ap_private operator++(int) { + ap_private orig(*this); + VAL++; + clearUnusedBits(); + return orig; + } + + /// prefix increment. 
+ const ap_private operator++() { + ++VAL; + clearUnusedBits(); + return *this; + } + + /// postfix decrement. + const ap_private operator--(int) { + ap_private orig(*this); + --VAL; + clearUnusedBits(); + return orig; + } + + /// prefix decrement. + const ap_private operator--() { + --VAL; + clearUnusedBits(); + return *this; + } + + /// one's complement. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// two's complement. + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// logic negation. + INLINE bool operator!() const { return !VAL; } + + INLINE std::string toString(uint8_t radix, bool wantSigned) const; + INLINE std::string toStringUnsigned(uint8_t radix = 10) const { + return toString(radix, false); + } + INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + INLINE void clear() { VAL = 0; } + INLINE ap_private& clear(uint32_t bitPosition) { + VAL &= ~(1ULL << (bitPosition)); + clearUnusedBits(); + return *this; + } + + INLINE ap_private ashr(uint32_t shiftAmt) const { + if (_AP_S) + return ap_private((shiftAmt == BitWidth) ? 0 + : ((int64_t)VAL) >> (shiftAmt)); + else + return ap_private((shiftAmt == BitWidth) ? 0 + : ((uint64_t)VAL) >> (shiftAmt)); + } + + INLINE ap_private lshr(uint32_t shiftAmt) const { + return ap_private((shiftAmt == BitWidth) + ? ap_private(0) + : ap_private((VAL & mask) >> (shiftAmt))); + } + + INLINE ap_private shl(uint32_t shiftAmt) const +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + if (shiftAmt > BitWidth) { + if (!isNegative()) + return ap_private(0); + else + return ap_private(-1); + } + if (shiftAmt == BitWidth) + return ap_private(0); + else + return ap_private((VAL) << (shiftAmt)); + // return ap_private((shiftAmt == BitWidth) ? 
ap_private(0ULL) : + // ap_private(VAL << shiftAmt)); + } + + INLINE int64_t getSExtValue() const { return VAL; } + + // XXX XXX this function is used in CBE + INLINE uint64_t getZExtValue() const { return VAL & mask; } + + template + INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ref.get(); + check_canary(); + } + + template + INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ((uint64_t)(bool)ref); + check_canary(); + } + +// template +// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { +// set_canary(); +// *this = ref.get(); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = ((val.operator ap_private<_AP_W2, false>())); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = (uint64_t)(bool)val; +// check_canary(); +// } + + INLINE void write(const ap_private<_AP_W, _AP_S>& op2) volatile { + *this = (op2); + } + + // Explicit conversions to C interger types + //----------------------------------------------------------- + INLINE operator ValType() const { return get_VAL(); } + + INLINE int to_uchar() const { return (unsigned char)get_VAL(); } + + INLINE int to_char() const { return (signed char)get_VAL(); } + + INLINE int to_ushort() const { return (unsigned short)get_VAL(); } + + INLINE int to_short() const { return (short)get_VAL(); } + + INLINE int to_int() const { + // ap_private<64 /* _AP_W */, _AP_S> res(V); + return (int)get_VAL(); + } + + INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } + + INLINE long to_long() const { return (long)get_VAL(); } + + INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } + + INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } + + INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } + + INLINE double to_double() const { + if (isNegative()) + return roundToDouble(true); + else + return roundToDouble(false); + } + + INLINE unsigned length() const { return _AP_W; } + + INLINE bool isMinValue() const { return VAL == 0; } + template + INLINE ap_private& operator&=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) & RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator|=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) | RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator^=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) ^ RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) * RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) + RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) - RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator&( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + 
typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) & + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret & RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator^( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) ^ + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret ^ RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator|( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) | + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret | RHS; + } + } + + INLINE ap_private And(const ap_private& RHS) const { + return ap_private(VAL & RHS.get_VAL()); + } + + INLINE ap_private Or(const ap_private& RHS) const { + return ap_private(VAL | RHS.get_VAL()); + } + + INLINE ap_private Xor(const ap_private& RHS) const { + return ap_private(VAL ^ RHS.get_VAL()); + } +#if 1 + template + INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::mult_w <= 64) { + typename RType<_AP_W1, _AP_S1>::mult Result(((uint64_t)VAL) * + RHS.get_VAL()); + return Result; + } else { + typename RType<_AP_W1, _AP_S1>::mult Result(*this); + Result *= RHS; + return Result; + } + } +#endif + INLINE ap_private Mul(const ap_private& RHS) const { + return ap_private(VAL * RHS.get_VAL()); + } + + INLINE ap_private Add(const ap_private& RHS) const { + return ap_private(VAL + RHS.get_VAL()); + } + + INLINE ap_private Sub(const ap_private& RHS) const { + return ap_private(VAL - RHS.get_VAL()); + } + + INLINE ap_private& operator&=(uint64_t RHS) { + VAL &= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator|=(uint64_t RHS) { + VAL |= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator^=(uint64_t RHS) { + VAL ^= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator*=(uint64_t RHS) { + VAL *= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator+=(uint64_t RHS) { + VAL += (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator-=(uint64_t RHS) { + VAL -= (ValType)RHS; + clearUnusedBits(); + return *this; + } + + INLINE bool isMinSignedValue() const { + static const uint64_t min_mask = ~(~0ULL << (_AP_W - 1)); + return BitWidth == 1 ? VAL == 1 + : (ap_private_ops::isNegative<_AP_W>(*this) && + ((min_mask & VAL) == 0)); + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::plus_w <= 64) + return typename RType<_AP_W1, _AP_S1>::plus( + RType<_AP_W1, _AP_S1>::plus_s + ? 
+              : uint64_t(((uint64_t)VAL) + RHS.get_VAL()));
+    typename RType<_AP_W1, _AP_S1>::plus Result = RHS;
+    Result += VAL;
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    if (RType<_AP_W1, _AP_S1>::minus_w <= 64)
+      return typename RType<_AP_W1, _AP_S1>::minus(
+          int64_t(((uint64_t)VAL) - RHS.get_VAL()));
+    typename RType<_AP_W1, _AP_S1>::minus Result = *this;
+    Result -= RHS;
+    return Result;
+  }
+
+  INLINE uint32_t countPopulation() const {
+    return ap_private_ops::CountPopulation_64(VAL);
+  }
+  INLINE uint32_t countLeadingZeros() const {
+    int remainder = BitWidth % 64;
+    int excessBits = (64 - remainder) % 64;
+    uint32_t Count = ap_private_ops::CountLeadingZeros_64(VAL);
+    if (Count) Count -= excessBits;
+    return AESL_std::min(Count, (uint32_t)_AP_W);
+  }
+
+  /// HiBits - This function returns the high "numBits" bits of this ap_private.
+  INLINE ap_private<_AP_W, _AP_S> getHiBits(uint32_t numBits) const {
+    ap_private<_AP_W, _AP_S> ret(*this);
+    ret = (ret) >> (BitWidth - numBits);
+    return ret;
+  }
+
+  /// LoBits - This function returns the low "numBits" bits of this ap_private.
+  INLINE ap_private<_AP_W, _AP_S> getLoBits(uint32_t numBits) const {
+    ap_private<_AP_W, _AP_S> ret(((uint64_t)VAL) << (BitWidth - numBits));
+    ret = (ret) >> (BitWidth - numBits);
+    return ret;
+    // return ap_private(numBits, (VAL << (BitWidth - numBits)) >>
+    //                   (BitWidth - numBits));
+  }
+
+  INLINE ap_private<_AP_W, _AP_S>& set(uint32_t bitPosition) {
+    VAL |= (1ULL << (bitPosition));
+    clearUnusedBits();
+    return *this;  // clearUnusedBits();
+  }
+
+  INLINE void set() {
+    VAL = (ValType)~0ULL;
+    clearUnusedBits();
+  }
+
+  template <int _AP_W3>
+  INLINE void set(const ap_private<_AP_W3, false>& val) {
+    operator=(ap_private<_AP_W3, _AP_S>(val));
+  }
+
+  INLINE void set(const ap_private& val) { operator=(val); }
+
+  INLINE void clearUnusedBits(void) volatile
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 };
+    VAL = (ValType)(
+        _AP_S
+            ? ((((int64_t)VAL) << (excess_bits)) >> (excess_bits))
+            : (excess_bits ? (((uint64_t)VAL) << (excess_bits)) >> (excess_bits)
+                           : (uint64_t)VAL));
+  }
+
+  INLINE void clearUnusedBitsToZero(void) {
+    enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 };
+    static uint64_t mask = ~0ULL >> (excess_bits);
+    VAL &= mask;
+  }
+
+  INLINE ap_private udiv(const ap_private& RHS) const {
+    return ap_private((uint64_t)VAL / RHS.get_VAL());
+  }
+
+  /// Signed divide this ap_private by ap_private RHS.
+  /// @brief Signed division function for ap_private.
+  INLINE ap_private sdiv(const ap_private& RHS) const {
+    if (isNegative())
+      if (RHS.isNegative())
+        return ((uint64_t)(0 - (*this))) / (uint64_t)(0 - RHS);
+      else
+        return 0 - ((uint64_t)(0 - (*this)) / (uint64_t)(RHS));
+    else if (RHS.isNegative())
+      return 0 - (this->udiv((ap_private)(0 - RHS)));
+    return this->udiv(RHS);
+  }
+
+  template <bool _AP_S2>
+  INLINE ap_private urem(const ap_private<_AP_W, _AP_S2>& RHS) const {
+    assert(RHS.get_VAL() != 0 && "Divide by 0");
+    return ap_private(((uint64_t)VAL) % ((uint64_t)RHS.get_VAL()));
+  }
+
+  /// Signed remainder operation on ap_private.
+  /// @brief Function for signed remainder operation.
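+  /// e.g. (-7) srem 3 == -1 and 7 srem (-3) == 1: the remainder takes the
+  /// sign of the dividend, matching C/C++ truncated division (illustrative
+  /// note; both cases follow directly from the sign handling below).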
+  template <bool _AP_S2>
+  INLINE ap_private srem(const ap_private<_AP_W, _AP_S2>& RHS) const {
+    if (isNegative()) {
+      ap_private lhs = 0 - (*this);
+      if (RHS.isNegative()) {
+        ap_private rhs = 0 - RHS;
+        return 0 - (lhs.urem(rhs));
+      } else
+        return 0 - (lhs.urem(RHS));
+    } else if (RHS.isNegative()) {
+      ap_private rhs = 0 - RHS;
+      return this->urem(rhs);
+    }
+    return this->urem(RHS);
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool eq(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return (*this) == RHS;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ne(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !((*this) == RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered unsigned.
+  /// @brief Unsigned less than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ult(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    if (_AP_W1 <= 64) {
+      uint64_t lhsZext = ((uint64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
+      uint64_t rhsZext =
+          ((uint64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
+      return lhsZext < rhsZext;
+    } else
+      return RHS.ugt(*this);  // strict: a < b holds exactly when b > a
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered signed.
+  /// @brief Signed less than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool slt(const ap_private<_AP_W1, _AP_S1>& RHS) const
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    if (_AP_W1 <= 64) {
+      int64_t lhsSext = ((int64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
+      int64_t rhsSext =
+          ((int64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
+      return lhsSext < rhsSext;
+    } else
+      return RHS.sgt(*this);  // strict: a < b holds exactly when b > a
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered unsigned.
+  /// @brief Unsigned less or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ule(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return ult(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered signed.
+  /// @brief Signed less or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sle(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return slt(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered unsigned.
+  /// @brief Unsigned greater than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ugt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !ult(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered signed.
+  /// @brief Signed greater than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sgt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !slt(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered unsigned.
+  /// @brief Unsigned greater or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool uge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !ult(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered signed.
+  /// @brief Signed greater or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !slt(RHS);
+  }
+
+  INLINE ap_private abs() const {
+    if (isNegative()) return -(*this);
+    return *this;
+  }
+
+  INLINE ap_private<_AP_W, false> get() const {
+    ap_private<_AP_W, false> ret(*this);
+    return ret;
+  }
+
+  INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen,
+                                       uint8_t radix) {
+    return _AP_W;
+  }
+
+  INLINE uint32_t getActiveBits() const {
+    uint32_t bits = _AP_W - countLeadingZeros();
+    return bits ? bits : 1;
+  }
+
+  INLINE double roundToDouble(bool isSigned = false) const {
+    return isSigned ? double((int64_t)VAL) : double((uint64_t)VAL);
+  }
+
+  /* Reverse the contents of the ap_private instance, i.e. LSB becomes MSB
+   * and vice versa. */
+  INLINE ap_private& reverse() {
+    for (int i = 0; i < _AP_W / 2; ++i) {
+      bool tmp = operator[](i);
+      if (operator[](_AP_W - 1 - i))
+        set(i);
+      else
+        clear(i);
+      if (tmp)
+        set(_AP_W - 1 - i);
+      else
+        clear(_AP_W - 1 - i);
+    }
+    clearUnusedBits();
+    return *this;
+  }
+
+  /* Return true if the value of the ap_private instance is zero. */
+  INLINE bool iszero() const { return isMinValue(); }
+
+  INLINE bool to_bool() const { return !iszero(); }
+
+  /* x < 0 */
+  INLINE bool sign() const {
+    if (isNegative()) return true;
+    return false;
+  }
+
+  /* x[i] = !x[i] */
+  INLINE void invert(int i) {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    flip(i);
+  }
+
+  /* x[i] */
+  INLINE bool test(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return operator[](i);
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the left
+  INLINE void lrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(shl(n) | lshr(_AP_W - n));
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the right
+  INLINE void rrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(lshr(n) | shl(_AP_W - n));
+  }
+
+  // Set the ith bit into v
+  INLINE void set(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set_bit(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Get the value of ith bit
+  INLINE bool get_bit(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return (((1ULL << i) & VAL) != 0);
+  }
+
+  /// Toggle all bits.
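+  /// e.g. for a 4-bit value, flip() turns 0b0101 into 0b1010 (illustrative
+  /// example; the mask keeps only the low _AP_W bits).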
+  INLINE ap_private& flip() {
+    VAL = (ValType)((~0ULL ^ VAL) & mask);
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// Toggles a given bit to its opposite value.
+  INLINE ap_private& flip(uint32_t bitPosition) {
+    assert(bitPosition < BitWidth && "Out of the bit-width range!");
+    set_bit(bitPosition, !get_bit(bitPosition));
+    return *this;
+  }
+
+  // complements every bit
+  INLINE void b_not() { flip(); }
+
+// Binary Arithmetic
+//-----------------------------------------------------------
+#define OP_BIN_AP(Sym, Rty, Fun)                              \
+  template <int _AP_W2, bool _AP_S2>                          \
+  INLINE typename RType<_AP_W2, _AP_S2>::Rty operator Sym(    \
+      const ap_private<_AP_W2, _AP_S2>& op) const {           \
+    typename RType<_AP_W2, _AP_S2>::Rty lhs(*this);           \
+    typename RType<_AP_W2, _AP_S2>::Rty rhs(op);              \
+    return lhs.Fun(rhs);                                      \
+  }
+
+/// Bitwise and, or, xor
+// OP_BIN_AP(&, logic, And)
+// OP_BIN_AP(|, logic, Or)
+// OP_BIN_AP(^, logic, Xor)
+#undef OP_BIN_AP
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    return typename RType<_AP_W2, _AP_S2>::div(
+        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    typename RType<_AP_W2, _AP_S2>::mod res =
+        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs)
+                                                  : lhs.urem(rhs));
+    return res;
+  }
+
+#define OP_ASSIGN_AP_2(Sym)                                    \
+  template <int _AP_W2, bool _AP_S2>                           \
+  INLINE ap_private<_AP_W, _AP_S>& operator Sym##=(            \
+      const ap_private<_AP_W2, _AP_S2>& op) {                  \
+    *this = operator Sym(op);                                  \
+    return *this;                                              \
+  }
+
+  OP_ASSIGN_AP_2(/)
+  OP_ASSIGN_AP_2(%)
+#undef OP_ASSIGN_AP_2
+
+/// Bitwise assign: and, or, xor
+//-------------------------------------------------------------
+// OP_ASSIGN_AP(&)
+// OP_ASSIGN_AP(^)
+// OP_ASSIGN_AP(|)
+
+#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED)                \
+  INLINE ap_private operator<<(const TYPE op) const {    \
+    if (op >= _AP_W) return ap_private(0);               \
+    if (SIGNED && op < 0) return *this >> (0 - op);      \
+    return shl(op);                                      \
+  }
+
+  // OP_LEFT_SHIFT_CTYPE(bool, false)
+  OP_LEFT_SHIFT_CTYPE(char, CHAR_IS_SIGNED)
+  OP_LEFT_SHIFT_CTYPE(signed char, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned char, false)
+  OP_LEFT_SHIFT_CTYPE(short, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned short, false)
+  OP_LEFT_SHIFT_CTYPE(int, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned int, false)
+  OP_LEFT_SHIFT_CTYPE(long, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned long, false)
+  OP_LEFT_SHIFT_CTYPE(long long, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned long long, false)
+#if 0
+  OP_LEFT_SHIFT_CTYPE(half, false)
+  OP_LEFT_SHIFT_CTYPE(float, false)
+  OP_LEFT_SHIFT_CTYPE(double, false)
+#endif
+
+#undef OP_LEFT_SHIFT_CTYPE
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const {
+    if (_AP_S2 == false) {
+      uint32_t sh = op2.to_uint();
+      return *this << sh;
+    } else {
+      int sh = op2.to_int();
+      return *this << sh;
+    }
+  }
+
+#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED)               \
+  INLINE ap_private operator>>(const TYPE op) const {    \
+    if (op >= _AP_W) {                                   \
+      if (isNegative())                                  \
+        return ap_private(-1);                           \
+      else                                               \
+        return ap_private(0);                            \
+    }                                                    \
+    if ((SIGNED) && op < 0) return *this << (0 - op);    \
+    if (_AP_S)                                           \
+      return ashr(op);                                   \
+    else                                                 \
+      return lshr(op);                                   \
+  }
+
+  // OP_RIGHT_SHIFT_CTYPE(bool, false)
+  OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED)
+  OP_RIGHT_SHIFT_CTYPE(signed char, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned char, false)
+  OP_RIGHT_SHIFT_CTYPE(short, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned short, false)
+  OP_RIGHT_SHIFT_CTYPE(int, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned int, false)
+  OP_RIGHT_SHIFT_CTYPE(long, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned long, false)
+  OP_RIGHT_SHIFT_CTYPE(unsigned long long, false)
+  OP_RIGHT_SHIFT_CTYPE(long long, true)
+#if 0
+  OP_RIGHT_SHIFT_CTYPE(half, false)
+  OP_RIGHT_SHIFT_CTYPE(float, false)
+  OP_RIGHT_SHIFT_CTYPE(double, false)
+#endif
+
+#undef OP_RIGHT_SHIFT_CTYPE
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const {
+    if (_AP_S2 == false) {
+      uint32_t sh = op2.to_uint();
+      return *this >> sh;
+    } else {
+      int sh = op2.to_int();
+      return *this >> sh;
+    }
+  }
+
+  /// Shift assign
+  //-----------------------------------------------------------------
+
+  // INLINE const ap_private& operator<<=(uint32_t shiftAmt) {
+  //   VAL <<= shiftAmt;
+  //   clearUnusedBits();
+  //   return *this;
+  // }
+
+#define OP_ASSIGN_AP(Sym)                                                    \
+  template <int _AP_W2, bool _AP_S2>                                         \
+  INLINE ap_private& operator Sym##=(int op) {                               \
+    *this = operator Sym(op);                                                \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }                                                                          \
+  INLINE ap_private& operator Sym##=(unsigned int op) {                      \
+    *this = operator Sym(op);                                                \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }                                                                          \
+  template <int _AP_W2, bool _AP_S2>                                         \
+  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
+    *this = operator Sym(op);                                                \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }
+
+  OP_ASSIGN_AP(>>)
+  OP_ASSIGN_AP(<<)
+#undef OP_ASSIGN_AP
+
+  /// Comparisons
+  //-----------------------------------------------------------------
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool operator==(const ap_private<_AP_W1, _AP_S1>& op) const {
+    enum { _AP_MAX_W = AP_MAX(AP_MAX(_AP_W, _AP_W1), 32) };
+    ap_private<_AP_MAX_W, false> lhs(*this);
+    ap_private<_AP_MAX_W, false> rhs(op);
+    if (_AP_MAX_W <= 64) {
+      return (uint64_t)lhs.get_VAL() == (uint64_t)rhs.get_VAL();
+    } else
+      return lhs == rhs;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const {
+    return !(*this == op);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const {
+    enum {
+      _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2))
+    };
+    ap_private<_AP_MAX_W, _AP_S> lhs(*this);
+    ap_private<_AP_MAX_W, _AP_S2> rhs(op);
+    // this follows the gcc rule for comparison
+    // between different bitwidths and signedness
+    if (_AP_S == _AP_S2)
+      return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs);
+    else if (_AP_W < 32 && _AP_W2 < 32)
+      // different signedness, but both bitwidths are less than 32
+      return lhs.sgt(rhs);
+    else
+      // different signedness, and the bigger bitwidth
+      // is greater than or equal to 32
+      if (_AP_S)
+        if (_AP_W2 >= _AP_W)
+          return lhs.ugt(rhs);
+        else
+          return lhs.sgt(rhs);
+      else if (_AP_W >= _AP_W2)
+        return lhs.ugt(rhs);
+      else
+        return lhs.sgt(rhs);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const {
+    return !(*this > op);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const {
+    enum {
+      _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2))
+    };
+    ap_private<_AP_MAX_W, _AP_S> lhs(*this);
+    ap_private<_AP_MAX_W, _AP_S2> rhs(op);
+    if (_AP_S == _AP_S2)
+      return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs);
+    else if (_AP_W < 32 && _AP_W2 < 32)
+      return lhs.slt(rhs);
+    else if (_AP_S)
+      if (_AP_W2 >= _AP_W)
+        return lhs.ult(rhs);
+      else
+        return lhs.slt(rhs);
+    else if (_AP_W >= _AP_W2)
+      return lhs.ult(rhs);
+    else
+      return lhs.slt(rhs);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const {
+    return !(*this < op);
+  }
+
+  /// Bit and Part Select
+  //--------------------------------------------------------------
+  // FIXME now _private_range_ref refs to _AP_ROOT_TYPE(struct ssdm_int).
+  INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) {
+    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
+  }
+
+  INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const {
+    return _private_range_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>*>(this), Hi, Lo);
+  }
+
+  INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const {
+    return _private_range_ref<_AP_W, _AP_S>(
+        (const_cast<ap_private<_AP_W, _AP_S>*>(this)), Hi, Lo);
+  }
+
+  INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) {
+    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
+  }
+
+  INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE _private_bit_ref<_AP_W, _AP_S> operator[](
+      const ap_private<_AP_W2, _AP_S2>& index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int());
+  }
+
+  INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](
+      const ap_private<_AP_W2, _AP_S2>& index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index.to_int());
+  }
+
+  INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE _private_bit_ref<_AP_W, _AP_S> bit(
+      const ap_private<_AP_W2, _AP_S2>& index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int());
+  }
+
+  INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE const _private_bit_ref<_AP_W, _AP_S> bit(
+      const ap_private<_AP_W2, _AP_S2>& index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index.to_int());
+  }
+
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      ap_private<_AP_W2, _AP_S2> >
+// concat(const ap_private<_AP_W2, _AP_S2>& a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      ap_private<_AP_W2, _AP_S2> >
+// concat(ap_private<_AP_W2, _AP_S2>& a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(const ap_private<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       *this, const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(ap_private<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this), a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(ap_private<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      _private_range_ref<_AP_W2, _AP_S2> >
+// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        _private_range_ref<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      _private_range_ref<_AP_W2, _AP_S2> >
+// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        _private_range_ref<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                      _private_bit_ref<_AP_W2, _AP_S2> >
+// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                        _private_bit_ref<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                      _private_bit_ref<_AP_W2, _AP_S2> >
+// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                        _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                      ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >
+// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                        ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>&>(a2));
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                      ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >
+// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                        ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this,
+//                                                                       a2);
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_concat_ref<
+//     _AP_W, ap_private, _AP_W2,
+//     af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
+//               &a2) const {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, _AP_W2,
+//       af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<
+//           af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2));
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_concat_ref<
+//     _AP_W, ap_private, _AP_W2,
+//     af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, _AP_W2,
+//       af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this,
+//                                                                      a2);
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE
+// ap_concat_ref<_AP_W, ap_private, 1,
+//               af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
+//               &a2) const {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, 1,
+//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(
+//           a2));
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE
+// ap_concat_ref<_AP_W, ap_private, 1,
+//               af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(
+//     af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, 1,
+//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator&(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this & a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator|(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this | a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator^(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this ^ a2.get();
+// }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return (VAL & mask) == mask; }
+
+  INLINE bool nand_reduce() const { return (VAL & mask) != mask; }
+
+  INLINE bool or_reduce() const { return (bool)VAL; }
+
+  INLINE bool nor_reduce() const { return VAL == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+
+  INLINE std::string to_string(uint8_t radix = 2, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+};  // End of class ap_private <_AP_W, _AP_S, true>
+
+template <int _AP_W, bool _AP_S>
+std::string ap_private<_AP_W, _AP_S, true>::toString(uint8_t radix,
+                                                     bool wantSigned) const {
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+         "Radix should be 2, 8, 10, or 16!");
+  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
+                                 "8", "9", "a", "b", "c", "d", "e", "f"};
+  std::string result;
+  if (radix != 10) {
+    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+    // because the number of bits per digit (1, 3 and 4 respectively) divides
+    // equally. We just shift until the value is zero.
+
+    // First, check for a zero value and just short circuit the logic below.
+    if (*this == (uint64_t)(0)) {
+      // Always generate a radix indicator because fixed-point
+      // formats require it.
+      switch (radix) {
+        case 2:
+          result = "0b0";
+          break;
+        case 8:
+          result = "0o0";
+          break;
+        case 16:
+          result = "0x0";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+    } else {
+      ap_private<_AP_W, false, true> tmp(*this);
+      size_t insert_at = 0;
+      bool leading_zero = true;
+      if (wantSigned && isNegative()) {
+        // They want to print the signed version and it is a negative value
+        // Flip the bits and add one to turn it into the equivalent positive
+        // value and put a '-' in the result.
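+        // e.g. with 4 bits: 0b1011 (-5) -> flip -> 0b0100 -> +1 -> 0b0101,
+        // i.e. the magnitude 5 (illustrative example of the two's-complement
+        // negation performed by the next two statements).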
+        tmp.flip();
+        tmp++;
+        result = "-";
+        insert_at = 1;
+        leading_zero = false;
+      }
+      switch (radix) {
+        case 2:
+          result += "0b";
+          break;
+        case 8:
+          result += "0o";
+          break;
+        case 16:
+          result += "0x";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+      insert_at += 2;
+
+      // Just shift tmp right for each digit width until it becomes zero
+      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
+      uint64_t mask = radix - 1;
+      ap_private<_AP_W, false, true> zero(0);
+      unsigned bits = 0;
+      bool msb = false;
+      while (tmp.ne(zero)) {
+        unsigned digit = (unsigned)(tmp.get_VAL() & mask);
+        result.insert(insert_at, digits[digit]);
+        tmp = tmp.lshr(shift);
+        bits++;
+        msb = (digit >> (shift - 1)) == 1;
+      }
+      bits *= shift;
+      if (bits < _AP_W && leading_zero && msb)
+        result.insert(insert_at, digits[0]);
+    }
+    return result;
+  }
+
+  ap_private<_AP_W, false, true> tmp(*this);
+  ap_private<6, false, true> divisor(radix);
+  ap_private<_AP_W, _AP_S, true> zero(0);
+  size_t insert_at = 0;
+  if (wantSigned && isNegative()) {
+    // They want to print the signed version and it is a negative value
+    // Flip the bits and add one to turn it into the equivalent positive
+    // value and put a '-' in the result.
+    tmp.flip();
+    tmp++;
+    result = "-";
+    insert_at = 1;
+  }
+  if (tmp == ap_private<_AP_W, false, true>(0ULL))
+    result = "0";
+  else
+    while (tmp.ne(zero)) {
+      ap_private<_AP_W, false, true> APdigit = tmp % divisor;
+      ap_private<_AP_W, false, true> tmp2 = tmp / divisor;
+      uint32_t digit = (uint32_t)(APdigit.getZExtValue());
+      assert(digit < radix && "divide failed");
+      result.insert(insert_at, digits[digit]);
+      tmp = tmp2;
+    }
+  return result;
+
+}  // End of ap_private<_AP_W, _AP_S, true>::toString()
+
+// bitwidth > 64
+template <int _AP_W, bool _AP_S>
+class ap_private<_AP_W, _AP_S, false> {
+  // SFINAE pattern. Only consider this class when _AP_W > 64
+  const static bool valid = ap_private_enable_if<(_AP_W > 64)>::isValid;
+
+#ifdef _MSC_VER
+#pragma warning(disable : 4521 4522)
+#endif
+ public:
+  enum { BitWidth = _AP_W, _AP_N = (_AP_W + 63) / 64 };
+  static const int width = _AP_W;
+
+ private:
+  /// This constructor is used only internally for speed of construction of
+  /// temporaries. It is unsafe for general use so it is not public.
+
+  /* Constructors */
+  /// Note that numWords can be smaller or larger than the corresponding bit
+  /// width but any extraneous bits will be dropped.
+  /// @param numWords the number of words in bigVal
+  /// @param bigVal a sequence of words to form the initial value of the
+  /// ap_private
+  /// @brief Construct an ap_private, initialized as bigVal[].
+  INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) {
+    set_canary();
+    assert(bigVal && "Null pointer detected!");
+    {
+      // Get memory, cleared to 0
+      memset(pVal, 0, _AP_N * sizeof(uint64_t));
+
+      // Calculate the number of words to copy
+      uint32_t words = AESL_std::min<uint32_t>(numWords, _AP_N);
+      // Copy the words from bigVal to pVal
+      memcpy(pVal, bigVal, words * APINT_WORD_SIZE);
+      if (words >= _AP_W) clearUnusedBits();
+      // Make sure unused high bits are cleared
+    }
+    check_canary();
+  }
+
+  /// This constructor interprets Val as a string in the given radix. The
+  /// interpretation stops when the first character that is not suitable for the
+  /// radix is encountered. Acceptable radix values are 2, 8, 10 and 16. It is
+  /// an error for the value implied by the string to require more bits than
+  /// numBits.
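+  /// (e.g. the string "255" in radix 10 already needs 8 bits.)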
+  /// @param val the string to be interpreted
+  /// @param radix the radix of Val to use for the interpretation
+  /// @brief Construct an ap_private from a string representation.
+  INLINE ap_private(const std::string& val, uint8_t radix = 2) {
+    set_canary();
+    assert(!val.empty() && "The input string is empty.");
+    const char* c_str = val.c_str();
+    fromString(c_str, val.size(), radix);
+    check_canary();
+  }
+
+  /// This constructor interprets the slen characters starting at StrStart as
+  /// a string in the given radix. The interpretation stops when the first
+  /// character that is not suitable for the radix is encountered. Acceptable
+  /// radix values are 2, 8, 10 and 16. It is an error for the value implied by
+  /// the string to require more bits than numBits.
+  /// @param strStart the start of the string to be interpreted
+  /// @param slen the maximum number of characters to interpret
+  /// @param radix the radix to use for the conversion
+  /// @brief Construct an ap_private from a string representation.
+  /// This method does not consider whether it is negative or not.
+  INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) {
+    set_canary();
+    fromString(strStart, slen, radix);
+    check_canary();
+  }
+
+  INLINE void report() {
+    _AP_ERROR(_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024,
+              "ap_%sint<%d>: Bitwidth exceeds the "
+              "default max value %d. Please use macro "
+              "AP_INT_MAX_W to set a larger max value.",
+              _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024);
+  }
+  /// This union is used to store the integer value. When the
+  /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal.
+
+  /// This enum is used to hold the constants we needed for ap_private.
+  // uint64_t VAL; ///< Used to store the <= 64 bits integer value.
+  uint64_t pVal[_AP_N];  ///< Used to store the >64 bits integer value.
+#ifdef AP_CANARY
+  uint64_t CANARY;
+  INLINE void check_canary() { assert(CANARY == (uint64_t)0xDEADBEEFDEADBEEF); }
+  INLINE void set_canary() { CANARY = (uint64_t)0xDEADBEEFDEADBEEF; }
+#else
+  INLINE void check_canary() {}
+  INLINE void set_canary() {}
+#endif
+
+ public:
+  typedef typename valtype<8, _AP_S>::Type ValType;
+  typedef ap_private<_AP_W, _AP_S> Type;
+  // FIXME remove friend type?
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  friend struct ap_fixed_base;
+  /// return type of variety of operations
+  //----------------------------------------------------------
+  template <int _AP_W2, bool _AP_S2>
+  struct RType {
+    enum {
+      mult_w = _AP_W + _AP_W2,
+      mult_s = _AP_S || _AP_S2,
+      plus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      plus_s = _AP_S || _AP_S2,
+      minus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      minus_s = true,
+      div_w = _AP_W + _AP_S2,
+      div_s = _AP_S || _AP_S2,
+      mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)),
+      mod_s = _AP_S,
+      logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)),
+      logic_s = _AP_S || _AP_S2
+    };
+    typedef ap_private<mult_w, mult_s> mult;
+    typedef ap_private<plus_w, plus_s> plus;
+    typedef ap_private<minus_w, minus_s> minus;
+    typedef ap_private<logic_w, logic_s> logic;
+    typedef ap_private<div_w, div_s> div;
+    typedef ap_private<mod_w, mod_s> mod;
+    typedef ap_private<_AP_W, _AP_S> arg1;
+    typedef bool reduce;
+  };
+
+  INLINE uint64_t& get_VAL(void) { return pVal[0]; }
+  INLINE uint64_t get_VAL(void) const { return pVal[0]; }
+  INLINE uint64_t get_VAL(void) const volatile { return pVal[0]; }
+  INLINE void set_VAL(uint64_t value) { pVal[0] = value; }
+  INLINE uint64_t& get_pVal(int index) { return pVal[index]; }
+  INLINE uint64_t* get_pVal() { return pVal; }
+  INLINE const uint64_t* get_pVal() const { return pVal; }
+  INLINE uint64_t get_pVal(int index) const { return pVal[index]; }
+  INLINE uint64_t* get_pVal() const volatile { return pVal; }
+  INLINE uint64_t get_pVal(int index) const volatile { return pVal[index]; }
+  INLINE void set_pVal(int i, uint64_t value) { pVal[i] = value; }
+
+  /// This enum is used to hold the constants we needed for ap_private.
+  enum {
+    APINT_BITS_PER_WORD = sizeof(uint64_t) * 8,  ///< Bits in a word
+    APINT_WORD_SIZE = sizeof(uint64_t)           ///< Byte size of a word
+  };
+
+  enum {
+    excess_bits = (_AP_W % APINT_BITS_PER_WORD)
+                      ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD)
+                      : 0
+  };
+  static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits));
+
+ public:
+  // NOTE changed to explicit to be consistent with ap_private
+  explicit INLINE ap_private(const char* val) {
+    set_canary();
+    unsigned char radix = 10;
+    std::string str = ap_private_ops::parseString(val, radix);  // determine radix.
+    std::string::size_type pos = str.find('.');
+    if (pos != std::string::npos) str = str.substr(0, pos);  // keep the integer part only
+    ap_private ap_private_val(str, radix);
+    operator=(ap_private_val);
+    report();
+    check_canary();
+  }
+
+  INLINE ap_private(const char* val, unsigned char rd) {
+    set_canary();
+    unsigned char radix = rd;
+    std::string str = ap_private_ops::parseString(val, radix);  // determine radix.
+    std::string::size_type pos = str.find('.');
+    if (pos != std::string::npos) str = str.substr(0, pos);  // keep the integer part only
+    ap_private ap_private_val(str, radix);
+    operator=(ap_private_val);
+    report();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ref.get();
+    report();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ((uint64_t)(bool)ref);
+    report();
+    check_canary();
+  }
+
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) {
+//   set_canary();
+//   *this = ref.get();
+//   report();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = ((val.operator ap_private<_AP_W2, false>()));
+//   report();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = (uint64_t)(bool)val;
+//   report();
+//   check_canary();
+// }
+
+  /// Simply makes *this a copy of that.
+  /// @brief Copy Constructor.
+  INLINE ap_private(const ap_private& that) {
+    set_canary();
+    memcpy(pVal, that.get_pVal(), _AP_N * APINT_WORD_SIZE);
+    clearUnusedBits();
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, false>& that) {
+    set_canary();
+    operator=(that);
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, false>& that) {
+    set_canary();
+    operator=(const_cast<const ap_private<_AP_W1, _AP_S1, false>&>(that));
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, true>& that) {
+    set_canary();
+    static const uint64_t that_sign_ext_mask =
+        (_AP_W1 == APINT_BITS_PER_WORD)
+            ? 0
+            : ~0ULL >> (_AP_W1 % APINT_BITS_PER_WORD)
+                          << (_AP_W1 % APINT_BITS_PER_WORD);
+    if (that.isNegative()) {
+      pVal[0] = that.get_VAL() | that_sign_ext_mask;
+      memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1));
+    } else {
+      pVal[0] = that.get_VAL();
+      memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1));
+    }
+    clearUnusedBits();
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, true>& that) {
+    set_canary();
+    operator=(const_cast<const ap_private<_AP_W1, _AP_S1, true>&>(that));
+    check_canary();
+  }
+
+  /// @brief Destructor.
+  // virtual ~ap_private() {}
+  INLINE ~ap_private() { check_canary(); }
+
+  /// @name Constructors
+  /// @{
+
+  /// Default constructor that creates an uninitialized ap_private. This is
+  /// useful for object deserialization (pair this with the static method Read).
+  INLINE ap_private() {
+    set_canary();
+    clearUnusedBits();
+    check_canary();
+  }
+
+  INLINE ap_private(uint64_t* val, uint32_t bits = _AP_W) { assert(0); }
+  INLINE ap_private(const uint64_t* const val, uint32_t bits) { assert(0); }
+
+/// If isSigned is true then val is treated as if it were a signed value
+/// (i.e. as an int64_t) and the appropriate sign extension to the bit width
+/// will be done. Otherwise, no sign extension occurs (high order bits beyond
+/// the range of val are zero filled).
+/// @param numBits the bit width of the constructed ap_private
+/// @param val the initial value of the ap_private
+/// @param isSigned how to treat signedness of val
+/// @brief Create a new ap_private of numBits width, initialized as val.
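+/// e.g. (illustrative) ap_private<128, true> x(-2) stores
+/// pVal[0] = 0xFFFFFFFFFFFFFFFE and fills pVal[1] with ~0ULL through the
+/// sign-extension branch of the macro below; an unsigned initializer leaves
+/// the upper words zero.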
+#define CTOR(TYPE, SIGNED)                                     \
+  INLINE ap_private(TYPE val, bool isSigned = SIGNED) {        \
+    set_canary();                                              \
+    pVal[0] = (ValType)val;                                    \
+    if (isSigned && int64_t(pVal[0]) < 0) {                    \
+      memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1));    \
+    } else {                                                   \
+      memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1));     \
+    }                                                          \
+    clearUnusedBits();                                         \
+    check_canary();                                            \
+  }
+
+  CTOR(bool, false)
+  CTOR(char, CHAR_IS_SIGNED)
+  CTOR(signed char, true)
+  CTOR(unsigned char, false)
+  CTOR(short, true)
+  CTOR(unsigned short, false)
+  CTOR(int, true)
+  CTOR(unsigned int, false)
+  CTOR(long, true)
+  CTOR(unsigned long, false)
+  CTOR(ap_slong, true)
+  CTOR(ap_ulong, false)
+#if 0
+  CTOR(half, false)
+  CTOR(float, false)
+  CTOR(double, false)
+#endif
+#undef CTOR
+
+  /// @returns true if the number of bits <= 64, false otherwise.
+  /// @brief Determine if this ap_private just has one word to store value.
+  INLINE bool isSingleWord() const { return false; }
+
+  /// @returns the word position for the specified bit position.
+  /// @brief Determine which word a bit is in.
+  static INLINE uint32_t whichWord(uint32_t bitPosition) {
+    // return bitPosition / APINT_BITS_PER_WORD;
+    return (bitPosition) >> 6;
+  }
+
+  /// @returns the bit position in a word for the specified bit position
+  /// in the ap_private.
+  /// @brief Determine which bit in a word a bit is in.
+  static INLINE uint32_t whichBit(uint32_t bitPosition) {
+    // return bitPosition % APINT_BITS_PER_WORD;
+    return bitPosition & 0x3f;
+  }
+
+  /// bit at a specific bit position. This is used to mask the bit in the
+  /// corresponding word.
+  /// @returns a uint64_t with only bit at "whichBit(bitPosition)" set
+  /// @brief Get a single bit mask.
+  static INLINE uint64_t maskBit(uint32_t bitPosition) {
+    return 1ULL << (whichBit(bitPosition));
+  }
+
+  /// @returns the corresponding word for the specified bit position.
+  /// @brief Get the word corresponding to a bit position
+  INLINE uint64_t getWord(uint32_t bitPosition) const {
+    return pVal[whichWord(bitPosition)];
+  }
+
+  /// This method is used internally to clear the top "N" bits in the high order
+  /// word that are not used by the ap_private. This is needed after the most
+  /// significant word is assigned a value to ensure that those bits are
+  /// zero'd out.
+  /// @brief Clear unused high order bits
+  INLINE void clearUnusedBits(void) volatile
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    pVal[_AP_N - 1] =
+        _AP_S ? ((((int64_t)pVal[_AP_N - 1]) << (excess_bits)) >> excess_bits)
+              : (excess_bits
+                     ? ((pVal[_AP_N - 1]) << (excess_bits)) >> (excess_bits)
+                     : pVal[_AP_N - 1]);
+  }
+
+  INLINE void clearUnusedBitsToZero(void) { pVal[_AP_N - 1] &= mask; }
+
+  INLINE void clearUnusedBitsToOne(void) { pVal[_AP_N - 1] |= mask; }
+
+  /// This is used by the constructors that take string arguments.
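+  /// e.g. fromString("ff", 2, 16) yields 255; a "0x"/"0b"/"0o"/"0d" prefix is
+  /// consumed, and _AP_WARNING fires when the prefix disagrees with the radix
+  /// argument (illustrative note on the prefix handling below).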
+  /// @brief Convert a char array into an ap_private
+  INLINE void fromString(const char* str, uint32_t slen, uint8_t radix) {
+    enum { numbits = _AP_W };
+    bool isNeg = str[0] == '-';
+    if (isNeg) {
+      str++;
+      slen--;
+    }
+
+    if (str[0] == '0' && (str[1] == 'b' || str[1] == 'B')) {
+      // if (radix == 0) radix = 2;
+      _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.",
+                  str, 2, radix);
+      str += 2;
+      slen -= 2;
+    } else if (str[0] == '0' && (str[1] == 'o' || str[1] == 'O')) {
+      // if (radix == 0) radix = 8;
+      _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.",
+                  str, 8, radix);
+      str += 2;
+      slen -= 2;
+    } else if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) {
+      // if (radix == 0) radix = 16;
+      _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.",
+                  str, 16, radix);
+      str += 2;
+      slen -= 2;
+    } else if (str[0] == '0' && (str[1] == 'd' || str[1] == 'D')) {
+      // if (radix == 0) radix = 10;
+      _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.",
+                  str, 10, radix);
+      str += 2;
+      slen -= 2;
+    } else if (radix == 0) {
+      // radix = 2; // XXX default value
+    }
+
+    // Check our assumptions here
+    assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+           "Radix should be 2, 8, 10, or 16!");
+    assert(str && "String is null?");
+
+    // skip any leading zero
+    while (*str == '0' && *(str + 1) != '\0') {
+      str++;
+      slen--;
+    }
+    assert((slen <= numbits || radix != 2) && "Insufficient bit width");
+    assert(((slen - 1) * 3 <= numbits || radix != 8) &&
+           "Insufficient bit width");
+    assert(((slen - 1) * 4 <= numbits || radix != 16) &&
+           "Insufficient bit width");
+    assert((((slen - 1) * 64) / 22 <= numbits || radix != 10) &&
+           "Insufficient bit width");
+
+    // clear bits
+    memset(pVal, 0, _AP_N * sizeof(uint64_t));
+
+    // Figure out if we can shift instead of multiply
+    uint32_t shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
+
+    // Set up an ap_private for the digit to add outside the loop so we don't
+    // constantly construct/destruct it.
+    uint64_t bigVal[_AP_N];
+    memset(bigVal, 0, _AP_N * sizeof(uint64_t));
+    ap_private<_AP_W, _AP_S> apdigit(getBitWidth(), bigVal);
+    ap_private<_AP_W, _AP_S> apradix(radix);
+
+    // Enter digit traversal loop
+    for (unsigned i = 0; i < slen; i++) {
+      // Get a digit
+      uint32_t digit = 0;
+      char cdigit = str[i];
+      if (radix == 16) {
+#define isxdigit(c)                                                \
+  (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') ||     \
+   ((c) >= 'A' && (c) <= 'F'))
+#define isdigit(c) ((c) >= '0' && (c) <= '9')
+        if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string");
+        if (isdigit(cdigit))
+          digit = cdigit - '0';
+        else if (cdigit >= 'a')
+          digit = cdigit - 'a' + 10;
+        else if (cdigit >= 'A')
+          digit = cdigit - 'A' + 10;
+        else
+          assert(0 && "huh? we shouldn't get here");
+      } else if (isdigit(cdigit)) {
+        digit = cdigit - '0';
+      } else if (cdigit != '\0') {
+        assert(0 && "Invalid character in digit string");
+      }
+#undef isxdigit
+#undef isdigit
+      // Shift or multiply the value by the radix
+      if (shift)
+        *this <<= shift;
+      else
+        *this *= apradix;
+
+      // Add in the digit we just interpreted
+      apdigit.set_VAL(digit);
+      *this += apdigit;
+    }
+    // If it's negative, put it in two's complement form
+    if (isNeg) {
+      (*this)--;
+      this->flip();
+    }
+    clearUnusedBits();
+  }
+
+  INLINE ap_private read() volatile { return *this; }
+
+  INLINE void write(const ap_private& op2) volatile { *this = (op2); }
+
+  INLINE operator ValType() const { return get_VAL(); }
+
+  INLINE int to_uchar() const { return (unsigned char)get_VAL(); }
+
+  INLINE int to_char() const { return (signed char)get_VAL(); }
+
+  INLINE int to_ushort() const { return (unsigned short)get_VAL(); }
+
+  INLINE int to_short() const { return (short)get_VAL(); }
+
+  INLINE int to_int() const { return (int)get_VAL(); }
+
+  INLINE unsigned to_uint() const { return (unsigned)get_VAL(); }
+
+  INLINE long to_long() const { return (long)get_VAL(); }
+
+  INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); }
+
+  INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); }
+
+  INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); }
+
+  INLINE double to_double() const {
+    if (isNegative())
+      return roundToDouble(true);
+    else
+      return roundToDouble(false);
+  }
+
+  INLINE unsigned length() const { return _AP_W; }
+
+  /* Reverse the contents of the ap_private instance, i.e. LSB becomes MSB
+   * and vice versa. */
+  INLINE ap_private& reverse() {
+    for (int i = 0; i < _AP_W / 2; ++i) {
+      bool tmp = operator[](i);
+      if (operator[](_AP_W - 1 - i))
+        set(i);
+      else
+        clear(i);
+      if (tmp)
+        set(_AP_W - 1 - i);
+      else
+        clear(_AP_W - 1 - i);
+    }
+    clearUnusedBits();
+    return *this;
+  }
+
+  /* Return true if the value of the ap_private instance is zero. */
+  INLINE bool iszero() const { return isMinValue(); }
+
+  INLINE bool to_bool() const { return !iszero(); }
+
+  /* x < 0 */
+  INLINE bool sign() const {
+    if (isNegative()) return true;
+    return false;
+  }
+
+  /* x[i] = !x[i] */
+  INLINE void invert(int i) {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    flip(i);
+  }
+
+  /* x[i] */
+  INLINE bool test(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return operator[](i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set_bit(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // FIXME different argument for different action?
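+  // set(uint32_t) raises one bit in the word that holds it; e.g. set(70)
+  // on a multiword value sets bit 6 of pVal[1], since whichWord(70) == 1
+  // and whichBit(70) == 6 (illustrative note, not vendor text).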
+  INLINE ap_private& set(uint32_t bitPosition) {
+    pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
+    clearUnusedBits();
+    return *this;
+  }
+
+  INLINE void set() {
+    for (int i = 0; i < _AP_N; ++i) pVal[i] = ~0ULL;
+    clearUnusedBits();
+  }
+
+  // Get the value of ith bit
+  INLINE bool get(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return ((maskBit(i) & (pVal[whichWord(i)])) != 0);
+  }
+
+  // Get the value of ith bit
+  INLINE bool get_bit(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return ((maskBit(i) & (pVal[whichWord(i)])) != 0);
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the left
+  INLINE void lrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(shl(n) | lshr(_AP_W - n));
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the right
+  INLINE void rrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(lshr(n) | shl(_AP_W - n));
+  }
+
+  /// Set the given bit to 0 whose position is given as "bitPosition".
+  /// @brief Set a given bit to 0.
+  INLINE ap_private& clear(uint32_t bitPosition) {
+    pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition);
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// @brief Set every bit to 0.
+  INLINE void clear() { memset(pVal, 0, _AP_N * APINT_WORD_SIZE); }
+
+  /// @brief Toggle every bit to its opposite value.
+  ap_private& flip() {
+    for (int i = 0; i < _AP_N; ++i) pVal[i] ^= ~0ULL;
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// @brief Toggles a given bit to its opposite value.
+  INLINE ap_private& flip(uint32_t bitPosition) {
+    assert(bitPosition < BitWidth && "Out of the bit-width range!");
+    set_bit(bitPosition, !get_bit(bitPosition));
+    return *this;
+  }
+
+  // complements every bit
+  INLINE void b_not() { flip(); }
+
+  INLINE ap_private getLoBits(uint32_t numBits) const {
+    return ap_private_ops::lshr(ap_private_ops::shl(*this, _AP_W - numBits),
+                                _AP_W - numBits);
+  }
+
+  INLINE ap_private getHiBits(uint32_t numBits) const {
+    return ap_private_ops::lshr(*this, _AP_W - numBits);
+  }
+
+  // Binary Arithmetic
+  //-----------------------------------------------------------
+
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator&(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this & a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator|(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this | a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator^(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this ^ a2.get();
+// }
+
+/// Arithmetic assign
+//-------------------------------------------------------------
+
+#define OP_BIN_LOGIC_ASSIGN_AP(Sym)                                          \
+  template <int _AP_W1, bool _AP_S1>                                         \
+  INLINE ap_private& operator Sym(const ap_private<_AP_W1, _AP_S1>& RHS) {   \
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;                    \
+    uint32_t numWords = AESL_std::min((int)_AP_N, _AP_N1);                   \
+    uint32_t i;                                                              \
+    if (_AP_W != _AP_W1)                                                     \
+      fprintf(stderr,                                                        \
+              "Warning! Bitsize mismatch for ap_[u]int " #Sym " ap_[u]int.\n"); \
+    for (i = 0; i < numWords; ++i) pVal[i] Sym RHS.get_pVal(i);              \
+    if (_AP_N1 < _AP_N) {                                                    \
+      uint64_t ext = RHS.isNegative() ? ~0ULL : 0;                           \
+      for (; i < _AP_N; i++) pVal[i] Sym ext;                                \
+    }                                                                        \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }
+
+  OP_BIN_LOGIC_ASSIGN_AP(&=);
+  OP_BIN_LOGIC_ASSIGN_AP(|=);
+  OP_BIN_LOGIC_ASSIGN_AP(^=);
+#undef OP_BIN_LOGIC_ASSIGN_AP
+
+  /// Adds the RHS ap_private to this ap_private.
+  /// @returns this, after addition of RHS.
+  /// @brief Addition assignment operator.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    uint64_t RHSpVal[_AP_N1];
+    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
+    ap_private_ops::add(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
+                        _AP_S1);
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    uint64_t RHSpVal[_AP_N1];
+    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
+    ap_private_ops::sub(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
+                        _AP_S1);
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    // Get some bit facts about LHS and check for zero
+    uint32_t lhsBits = getActiveBits();
+    uint32_t lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
+    if (!lhsWords) {
+      // 0 * X ===> 0
+      return *this;
+    }
+
+    ap_private dupRHS = RHS;
+    // Get some bit facts about RHS and check for zero
+    uint32_t rhsBits = dupRHS.getActiveBits();
+    uint32_t rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
+    if (!rhsWords) {
+      // X * 0 ===> 0
+      clear();
+      return *this;
+    }
+
+    // Allocate space for the result
+    uint32_t destWords = rhsWords + lhsWords;
+    uint64_t* dest = (uint64_t*)malloc(destWords * sizeof(uint64_t));
+
+    // Perform the long multiply
+    ap_private_ops::mul(dest, pVal, lhsWords, dupRHS.get_pVal(), rhsWords,
+                        destWords);
+
+    // Copy result back into *this
+    clear();
+    uint32_t wordsToCopy = destWords >= _AP_N ? _AP_N : destWords;
+
+    memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
+
+    uint64_t ext = (isNegative() ^ RHS.isNegative()) ? ~0ULL : 0ULL;
+    for (int i = wordsToCopy; i < _AP_N; i++) pVal[i] = ext;
+    clearUnusedBits();
+    // delete dest array and return
+    free(dest);
+    return *this;
+  }
+
+#define OP_ASSIGN_AP(Sym)                                                  \
+  template <int _AP_W2, bool _AP_S2>                                       \
+  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
+    *this = operator Sym(op);                                              \
+    return *this;                                                          \
+  }
+
+  OP_ASSIGN_AP(/)
+  OP_ASSIGN_AP(%)
+#undef OP_ASSIGN_AP
+
+#define OP_BIN_LOGIC_AP(Sym)                                                 \
+  template <int _AP_W1, bool _AP_S1>                                         \
+  INLINE typename RType<_AP_W1, _AP_S1>::logic operator Sym(                 \
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {                         \
+    enum {                                                                   \
+      numWords = (RType<_AP_W1, _AP_S1>::logic_w + APINT_BITS_PER_WORD - 1) / \
+                 APINT_BITS_PER_WORD                                         \
+    };                                                                       \
+    typename RType<_AP_W1, _AP_S1>::logic Result;                            \
+    uint32_t i;                                                              \
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;                    \
+    uint32_t min_N = std::min((int)_AP_N, _AP_N1);                           \
+    uint32_t max_N = std::max((int)_AP_N, _AP_N1);                           \
+    for (i = 0; i < min_N; ++i)                                              \
+      Result.set_pVal(i, pVal[i] Sym RHS.get_pVal(i));                       \
+    if (numWords > i) {                                                      \
+      uint64_t ext = ((_AP_N < _AP_N1 && isNegative()) ||                    \
+                      (_AP_N1 < _AP_N && RHS.isNegative()))                  \
+                         ? ~0ULL                                             \
+                         : 0;                                               \
+      if (_AP_N > _AP_N1)                                                   \
+        for (; i < max_N; i++) Result.set_pVal(i, pVal[i] Sym ext);         \
+      else                                                                  \
+        for (; i < max_N; i++) Result.set_pVal(i, RHS.get_pVal(i) Sym ext); \
+      if (numWords > i) {                                                   \
+        uint64_t ext2 = ((_AP_N > _AP_N1 && isNegative()) ||                \
+                         (_AP_N1 > _AP_N && RHS.isNegative()))              \
+                            ? ~0ULL                                         \
+                            : 0;                                            \
+        Result.set_pVal(i, ext Sym ext2);                                   \
+      }                                                                     \
+    }                                                                       \
+    Result.clearUnusedBits();                                               \
+    return Result;                                                          \
+  }
+
+  OP_BIN_LOGIC_AP(|);
+  OP_BIN_LOGIC_AP(&);
+  OP_BIN_LOGIC_AP(^);
+
+#undef OP_BIN_LOGIC_AP
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::plus operator+(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::plus Result, lhs(*this), rhs(RHS);
+    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::plus_w + 63) / 64;
+    ap_private_ops::add(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
+                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::minus Result, lhs(*this), rhs(RHS);
+    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::minus_w + 63) / 64;
+    ap_private_ops::sub(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
+                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::mult operator*(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::mult temp = *this;
+    temp *= RHS;
+    return temp;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    return typename RType<_AP_W2, _AP_S2>::div(
+        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    typename RType<_AP_W2, _AP_S2>::mod res =
+        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs)
lhs.srem(rhs) + : lhs.urem(rhs)); + return res; + } + +#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator<<(const TYPE op) const { \ + if (op >= _AP_W) return ap_private(0); \ + if (SIGNED && op < 0) return *this >> (0 - op); \ + return shl(op); \ + } + + OP_LEFT_SHIFT_CTYPE(int, true) + // OP_LEFT_SHIFT_CTYPE(bool, false) + OP_LEFT_SHIFT_CTYPE(signed char, true) + OP_LEFT_SHIFT_CTYPE(unsigned char, false) + OP_LEFT_SHIFT_CTYPE(short, true) + OP_LEFT_SHIFT_CTYPE(unsigned short, false) + OP_LEFT_SHIFT_CTYPE(unsigned int, false) + OP_LEFT_SHIFT_CTYPE(long, true) + OP_LEFT_SHIFT_CTYPE(unsigned long, false) + OP_LEFT_SHIFT_CTYPE(unsigned long long, false) + OP_LEFT_SHIFT_CTYPE(long long, true) +#if 0 + OP_LEFT_SHIFT_CTYPE(half, false) + OP_LEFT_SHIFT_CTYPE(float, false) + OP_LEFT_SHIFT_CTYPE(double, false) +#endif +#undef OP_LEFT_SHIFT_CTYPE + + template + INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this << sh; + } else { + int sh = op2.to_int(); + return *this << sh; + } + } + +#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator>>(const TYPE op) const { \ + if (op >= _AP_W) { \ + if (isNegative()) \ + return ap_private(-1); \ + else \ + return ap_private(0); \ + } \ + if ((SIGNED) && op < 0) return *this << (0 - op); \ + if (_AP_S) \ + return ashr(op); \ + else \ + return lshr(op); \ + } + + // OP_RIGHT_SHIFT_CTYPE(bool, false) + OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_RIGHT_SHIFT_CTYPE(signed char, true) + OP_RIGHT_SHIFT_CTYPE(unsigned char, false) + OP_RIGHT_SHIFT_CTYPE(short, true) + OP_RIGHT_SHIFT_CTYPE(unsigned short, false) + OP_RIGHT_SHIFT_CTYPE(int, true) + OP_RIGHT_SHIFT_CTYPE(unsigned int, false) + OP_RIGHT_SHIFT_CTYPE(long, true) + OP_RIGHT_SHIFT_CTYPE(unsigned long, false) + OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) + OP_RIGHT_SHIFT_CTYPE(long long, true) +#if 0 + OP_RIGHT_SHIFT_CTYPE(half, false) + OP_RIGHT_SHIFT_CTYPE(float, false) + OP_RIGHT_SHIFT_CTYPE(double, false) +#endif +#undef OP_RIGHT_SHIFT_CTYPE + + template + INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this >> sh; + } else { + int sh = op2.to_int(); + return *this >> sh; + } + } + + /// Shift assign + //------------------------------------------------------------------ + // TODO call clearUnusedBits ? +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + INLINE ap_private& operator Sym##=(unsigned int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + return *this; \ + } + OP_ASSIGN_AP(>>) + OP_ASSIGN_AP(<<) +#undef OP_ASSIGN_AP + + /// Comparisons + //----------------------------------------------------------------- + INLINE bool operator==(const ap_private& RHS) const { + // Get some facts about the number of bits used in the two operands. + uint32_t n1 = getActiveBits(); + uint32_t n2 = RHS.getActiveBits(); + + // If the number of bits isn't the same, they aren't equal + if (n1 != n2) return false; + + // If the number of bits fits in a word, we only need to compare the low + // word. 
+ if (n1 <= APINT_BITS_PER_WORD) return pVal[0] == RHS.get_pVal(0); + + // Otherwise, compare everything + for (int i = whichWord(n1 - 1); i >= 0; --i) + if (pVal[i] != RHS.get_pVal(i)) return false; + return true; + } + + template + INLINE bool operator==(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W, _AP_W2), + }; + ap_private<_AP_MAX_W, false> lhs(*this); + ap_private<_AP_MAX_W, false> rhs(op); + return lhs == rhs; + } + + INLINE bool operator==(uint64_t Val) const { + uint32_t n = getActiveBits(); + if (n <= APINT_BITS_PER_WORD) + return pVal[0] == Val; + else + return false; + } + + template + INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this == op); + } + + template + INLINE bool operator!=(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + INLINE bool operator!=(uint64_t Val) const { return !((*this) == Val); } + + template + INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this > op); + } + + INLINE bool operator<(const ap_private& op) const { + return _AP_S ? slt(op) : ult(op); + } + + template + INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + } + + template + INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this < op); + } + + INLINE bool operator>(const ap_private& op) const { + return _AP_S ? sgt(op) : ugt(op); + } + + template + INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? 
lhs.sgt(rhs) : lhs.ugt(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + } + + /// Bit and Part Select + //-------------------------------------------------------------- + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + const_cast*>(this), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + (const_cast*>(this)), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> bit( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(ap_private<_AP_W2, _AP_S2>& a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE 
ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, +// a2); +// } +// +// template +// INLINE 
ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, +// a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast&>( +// a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); +// } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> ret(*this); + return ret; + } + + template + INLINE void set(const ap_private<_AP_W3, false>& val) { + operator=(ap_private<_AP_W3, _AP_S>(val)); + } + + /// + /// @name Value Tests + /// + /// This tests the high bit of this ap_private to determine if it is set. + /// @returns true if this ap_private is negative, false otherwise + /// @brief Determine sign of this ap_private. + INLINE bool isNegative() const { + // just for get rid of warnings + enum { shift = (_AP_W - APINT_BITS_PER_WORD * (_AP_N - 1) - 1) }; + static const uint64_t mask = 1ULL << (shift); + return _AP_S && (pVal[_AP_N - 1] & mask); + } + + /// This tests the high bit of the ap_private to determine if it is unset. + /// @brief Determine if this ap_private Value is positive (not negative). + INLINE bool isPositive() const { return !isNegative(); } + + /// This tests if the value of this ap_private is strictly positive (> 0). + /// @returns true if this ap_private is Positive and not zero. + /// @brief Determine if this ap_private Value is strictly positive. + INLINE bool isStrictlyPositive() const { + return isPositive() && (*this) != 0; + } + + /// This checks to see if the value has all bits of the ap_private are set or + /// not. + /// @brief Determine if all bits are set + INLINE bool isAllOnesValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum unsigned + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest unsigned value. + INLINE bool isMaxValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum signed + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest signed value. 
+  INLINE bool isMaxSignedValue() const {
+    return !isNegative() && countPopulation() == _AP_W - 1;
+  }
+
+  /// This checks to see if the value of this ap_private is the minimum
+  /// unsigned value for the ap_private's bit width.
+  /// @brief Determine if this is the smallest unsigned value.
+  INLINE bool isMinValue() const { return countPopulation() == 0; }
+
+  /// This checks to see if the value of this ap_private is the minimum
+  /// signed value for the ap_private's bit width.
+  /// @brief Determine if this is the smallest signed value.
+  INLINE bool isMinSignedValue() const {
+    return isNegative() && countPopulation() == 1;
+  }
+
+  /// This function returns a pointer to the internal storage of the
+  /// ap_private. This is useful for writing out the ap_private in binary
+  /// form without any conversions.
+  INLINE const uint64_t* getRawData() const { return &pVal[0]; }
+
+  // Square Root - this method computes and returns the square root of
+  // "this". Three mechanisms are used for computation. For small values
+  // (<= 5 bits), a table lookup is done. This gets some performance for
+  // common cases. For values using less than 52 bits, the value is
+  // converted to double and then the libc sqrt function is called. The
+  // result is rounded and then converted back to a uint64_t which is then
+  // used to construct the result. Finally, the Babylonian method for
+  // computing square roots is used.
+  INLINE ap_private sqrt() const {
+    // Determine the magnitude of the value.
+    uint32_t magnitude = getActiveBits();
+
+    // Use a fast table for some small values. This also gets rid of some
+    // rounding errors in libc sqrt for small values.
+    if (magnitude <= 5) {
+      static const uint8_t results[32] = {
+          /*     0 */ 0,
+          /*  1- 2 */ 1, 1,
+          /*  3- 6 */ 2, 2, 2, 2,
+          /*  7-12 */ 3, 3, 3, 3, 3, 3,
+          /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
+          /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+          /*    31 */ 6};
+      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ results[get_VAL()]);
+    }
+
+    // If the magnitude of the value fits in less than 52 bits (the
+    // precision of an IEEE double precision floating point value), then we
+    // can use the libc sqrt function which will probably use a hardware
+    // sqrt computation. This should be faster than the algorithm below.
+    if (magnitude < 52) {
+#ifdef _MSC_VER
+      // Amazingly, VC++ doesn't have round(). Add 0.5 before truncating so
+      // the result is rounded to the nearest integer.
+      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
+                                      uint64_t(::sqrt(double(get_VAL())) +
+                                               0.5));
+#else
+      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
+                                      uint64_t(
+                                          ::round(::sqrt(double(get_VAL())))));
+#endif
+    }
+
+    // Okay, all the short cuts are exhausted. We must compute it. The
+    // following is a classical Babylonian method for computing the square
+    // root. This code was adapted to APInt from a Wikipedia article on
+    // such computations.
+    // See http://www.wikipedia.org/ and go to the page named
+    // Calculate_an_integer_square_root.
+    uint32_t nbits = BitWidth, i = 4;
+    ap_private<_AP_W, _AP_S> testy(16);
+    ap_private<_AP_W, _AP_S> x_old(/*BitWidth,*/ 1);
+    ap_private<_AP_W, _AP_S> x_new(0);
+    ap_private<_AP_W, _AP_S> two(/*BitWidth,*/ 2);
+
+    // Select a good starting value using binary logarithms.
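+    // (Illustrative sketch, added -- not part of the original header: the
+    //  same Newton/Babylonian iteration for a plain uint64_t looks like
+    //
+    //    uint64_t isqrt(uint64_t n) {
+    //      if (n < 2) return n;
+    //      uint64_t x = n, y = (x + 1) / 2;
+    //      while (y < x) { x = y; y = (x + n / x) / 2; }
+    //      return x; // floor(sqrt(n))
+    //    }
+    //
+    //  The loops below perform the same iteration with multi-word
+    //  ap_private arithmetic, after first picking a starting value near
+    //  2^(nbits/2).)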
+ for (;; i += 2, testy = testy.shl(2)) + if (i >= nbits || this->ule(testy)) { + x_old = x_old.shl(i / 2); + break; + } + + // Use the Babylonian method to arrive at the integer square root: + for (;;) { + x_new = (this->udiv(x_old) + x_old).udiv(two); + if (x_old.ule(x_new)) break; + x_old = x_new; + } + + // Make sure we return the closest approximation + // NOTE: The rounding calculation below is correct. It will produce an + // off-by-one discrepancy with results from pari/gp. That discrepancy has + // been + // determined to be a rounding issue with pari/gp as it begins to use a + // floating point representation after 192 bits. There are no discrepancies + // between this algorithm and pari/gp for bit widths < 192 bits. + ap_private<_AP_W, _AP_S> square(x_old * x_old); + ap_private<_AP_W, _AP_S> nextSquare((x_old + 1) * (x_old + 1)); + if (this->ult(square)) + return x_old; + else if (this->ule(nextSquare)) { + ap_private<_AP_W, _AP_S> midpoint((nextSquare - square).udiv(two)); + ap_private<_AP_W, _AP_S> offset(*this - square); + if (offset.ult(midpoint)) + return x_old; + else + return x_old + 1; + } else + assert(0 && "Error in ap_private<_AP_W, _AP_S>::sqrt computation"); + return x_old + 1; + } + + /// + /// @Assignment Operators + /// + /// @returns *this after assignment of RHS. + /// @brief Copy assignment operator. + INLINE ap_private& operator=(const ap_private& RHS) { + if (this != &RHS) memcpy(pVal, RHS.get_pVal(), _AP_N * APINT_WORD_SIZE); + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator=(const volatile ap_private& RHS) { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + return *this; + } + INLINE void operator=(const ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + INLINE void operator=(const volatile ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#if 0 + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + 
clearUnusedBits(); + return *this; + } +#endif + +/// from all c types. +#define ASSIGN_OP_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_private& operator=(const C_TYPE rhs) { \ + ap_private<(_AP_W2), (_AP_S2)> tmp = rhs; \ + operator=(tmp); \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, sizeof(short) * 8, true) + ASSIGN_OP_FROM_INT(unsigned short, sizeof(unsigned short) * 8, false) + ASSIGN_OP_FROM_INT(int, sizeof(int) * 8, true) + ASSIGN_OP_FROM_INT(unsigned int, sizeof(unsigned int) * 8, false) + ASSIGN_OP_FROM_INT(long, sizeof(long) * 8, true) + ASSIGN_OP_FROM_INT(unsigned long, sizeof(unsigned long) * 8, false) + ASSIGN_OP_FROM_INT(ap_slong, sizeof(ap_slong) * 8, true) + ASSIGN_OP_FROM_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef ASSIGN_OP_FROM_INT + + /// from c string. + // XXX this is a must, to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct initialization, as ctor is explicit. + operator=(tmp); + return *this; + } + + /// + /// @name Unary Operators + /// + /// @returns a new ap_private value representing *this incremented by one + /// @brief Postfix increment operator. + INLINE const ap_private operator++(int) { + ap_private API(*this); + ++(*this); + return API; + } + + /// @returns *this incremented by one + /// @brief Prefix increment operator. + INLINE ap_private& operator++() { + ap_private_ops::add_1(pVal, pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// @returns a new ap_private representing *this decremented by one. + /// @brief Postfix decrement operator. + INLINE const ap_private operator--(int) { + ap_private API(*this); + --(*this); + return API; + } + + /// @returns *this decremented by one. + /// @brief Prefix decrement operator. + INLINE ap_private& operator--() { + ap_private_ops::sub_1(pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// Performs a bitwise complement operation on this ap_private. + /// @returns an ap_private that is the bitwise complement of *this + /// @brief Unary bitwise complement operator. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// Negates *this using two's complement logic. + /// @returns An ap_private value representing the negation of *this. + /// @brief Unary negation operator + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// Performs logical negation operation on this ap_private. + /// @returns true if *this is zero, false otherwise. + /// @brief Logical negation operator. 
+  INLINE bool operator!() const {
+    for (int i = 0; i < _AP_N; ++i)
+      if (pVal[i]) return false;
+    return true;
+  }
+
+  template <bool _AP_S1>
+  INLINE ap_private<_AP_W, _AP_S || _AP_S1> And(
+      const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return this->operator&(RHS);
+  }
+  template <bool _AP_S1>
+  INLINE ap_private Or(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return this->operator|(RHS);
+  }
+  template <bool _AP_S1>
+  INLINE ap_private Xor(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return this->operator^(RHS);
+  }
+
+  INLINE ap_private Mul(const ap_private& RHS) const {
+    ap_private Result(*this);
+    Result *= RHS;
+    return Result;
+  }
+
+  INLINE ap_private Add(const ap_private& RHS) const {
+    ap_private Result(0);
+    ap_private_ops::add(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N,
+                        _AP_N, _AP_S, _AP_S);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  INLINE ap_private Sub(const ap_private& RHS) const {
+    ap_private Result(0);
+    ap_private_ops::sub(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N,
+                        _AP_N, _AP_S, _AP_S);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  /// Arithmetic right-shift this ap_private by shiftAmt.
+  /// @brief Arithmetic right-shift function.
+  INLINE ap_private ashr(uint32_t shiftAmt) const {
+    assert(shiftAmt <= BitWidth && "Invalid shift amount, too big");
+    // Handle a degenerate case
+    if (shiftAmt == 0) return ap_private(*this);
+
+    // If all the bits were shifted out, the result is, technically,
+    // undefined. We return -1 if it was negative, 0 otherwise. We check
+    // this early to avoid issues in the algorithm below.
+    if (shiftAmt == BitWidth) {
+      if (isNegative())
+        return ap_private(-1);
+      else
+        return ap_private(0);
+    }
+
+    // Create some space for the result.
+    ap_private Retval(0);
+    uint64_t* val = Retval.get_pVal();
+
+    // Compute some values needed by the following shift algorithms
+    uint32_t wordShift =
+        shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word
+    uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift
+    uint32_t breakWord = _AP_N - 1 - offset; // last word affected
+    uint32_t bitsInWord = whichBit(BitWidth); // how many bits in last word?
+    if (bitsInWord == 0) bitsInWord = APINT_BITS_PER_WORD;
+
+    // If we are shifting whole words, just move whole words
+    if (wordShift == 0) {
+      // Move the words containing significant bits
+      for (uint32_t i = 0; i <= breakWord; ++i)
+        val[i] = pVal[i + offset]; // move whole word
+
+      // Adjust the top significant word for sign bit fill, if negative
+      if (isNegative())
+        if (bitsInWord < APINT_BITS_PER_WORD)
+          val[breakWord] |= ~0ULL << (bitsInWord); // set high bits
+    } else {
+      // Shift the low order words
+      for (uint32_t i = 0; i < breakWord; ++i) {
+        // This combines the shifted corresponding word with the low bits
+        // from the next word (shifted into this word's high bits).
+        val[i] = ((pVal[i + offset]) >> (wordShift));
+        val[i] |= ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift));
+      }
+
+      // Shift the break word. In this case there are no bits from the next
+      // word to include in this word.
+      val[breakWord] = (pVal[breakWord + offset]) >> (wordShift);
+
+      // Deal with sign extension in the break word, and possibly the word
+      // before it.
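+      // (Worked example, added for illustration -- not in the original
+      //  source: for a 128-bit value shifted right by 68, offset = 1 word
+      //  and wordShift = 4, so val[0] takes bits 68..127 of the input; for
+      //  a negative input the branch below then ORs ones into the vacated
+      //  high bits so the result stays sign-extended.)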
+ if (isNegative()) { + if (wordShift > bitsInWord) { + if (breakWord > 0) + val[breakWord - 1] |= + ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord)); + val[breakWord] |= ~0ULL; + } else + val[breakWord] |= (~0ULL << (bitsInWord - wordShift)); + } + } + + // Remaining words are 0 or -1, just assign them. + uint64_t fillValue = (isNegative() ? ~0ULL : 0); + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = fillValue; + Retval.clearUnusedBits(); + return Retval; + } + + /// Logical right-shift this ap_private by shiftAmt. + /// @brief Logical right-shift function. + INLINE ap_private lshr(uint32_t shiftAmt) const { + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids + // issues with shifting byt he size of the integer type, which produces + // undefined results in the code below. This is also an optimization. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. + ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + + // If we are shifting less than a word, compute the shift with a simple + // carry + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = _AP_N - 1; i >= 0; --i) { + val[i] = ((pVal[i]) >> (shiftAmt)) | carry; + carry = (pVal[i]) << (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < _AP_N - offset; ++i) val[i] = pVal[i + offset]; + for (uint32_t i = _AP_N - offset; i < _AP_N; i++) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + // Shift the low order words + uint32_t breakWord = _AP_N - offset - 1; + for (uint32_t i = 0; i < breakWord; ++i) + val[i] = ((pVal[i + offset]) >> (wordShift)) | + ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); + // Shift the break word. + val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); + + // Remaining words are 0 + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + /// Left-shift this ap_private by shiftAmt. + /// @brief Left-shift function. + INLINE ap_private shl(uint32_t shiftAmt) const { + assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids a + // lshr by the words size in the loop below which can produce incorrect + // results. It also avoids the expensive computation below for a common + // case. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. 
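+    // (Added illustration: shl mirrors lshr word-by-word -- for a 128-bit
+    //  value and shiftAmt = 68, offset = 1 and wordShift = 4, so result
+    //  word 1 is built from source word 0 shifted up by 4 bits, and result
+    //  word 0 becomes zero.)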
+ ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + // If we are shifting less than a word, do it the easy way + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = 0; i < _AP_N; i++) { + val[i] = ((pVal[i]) << (shiftAmt)) | carry; + carry = (pVal[i]) >> (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < offset; i++) val[i] = 0; + for (int i = offset; i < _AP_N; i++) val[i] = pVal[i - offset]; + Retval.clearUnusedBits(); + return Retval; + } + + // Copy whole words from this to Result. + uint32_t i = _AP_N - 1; + for (; i > offset; --i) + val[i] = (pVal[i - offset]) << (wordShift) | + (pVal[i - offset - 1]) >> (APINT_BITS_PER_WORD - wordShift); + val[offset] = (pVal[0]) << (wordShift); + for (i = 0; i < offset; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + INLINE ap_private rotl(uint32_t rotateAmt) const { + if (rotateAmt == 0) return ap_private(*this); + // Don't get too fancy, just use existing shift/or facilities + ap_private hi(*this); + ap_private lo(*this); + hi.shl(rotateAmt); + lo.lshr(BitWidth - rotateAmt); + return hi | lo; + } + + INLINE ap_private rotr(uint32_t rotateAmt) const { + if (rotateAmt == 0) return ap_private(*this); + // Don't get too fancy, just use existing shift/or facilities + ap_private hi(*this); + ap_private lo(*this); + lo.lshr(rotateAmt); + hi.shl(BitWidth - rotateAmt); + return hi | lo; + } + + /// Perform an unsigned divide operation on this ap_private by RHS. Both this + /// and + /// RHS are treated as unsigned quantities for purposes of this division. + /// @returns a new ap_private value containing the division result + /// @brief Unsigned division operation. + INLINE ap_private udiv(const ap_private& RHS) const { + // Get some facts about the LHS and RHS number of bits and words + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Divided by zero???"); + uint32_t lhsBits = this->getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Deal with some degenerate cases + if (!lhsWords) + // 0 / X ===> 0 + return ap_private(0); + else if (lhsWords < rhsWords || this->ult(RHS)) { + // X / Y ===> 0, iff X < Y + return ap_private(0); + } else if (*this == RHS) { + // X / X ===> 1 + return ap_private(1); + } else if (lhsWords == 1 && rhsWords == 1) { + // All high words are zero, just use native divide + return ap_private(this->pVal[0] / RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Quotient(0); // to hold result. + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, &Quotient, + (ap_private*)0); + return Quotient; + } + + /// Signed divide this ap_private by ap_private RHS. + /// @brief Signed division function for ap_private. + INLINE ap_private sdiv(const ap_private& RHS) const { + if (isNegative()) + if (RHS.isNegative()) + return (-(*this)).udiv(-RHS); + else + return -((-(*this)).udiv(RHS)); + else if (RHS.isNegative()) + return -(this->udiv((ap_private)(-RHS))); + return this->udiv(RHS); + } + + /// Perform an unsigned remainder operation on this ap_private with RHS being + /// the + /// divisor. 
Both this and RHS are treated as unsigned quantities for purposes + /// of this operation. Note that this is a true remainder operation and not + /// a modulo operation because the sign follows the sign of the dividend + /// which is *this. + /// @returns a new ap_private value containing the remainder result + /// @brief Unsigned remainder operation. + INLINE ap_private urem(const ap_private& RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Get some facts about the RHS + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, (ap_private*)(0), + &Remainder); + return Remainder; + } + + INLINE ap_private urem(uint64_t RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + // Get some facts about the RHS + uint32_t rhsWords = 1; //! rhsBits ? 0 : (ap_private<_AP_W, + //! _AP_S>::whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + divide(*this, lhsWords, RHS, (ap_private*)(0), &Remainder); + return Remainder; + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(const ap_private& RHS) const { + if (isNegative()) { + ap_private lhs = -(*this); + if (RHS.isNegative()) { + ap_private rhs = -RHS; + return -(lhs.urem(rhs)); + } else + return -(lhs.urem(RHS)); + } else if (RHS.isNegative()) { + ap_private rhs = -RHS; + return this->urem(rhs); + } + return this->urem(RHS); + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(int64_t RHS) const { + if (isNegative()) + if (RHS < 0) + return -((-(*this)).urem(-RHS)); + else + return -((-(*this)).urem(RHS)); + else if (RHS < 0) + return this->urem(-RHS); + return this->urem(RHS); + } + + /// Compares this ap_private with RHS for the validity of the equality + /// relationship. + /// @returns true if *this == Val + /// @brief Equality comparison. + template + INLINE bool eq(const ap_private<_AP_W, _AP_S1>& RHS) const { + return (*this) == RHS; + } + + /// Compares this ap_private with RHS for the validity of the inequality + /// relationship. 
+ /// @returns true if *this != Val + /// @brief Inequality comparison + template + INLINE bool ne(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// the validity of the less-than relationship. + /// @returns true if *this < RHS when both are considered unsigned. + /// @brief Unsigned less than comparison + template + INLINE bool ult(const ap_private<_AP_W, _AP_S1>& RHS) const { + // Get active bit length of both operands + uint32_t n1 = getActiveBits(); + uint32_t n2 = RHS.getActiveBits(); + + // If magnitude of LHS is less than RHS, return true. + if (n1 < n2) return true; + + // If magnitude of RHS is greather than LHS, return false. + if (n2 < n1) return false; + + // If they bot fit in a word, just compare the low order word + if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) + return pVal[0] < RHS.get_pVal(0); + + // Otherwise, compare all words + uint32_t topWord = whichWord(AESL_std::max(n1, n2) - 1); + for (int i = topWord; i >= 0; --i) { + if (pVal[i] > RHS.get_pVal(i)) return false; + if (pVal[i] < RHS.get_pVal(i)) return true; + } + return false; + } + + INLINE bool ult(uint64_t RHS) const { + // Get active bit length of both operands + uint32_t n1 = getActiveBits(); + uint32_t n2 = + 64 - ap_private_ops::CountLeadingZeros_64(RHS); // RHS.getActiveBits(); + + // If magnitude of LHS is less than RHS, return true. + if (n1 < n2) return true; + + // If magnitude of RHS is greather than LHS, return false. + if (n2 < n1) return false; + + // If they bot fit in a word, just compare the low order word + if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) + return pVal[0] < RHS; + assert(0); + } + + template + INLINE bool slt(const ap_private<_AP_W, _AP_S1>& RHS) const { + ap_private lhs(*this); + ap_private<_AP_W, _AP_S1> rhs(RHS); + bool lhsNeg = isNegative(); + bool rhsNeg = rhs.isNegative(); + if (lhsNeg) { + // Sign bit is set so perform two's complement to make it positive + lhs.flip(); + lhs++; + } + if (rhsNeg) { + // Sign bit is set so perform two's complement to make it positive + rhs.flip(); + rhs++; + } + + // Now we have unsigned values to compare so do the comparison if necessary + // based on the negativeness of the values. + if (lhsNeg) + if (rhsNeg) + return lhs.ugt(rhs); + else + return true; + else if (rhsNeg) + return false; + else + return lhs.ult(rhs); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// validity of the less-or-equal relationship. + /// @returns true if *this <= RHS when both are considered unsigned. + /// @brief Unsigned less or equal comparison + template + INLINE bool ule(const ap_private<_AP_W, _AP_S1>& RHS) const { + return ult(RHS) || eq(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the less-or-equal relationship. + /// @returns true if *this <= RHS when both are considered signed. + /// @brief Signed less or equal comparison + template + INLINE bool sle(const ap_private<_AP_W, _AP_S1>& RHS) const { + return slt(RHS) || eq(RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// the validity of the greater-than relationship. + /// @returns true if *this > RHS when both are considered unsigned. 
+ /// @brief Unsigned greather than comparison + template + INLINE bool ugt(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !ult(RHS) && !eq(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// the validity of the greater-than relationship. + /// @returns true if *this > RHS when both are considered signed. + /// @brief Signed greather than comparison + template + INLINE bool sgt(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !slt(RHS) && !eq(RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// validity of the greater-or-equal relationship. + /// @returns true if *this >= RHS when both are considered unsigned. + /// @brief Unsigned greater or equal comparison + template + INLINE bool uge(const ap_private<_AP_W, _AP_S>& RHS) const { + return !ult(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the greater-or-equal relationship. + /// @returns true if *this >= RHS when both are considered signed. + /// @brief Signed greather or equal comparison + template + INLINE bool sge(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !slt(RHS); + } + + // Sign extend to a new width. + template + INLINE void cpSext(const ap_private<_AP_W1, _AP_S1>& that) { + assert(_AP_W1 < BitWidth && "Invalid ap_private SignExtend request"); + assert(_AP_W1 <= MAX_INT_BITS && "Too many bits"); + // If the sign bit isn't set, this is the same as zext. + if (!that.isNegative()) { + cpZext(that); + return; + } + + // The sign bit is set. First, get some facts + enum { wordBits = _AP_W1 % APINT_BITS_PER_WORD }; + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; + // Mask the high order word appropriately + if (_AP_N1 == _AP_N) { + enum { newWordBits = _AP_W % APINT_BITS_PER_WORD }; + // The extension is contained to the wordsBefore-1th word. + static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL; + for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); + pVal[_AP_N - 1] |= mask; + return; + } + + enum { newWordBits = _AP_W % APINT_BITS_PER_WORD }; + // The extension is contained to the wordsBefore-1th word. + static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL; + int i; + for (i = 0; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i); + pVal[i - 1] |= mask; + for (; i < _AP_N - 1; i++) pVal[i] = ~0ULL; + pVal[i] = ~0ULL; + clearUnusedBits(); + return; + } + + // Zero extend to a new width. + template + INLINE void cpZext(const ap_private<_AP_W1, _AP_S1>& that) { + assert(_AP_W1 < BitWidth && "Invalid ap_private ZeroExtend request"); + assert(_AP_W1 <= MAX_INT_BITS && "Too many bits"); + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; + int i = 0; + for (; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i); + for (; i < _AP_N; ++i) pVal[i] = 0; + clearUnusedBits(); + } + + template + INLINE void cpZextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) { + if (BitWidth > _AP_W1) + cpZext(that); + else { + for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); + clearUnusedBits(); + } + } + + template + INLINE void cpSextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) { + if (BitWidth > _AP_W1) + cpSext(that); + else { + for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); + clearUnusedBits(); + } + } + + /// @} + /// @name Value Characterization Functions + /// @{ + + /// @returns the total number of bits. 
+ INLINE uint32_t getBitWidth() const { return BitWidth; } + + /// Here one word's bitwidth equals to that of uint64_t. + /// @returns the number of words to hold the integer value of this ap_private. + /// @brief Get the number of words. + INLINE uint32_t getNumWords() const { + return (BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD; + } + + /// This function returns the number of active bits which is defined as the + /// bit width minus the number of leading zeros. This is used in several + /// computations to see how "wide" the value is. + /// @brief Compute the number of active bits in the value + INLINE uint32_t getActiveBits() const { + uint32_t bits = BitWidth - countLeadingZeros(); + return bits ? bits : 1; + } + + /// This method attempts to return the value of this ap_private as a zero + /// extended + /// uint64_t. The bitwidth must be <= 64 or the value must fit within a + /// uint64_t. Otherwise an assertion will result. + /// @brief Get zero extended value + INLINE uint64_t getZExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); + return *pVal; + } + + /// This method attempts to return the value of this ap_private as a sign + /// extended + /// int64_t. The bit width must be <= 64 or the value must fit within an + /// int64_t. Otherwise an assertion will result. + /// @brief Get sign extended value + INLINE int64_t getSExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for int64_t"); + return int64_t(pVal[0]); + } + + /// This method determines how many bits are required to hold the ap_private + /// equivalent of the string given by \p str of length \p slen. + /// @brief Get bits required for string value. + INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, + uint8_t radix) { + assert(str != 0 && "Invalid value string"); + assert(slen > 0 && "Invalid string length"); + + // Each computation below needs to know if its negative + uint32_t isNegative = str[0] == '-'; + if (isNegative) { + slen--; + str++; + } + // For radixes of power-of-two values, the bits required is accurately and + // easily computed + if (radix == 2) return slen + isNegative; + if (radix == 8) return slen * 3 + isNegative; + if (radix == 16) return slen * 4 + isNegative; + + // Otherwise it must be radix == 10, the hard case + assert(radix == 10 && "Invalid radix"); + + // Convert to the actual binary value. + // ap_private<_AP_W, _AP_S> tmp(sufficient, str, slen, radix); + + // Compute how many bits are required. + // return isNegative + tmp.logBase2() + 1; + return isNegative + slen * 4; + } + + /// countLeadingZeros - This function is an ap_private version of the + /// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number + /// of zeros from the most significant bit to the first one bit. + /// @returns BitWidth if the value is zero. + /// @returns the number of zeros from the most significant bit to the first + /// one bits. + INLINE uint32_t countLeadingZeros() const { + enum { + msw_bits = (BitWidth % APINT_BITS_PER_WORD) + ? 
(BitWidth % APINT_BITS_PER_WORD) + : APINT_BITS_PER_WORD, + excessBits = APINT_BITS_PER_WORD - msw_bits + }; + uint32_t Count = ap_private_ops::CountLeadingZeros_64(pVal[_AP_N - 1]); + if (Count >= excessBits) Count -= excessBits; + if (!pVal[_AP_N - 1]) { + for (int i = _AP_N - 1; i; --i) { + if (!pVal[i - 1]) + Count += APINT_BITS_PER_WORD; + else { + Count += ap_private_ops::CountLeadingZeros_64(pVal[i - 1]); + break; + } + } + } + return Count; + } + + /// countLeadingOnes - This function counts the number of contiguous 1 bits + /// in the high order bits. The count stops when the first 0 bit is reached. + /// @returns 0 if the high order bit is not set + /// @returns the number of 1 bits from the most significant to the least + /// @brief Count the number of leading one bits. + INLINE uint32_t countLeadingOnes() const { + if (isSingleWord()) + return countLeadingOnes_64(get_VAL(), APINT_BITS_PER_WORD - BitWidth); + + uint32_t highWordBits = BitWidth % APINT_BITS_PER_WORD; + uint32_t shift = + (highWordBits == 0 ? 0 : APINT_BITS_PER_WORD - highWordBits); + int i = _AP_N - 1; + uint32_t Count = countLeadingOnes_64(get_pVal(i), shift); + if (Count == highWordBits) { + for (i--; i >= 0; --i) { + if (get_pVal(i) == ~0ULL) + Count += APINT_BITS_PER_WORD; + else { + Count += countLeadingOnes_64(get_pVal(i), 0); + break; + } + } + } + return Count; + } + + /// countTrailingZeros - This function is an ap_private version of the + /// countTrailingZoers_{32,64} functions in MathExtras.h. It counts + /// the number of zeros from the least significant bit to the first set bit. + /// @returns BitWidth if the value is zero. + /// @returns the number of zeros from the least significant bit to the first + /// one bit. + /// @brief Count the number of trailing zero bits. + INLINE uint32_t countTrailingZeros() const { + uint32_t Count = 0; + uint32_t i = 0; + for (; i < _AP_N && get_pVal(i) == 0; ++i) Count += APINT_BITS_PER_WORD; + if (i < _AP_N) Count += ap_private_ops::CountTrailingZeros_64(get_pVal(i)); + return AESL_std::min(Count, BitWidth); + } + /// countPopulation - This function is an ap_private version of the + /// countPopulation_{32,64} functions in MathExtras.h. It counts the number + /// of 1 bits in the ap_private value. + /// @returns 0 if the value is zero. + /// @returns the number of set bits. + /// @brief Count the number of bits set. + INLINE uint32_t countPopulation() const { + uint32_t Count = 0; + for (int i = 0; i < _AP_N - 1; ++i) + Count += ap_private_ops::CountPopulation_64(pVal[i]); + Count += ap_private_ops::CountPopulation_64(pVal[_AP_N - 1] & mask); + return Count; + } + + /// @} + /// @name Conversion Functions + /// @ + + /// This is used internally to convert an ap_private to a string. + /// @brief Converts an ap_private to a std::string + INLINE std::string toString(uint8_t radix, bool wantSigned) const; + + /// Considers the ap_private to be unsigned and converts it into a string in + /// the + /// radix given. The radix can be 2, 8, 10 or 16. + /// @returns a character interpretation of the ap_private + /// @brief Convert unsigned ap_private to string representation. + INLINE std::string toStringUnsigned(uint8_t radix = 10) const { + return toString(radix, false); + } + + /// Considers the ap_private to be unsigned and converts it into a string in + /// the + /// radix given. The radix can be 2, 8, 10 or 16. + /// @returns a character interpretation of the ap_private + /// @brief Convert unsigned ap_private to string representation. 
+ INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + + /// @brief Converts this ap_private to a double value. + INLINE double roundToDouble(bool isSigned) const { + // Handle the simple case where the value is contained in one uint64_t. + if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) { + uint64_t val = pVal[0]; + if (isSigned) { + int64_t sext = ((int64_t(val)) << (64 - BitWidth)) >> (64 - BitWidth); + return double(sext); + } else + return double(val); + } + + // Determine if the value is negative. + bool isNeg = isSigned ? (*this)[BitWidth - 1] : false; + + // Construct the absolute value if we're negative. + ap_private<_AP_W, _AP_S> Tmp(isNeg ? -(*this) : (*this)); + + // Figure out how many bits we're using. + uint32_t n = Tmp.getActiveBits(); + + // The exponent (without bias normalization) is just the number of bits + // we are using. Note that the sign bit is gone since we constructed the + // absolute value. + uint64_t exp = n; + + // Return infinity for exponent overflow + if (exp > 1023) { + if (!isSigned || !isNeg) + return std::numeric_limits::infinity(); + else + return -std::numeric_limits::infinity(); + } + exp += 1023; // Increment for 1023 bias + + // Number of bits in mantissa is 52. To obtain the mantissa value, we must + // extract the high 52 bits from the correct words in pVal. + uint64_t mantissa; + unsigned hiWord = whichWord(n - 1); + if (hiWord == 0) { + mantissa = Tmp.get_pVal(0); + if (n > 52) + (mantissa) >>= (n - 52); // shift down, we want the top 52 bits. + } else { + assert(hiWord > 0 && "High word is negative?"); + uint64_t hibits = (Tmp.get_pVal(hiWord)) + << (52 - n % APINT_BITS_PER_WORD); + uint64_t lobits = + (Tmp.get_pVal(hiWord - 1)) >> (11 + n % APINT_BITS_PER_WORD); + mantissa = hibits | lobits; + } + + // The leading bit of mantissa is implicit, so get rid of it. + uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0; + union { + double __D; + uint64_t __I; + } __T; + __T.__I = sign | ((exp) << 52) | mantissa; + return __T.__D; + } + + /// @brief Converts this unsigned ap_private to a double value. + INLINE double roundToDouble() const { return roundToDouble(false); } + + /// @brief Converts this signed ap_private to a double value. + INLINE double signedRoundToDouble() const { return roundToDouble(true); } + + /// The conversion does not do a translation from integer to double, it just + /// re-interprets the bits as a double. Note that it is valid to do this on + /// any bit width. Exactly 64 bits will be translated. + /// @brief Converts ap_private bits to a double + INLINE double bitsToDouble() const { + union { + uint64_t __I; + double __D; + } __T; + __T.__I = pVal[0]; + return __T.__D; + } + + /// The conversion does not do a translation from integer to float, it just + /// re-interprets the bits as a float. Note that it is valid to do this on + /// any bit width. Exactly 32 bits will be translated. + /// @brief Converts ap_private bits to a double + INLINE float bitsToFloat() const { + union { + uint32_t __I; + float __F; + } __T; + __T.__I = uint32_t(pVal[0]); + return __T.__F; + } + + /// The conversion does not do a translation from double to integer, it just + /// re-interprets the bits of the double. Note that it is valid to do this on + /// any bit width but bits from V may get truncated. + /// @brief Converts a double to ap_private bits. 
+  INLINE ap_private& doubleToBits(double __V) {
+    union {
+      uint64_t __I;
+      double __D;
+    } __T;
+    __T.__D = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  /// The conversion does not do a translation from float to integer, it just
+  /// re-interprets the bits of the float. Note that it is valid to do this on
+  /// any bit width but bits from V may get truncated.
+  /// @brief Converts a float to ap_private bits.
+  INLINE ap_private& floatToBits(float __V) {
+    union {
+      uint32_t __I;
+      float __F;
+    } __T;
+    __T.__F = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return isMaxValue(); }
+
+  INLINE bool nand_reduce() const { return isMinValue(); }
+
+  INLINE bool or_reduce() const { return (bool)countPopulation(); }
+
+  INLINE bool nor_reduce() const { return countPopulation() == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+  INLINE std::string to_string(uint8_t radix = 16, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+}; // End of class ap_private <_AP_W, _AP_S, false>
+
+namespace ap_private_ops {
+
+enum { APINT_BITS_PER_WORD = 64 };
+template <int _AP_W, bool _AP_S>
+INLINE bool operator==(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 == V1;
+}
+
+template <int _AP_W, bool _AP_S>
+INLINE bool operator!=(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 != V1;
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE bool get(const ap_private<_AP_W, _AP_S>& a) {
+  static const uint64_t mask = 1ULL << (index & 0x3f);
+  return ((mask & a.get_pVal((index) >> 6)) != 0);
+}
+
+template <int msb_index, int lsb_index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private& mark1 = 0,
+                const ap_private& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask = ~0ULL >>
+                          (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+                          (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | mask);
+    a.get_pVal(msb_word) |= mask;
+  } else {
+    const uint64_t lsb_mask = ~0ULL >> (lsb) << (lsb);
+    const uint64_t msb_mask = ~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                              (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) | lsb_mask);
+    a.get_pVal(lsb_word) |= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      a.set_pVal(i, ~0ULL);
+      // a.get_pVal(i)=0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | msb_mask);
+
+    a.get_pVal(msb_word) |= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int msb_index, int lsb_index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private& mark1 = 0,
+                  const ap_private& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask =
+        ~(~0ULL >> (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+          (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & mask);
+    a.get_pVal(msb_word) &= mask;
+  } else {
+    const uint64_t lsb_mask = ~(~0ULL >> (lsb) << (lsb));
+    const uint64_t msb_mask = ~(~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                                (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) & lsb_mask);
+    a.get_pVal(lsb_word) &= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      // a.set_pVal(i, 0);
+      a.get_pVal(i) = 0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & msb_mask);
+    a.get_pVal(msb_word) &= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private& mark = 0) {
+  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
+  static const uint64_t mask = 1ULL << (index % APINT_BITS_PER_WORD);
+  // a.set_pVal(word, a.get_pVal(word) | mask);
+  a.get_pVal(word) |= mask;
+  a.clearUnusedBits();
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private& mark = 0) {
+  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
+  static const uint64_t mask = ~(1ULL << (index % APINT_BITS_PER_WORD));
+  // a.set_pVal(word, a.get_pVal(word) & mask);
+  a.get_pVal(word) &= mask;
+  a.clearUnusedBits();
+}
+
+} // End of ap_private_ops namespace
+
+template <int _AP_W, bool _AP_S>
+INLINE std::string ap_private<_AP_W, _AP_S, false>::toString(
+    uint8_t radix, bool wantSigned) const {
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+         "Radix should be 2, 8, 10, or 16!");
+  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
+                                 "8", "9", "A", "B", "C", "D", "E", "F"};
+  std::string result;
+
+  if (radix != 10) {
+    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+    // because the number of bits per digit (1, 3 and 4 respectively) divides
+    // equally. We just shift until the value is zero.
+
+    // First, check for a zero value and just short circuit the logic below.
+    if (*this == (uint64_t)(0))
+      result = "0";
+    else {
+      ap_private<_AP_W, false> tmp(*this);
+      size_t insert_at = 0;
+      bool leading_zero = true;
+      if (wantSigned && isNegative()) {
+        // They want to print the signed version and it is a negative value
+        // Flip the bits and add one to turn it into the equivalent positive
+        // value and put a '-' in the result.
+        tmp.flip();
+        tmp++;
+        tmp.clearUnusedBitsToZero();
+        result = "-";
+        insert_at = 1;
+        leading_zero = false;
+      }
+      switch (radix) {
+        case 2:
+          result += "0b";
+          break;
+        case 8:
+          result += "0o";
+          break;
+        case 16:
+          result += "0x";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+      insert_at += 2;
+      // Just shift tmp right for each digit width until it becomes zero
+      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
+      uint64_t mask = radix - 1;
+      ap_private<_AP_W, false> zero(0);
+      unsigned bits = 0;
+      while (tmp.ne(zero)) {
+        uint64_t digit = tmp.get_VAL() & mask;
+        result.insert(insert_at, digits[digit]);
+        tmp = tmp.lshr(shift);
+        ++bits;
+      }
+      bits *= shift;
+      if (bits < _AP_W && leading_zero) result.insert(insert_at, digits[0]);
+    }
+    return result;
+  }
+
+  ap_private<_AP_W, false> tmp(*this);
+  ap_private<_AP_W, false> divisor(radix);
+  ap_private<_AP_W, false> zero(0);
+  size_t insert_at = 0;
+  if (wantSigned && isNegative()) {
+    // They want to print the signed version and it is a negative value
+    // Flip the bits and add one to turn it into the equivalent positive
+    // value and put a '-' in the result.
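+    // Worked example (annotation added for clarity, not from the original
+    // header): for a 4-bit value 0b1011, i.e. -5 in two's complement,
+    // flipping gives 0b0100 and adding one gives 0b0101 = 5, so the digits
+    // of 5 are emitted after the leading '-'.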
+    tmp.flip();
+    tmp++;
+    tmp.clearUnusedBitsToZero();
+    result = "-";
+    insert_at = 1;
+  }
+  if (tmp == ap_private<_AP_W, false>(0))
+    result = "0";
+  else
+    while (tmp.ne(zero)) {
+      ap_private<_AP_W, false> APdigit(0);
+      ap_private<_AP_W, false> tmp2(0);
+      ap_private_ops::divide(tmp, tmp.getNumWords(), divisor,
+                             divisor.getNumWords(), &tmp2, &APdigit);
+      uint64_t digit = APdigit.getZExtValue();
+      assert(digit < radix && "divide failed");
+      result.insert(insert_at, digits[digit]);
+      tmp = tmp2;
+    }
+
+  return result;
+} // End of ap_private<_AP_W, _AP_S, false>::toString()
+
+template <int _AP_W, bool _AP_S>
+std::ostream &operator<<(std::ostream &os, const ap_private<_AP_W, _AP_S> &x) {
+  std::ios_base::fmtflags ff = os.flags();
+  if (ff & std::ios_base::hex) {
+    os << x.toString(16, false); // don't print sign
+  } else if (ff & std::ios_base::oct) {
+    os << x.toString(8, false); // don't print sign
+  } else {
+    os << x.toString(10, _AP_S);
+  }
+  return os;
+}
+
+// ------------------------------------------------------------ //
+//    XXX moved here from ap_int_sim.h XXX                      //
+// ------------------------------------------------------------ //
+
+/// Concatenation reference.
+/// Proxy class which allows concatenation to be used as rvalue (for reading)
+/// and lvalue (for writing)
+// ----------------------------------------------------------------
+// template
+// struct ap_concat_ref {
+//#ifdef _MSC_VER
+//#pragma warning(disable : 4521 4522)
+//#endif
+//  enum {
+//    _AP_WR = _AP_W1 + _AP_W2,
+//  };
+//  _AP_T1& mbv1;
+//  _AP_T2& mbv2;
+//
+//  INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>&
+//  ref)
+//      : mbv1(ref.mbv1), mbv2(ref.mbv2) {}
+//
+//  INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {}
+//
+//  template
+//  INLINE ap_concat_ref& operator=(const ap_private<_AP_W3, _AP_S3>& val) {
+//    ap_private<_AP_W1 + _AP_W2, false> vval(val);
+//    int W_ref1 = mbv1.length();
+//    int W_ref2 = mbv2.length();
+//    ap_private<_AP_W1, false> mask1(-1);
+//    mask1 >>= _AP_W1 - W_ref1;
+//    ap_private<_AP_W2, false> mask2(-1);
+//    mask2 >>= _AP_W2 - W_ref2;
+//    mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1));
+//    mbv2.set(ap_private<_AP_W2, false>(vval & mask2));
+//    return *this;
+//  }
+//
+//  INLINE ap_concat_ref& operator=(unsigned long long val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(
+//      const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  INLINE ap_concat_ref& operator=(
+//      const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(const _private_bit_ref<_AP_W3, _AP_S3>&
+//  val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(const _private_range_ref<_AP_W3, _AP_S3>&
+//  val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(
+//      const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val)
+//  {
+//    return operator=((const ap_private<_AP_W3, false>)(val));
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(
+//      const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&
+//      val) {
+//    return
operator=(val.to_ap_private()); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { +// return operator=((unsigned long long)(bool)(val)); +// } +// +// INLINE operator ap_private<_AP_WR, false>() const { return get(); } +// +// INLINE operator unsigned long long() const { return get().to_uint64(); } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> > +// operator,(const _private_range_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> >( +// *this, const_cast<_private_range_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >(*this, a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(const ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> > +// operator,(const _private_bit_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> >( +// *this, const_cast<_private_bit_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > +// operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > +// operator,( +// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) +// { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast< +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> +// > +// operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3> +// &a2) { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast&>( +// a2)); +// } +// +// template +// INLINE ap_private operator&( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() & a2; +// } +// +// template +// INLINE ap_private operator|( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() | a2; +// } +// +// template +// INLINE ap_private operator^( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return ap_private(get() ^ a2); +// } +// +// INLINE const ap_private<_AP_WR, false> get() const { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= 
W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// INLINE const ap_private<_AP_WR, false> get() { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// template +// INLINE void set(const ap_private<_AP_W3, false>& val) { +// ap_private<_AP_W1 + _AP_W2, false> vval(val); +// int W_ref1 = mbv1.length(); +// int W_ref2 = mbv2.length(); +// ap_private<_AP_W1, false> mask1(-1); +// mask1 >>= _AP_W1 - W_ref1; +// ap_private<_AP_W2, false> mask2(-1); +// mask2 >>= _AP_W2 - W_ref2; +// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); +// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); +// } +// +// INLINE int length() const { return mbv1.length() + mbv2.length(); } +// +// INLINE std::string to_string(uint8_t radix = 2) const { +// return get().to_string(radix); +// } +//}; // struct ap_concat_ref. + +/// Range(slice) reference +/// Proxy class, which allows part selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//------------------------------------------------------------ +template +struct _private_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor. + INLINE _private_range_ref(const _private_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// direct ctor. + INLINE _private_range_ref(ap_private<_AP_W, _AP_S>* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { + _AP_WARNING(h < 0 || l < 0, + "Higher bound (%d) and lower bound (%d) cannot be " + "negative.", + h, l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound (%d) or lower bound (%d) out of range (%d).", h, l, + _AP_W); + } + + /// compound or assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv |= ref.d_bv; + return *this; + } + + /// compound or assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() |= _AP_ROOT_TYPE<>."); + this->d_bv |= ref.V; + return *this; + } + + /// compound and assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv &= ref.d_bv; + return *this; + }; + + /// compound and assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() &= _AP_ROOT_TYPE<>."); + this->d_bv &= ref.V; + return *this; + } + + /// compound xor assignment. 
+ template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() ^= " + "ap_private<>.range()."); + this->d_bv ^= ref.d_bv; + return *this; + }; + + /// compound xor assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() ^= _AP_ROOT_TYPE<>."); + this->d_bv ^= ref.V; + return *this; + } + + /// @name convertors. + // @{ + INLINE operator ap_private<_AP_W, false>() const { + ap_private<_AP_W, false> val(0); + if (h_index >= l_index) { + if (_AP_W > 64) { + val = d_bv; + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val >>= l_index; + val &= mask; + } else { + const static uint64_t mask = (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val = (d_bv >> l_index) & (mask >> (_AP_W - (h_index - l_index + 1))); + } + } else { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } + return val; + } + + INLINE operator unsigned long long() const { return to_uint64(); } + // @} + + template + INLINE _private_range_ref& operator=(const ap_private<_AP_W2, _AP_S2>& val) { + ap_private<_AP_W, false> vval = ap_private<_AP_W, false>(val); + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, false> mask(-1); + if (l_index > 0) { + mask <<= l_index; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 >>= _AP_W - h_index - 1; + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + unsigned shift = 64 - _AP_W; + uint64_t mask = ~0ULL >> (shift); + if (l_index > 0) { + vval = mask & vval << l_index; + mask = mask & mask << l_index; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = mask; + mask2 >>= (_AP_W - h_index - 1); + mask &= mask2; + vval &= mask2; + } + mask = ~mask; + d_bv &= mask; + d_bv |= vval; + } + } + return *this; + } // operator=(const ap_private<>&) + + INLINE _private_range_ref& operator=(unsigned long long val) { + const ap_private<_AP_W, _AP_S> vval = val; + return operator=(vval); + } + + template + INLINE _private_range_ref& operator=( + const _private_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((unsigned long long)(bool)val); + } + + template + INLINE _private_range_ref& operator=( + const _private_range_ref<_AP_W2, _AP_S2>& val) { + const ap_private<_AP_W, false> tmpVal(val); + return operator=(tmpVal); + } + +// template +// INLINE _private_range_ref& operator=( +// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { +// const ap_private<_AP_W, false> tmpVal(val); +// return operator=(tmpVal); +// } + + // TODO from ap_int_base, ap_bit_ref and ap_range_ref. 
+ + template + INLINE _private_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.to_ap_int_base().V); + } + + template + INLINE _private_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.operator ap_int_base<_AP_W2, false>().V); + } + + template + INLINE _private_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((unsigned long long)(bool)val); + } + +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, _AP_W, ap_private<_AP_W, _AP_S> > +// operator,(ap_private<_AP_W, _AP_S>& a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W, +// ap_private<_AP_W, _AP_S> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs == rhs; + } + + template + INLINE bool operator!=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs != rhs; + } + + template + INLINE bool operator>(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + 
ap_private<_AP_W2, false> rhs = op2.get(); + return lhs > rhs; + } + + template + INLINE bool operator>=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs >= rhs; + } + + template + INLINE bool operator<(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs < rhs; + } + + template + INLINE bool operator<=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs <= rhs; + } + + template + INLINE void set(const ap_private<_AP_W2, false>& val) { + ap_private<_AP_W, _AP_S> vval = val; + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, _AP_S> mask(-1); + if (l_index > 0) { + ap_private<_AP_W, false> mask1(-1); + mask1 >>= _AP_W - l_index; + mask1.flip(); + mask = mask1; + // vval&=mask1; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 <<= h_index + 1; + mask2.flip(); + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + uint64_t mask = ~0ULL >> (64 - _AP_W); + if (l_index > 0) { + uint64_t mask1 = mask; + mask1 = mask & (mask1 >> (_AP_W - l_index)); + vval = mask & (vval << l_index); + mask = ~mask1 & mask; + // vval&=mask1; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = ~0ULL >> (64 - _AP_W); + mask2 = mask & (mask2 << (h_index + 1)); + mask &= ~mask2; + vval &= ~mask2; + } + d_bv &= (~mask & (~0ULL >> (64 - _AP_W))); + d_bv |= vval; + } + } + } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + const static uint64_t mask = + (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val &= (mask >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE ap_private<_AP_W, false> get() { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + static const uint64_t mask = ~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0); + return val &= ((mask) >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + ap_private<_AP_W, false> val = get(); + return val.to_int(); + } + + INLINE unsigned int to_uint() const { + ap_private<_AP_W, false> val = get(); + return val.to_uint(); + } + + INLINE long to_long() const { + ap_private<_AP_W, false> val = get(); + return val.to_long(); + } + + INLINE unsigned long to_ulong() const { + ap_private<_AP_W, false> val = get(); + return val.to_ulong(); + } + + INLINE ap_slong to_int64() const { + ap_private<_AP_W, false> val = get(); + return val.to_int64(); + } + + INLINE ap_ulong to_uint64() const { + ap_private<_AP_W, false> val = get(); + return val.to_uint64(); + } + + INLINE std::string to_string(uint8_t radix = 2) const { + return get().to_string(radix); + } + + INLINE bool and_reduce() { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret &= d_bv[i]; + return ret; + } + + INLINE bool or_reduce() { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret |= d_bv[i]; + return ret; + } + + INLINE bool xor_reduce() { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret ^= d_bv[i]; + return ret; + } +}; // struct _private_range_ref. + +/// Bit reference +/// Proxy class, which allows bit selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//-------------------------------------------------------------- +template +struct _private_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int d_index; + + public: + // copy ctor. + INLINE _private_bit_ref(const _private_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + // director ctor. 
+ INLINE _private_bit_ref(ap_private<_AP_W, _AP_S>& bv, int index = 0) + : d_bv(bv), d_index(index) { + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.\n", + d_index); + _AP_WARNING(d_index >= _AP_W, + "Index of bit vector (%d) out of range (%d).\n", d_index, _AP_W); + } + + INLINE operator bool() const { return d_bv.get_bit(d_index); } + + INLINE bool to_bool() const { return operator bool(); } + + template + INLINE _private_bit_ref& operator=(const T& val) { + if (!!val) + d_bv.set(d_index); + else + d_bv.clear(d_index); + return *this; + } + +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref<_AP_W2, +// _AP_S2> > operator,( +// const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref> +// operator,( +// const _private_bit_ref &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref>( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<1, _private_bit_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, af_bit_ref<_AP_W2, +// _AP_I2, _AP_S2, +// _AP_Q2, _AP_O2, +// _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_bit_ref<_AP_W2, _AP_S2>& op) const { + return get() == op.get(); + } + + template + INLINE bool operator!=(const _private_bit_ref<_AP_W2, _AP_S2>& 
op) const { + return get() != op.get(); + } + + INLINE bool get() const { return operator bool(); } + + // template + // INLINE void set(const ap_private<_AP_W3, false>& val) { + // operator=(val); + // } + + // INLINE bool operator~() const { + // bool bit = (d_bv)[d_index]; + // return bit ? false : true; + // } + + INLINE int length() const { return 1; } + + // INLINE std::string to_string() const { + // bool val = get(); + // return val ? "1" : "0"; + // } + +}; // struct _private_bit_ref. + +// char a[100]; +// char* ptr = a; +// ap_int<2> n = 3; +// char* ptr2 = ptr + n*2; +// avoid ambiguous errors +#define OP_BIN_MIX_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_MIX_PTR(+) +OP_BIN_MIX_PTR(-) +#undef OP_BIN_MIX_PTR + +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_MIX_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define OPS_MIX_FLOAT(C_TYPE) \ + OP_BIN_MIX_FLOAT(*, C_TYPE) \ + OP_BIN_MIX_FLOAT(/, C_TYPE) \ + OP_BIN_MIX_FLOAT(+, C_TYPE) \ + OP_BIN_MIX_FLOAT(-, C_TYPE) + +OPS_MIX_FLOAT(float) +OPS_MIX_FLOAT(double) +#undef OP_BIN_MIX_FLOAT +#undef OPS_MIX_FLOAT + +/// Operators mixing Integers with AP_Int +// ---------------------------------------------------------------- + +// partially specialize template argument _AP_C in order that: +// for _AP_W > 64, we will explicitly convert operand with native data type +// into corresponding ap_private +// for _AP_W <= 64, we will implicitly convert operand with ap_private into +// (unsigned) long long +#define OP_BIN_MIX_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_WI, _AP_SI>::template RType<_AP_W, _AP_S>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_private<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_WI, _AP_SI>(i_op).operator BIN_OP(op); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(ap_private<_AP_WI, _AP_SI>(i_op)); \ + } + +#define OP_REL_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return op.operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP(op); \ + } + +#define OP_ASSIGN_MIX_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return op.operator ASSIGN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } + +#define OP_BIN_SHIFT_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ 
+ C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return i_op BIN_OP(op.get_VAL()); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(i_op); \ + } + +#define OP_ASSIGN_RSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator>>(op2); \ + return op; \ + } + +#define OP_ASSIGN_LSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator<<(op2); \ + return op; \ + } + +#define OPS_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_MIX_INT(*, C_TYPE, (_AP_W2), (_AP_S2), mult) \ + OP_BIN_MIX_INT(+, C_TYPE, (_AP_W2), (_AP_S2), plus) \ + OP_BIN_MIX_INT(-, C_TYPE, (_AP_W2), (_AP_S2), minus) \ + OP_BIN_MIX_INT(/, C_TYPE, (_AP_W2), (_AP_S2), div) \ + OP_BIN_MIX_INT(%, C_TYPE, (_AP_W2), (_AP_S2), mod) \ + OP_BIN_MIX_INT(&, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT(|, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT (^, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_SHIFT_INT(>>, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + OP_BIN_SHIFT_INT(<<, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + \ + OP_ASSIGN_MIX_INT(+=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(-=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(*=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(/=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(%=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(&=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(|=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(^=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_RSHIFT_INT(>>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_LSHIFT_INT(<<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + \ + OP_REL_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +OPS_MIX_INT(bool, 1, false) +OPS_MIX_INT(char, 8, CHAR_IS_SIGNED) +OPS_MIX_INT(signed char, 8, true) +OPS_MIX_INT(unsigned char, 8, false) +OPS_MIX_INT(short, sizeof(short) * 8, true) +OPS_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +OPS_MIX_INT(int, sizeof(int) * 8, true) +OPS_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +OPS_MIX_INT(long, sizeof(long) * 8, true) +OPS_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +OPS_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +OPS_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) + +#undef OP_BIN_MIX_INT +#undef OP_BIN_SHIFT_INT +#undef OP_ASSIGN_MIX_INT +#undef OP_ASSIGN_RSHIFT_INT +#undef OP_ASSIGN_LSHIFT_INT +#undef OP_REL_MIX_INT +#undef OPS_MIX_INT + +#define OP_BIN_MIX_RANGE(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const 
_private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<_AP_W2, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<_AP_W2, false>(op2)); \ + } \ + template \ + INLINE _private_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_range_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<_AP_W1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_private<_AP_W2, false>()); \ + } + +OP_BIN_MIX_RANGE(+, plus) +OP_BIN_MIX_RANGE(-, minus) +OP_BIN_MIX_RANGE(*, mult) +OP_BIN_MIX_RANGE(/, div) +OP_BIN_MIX_RANGE(%, mod) +OP_BIN_MIX_RANGE(&, logic) +OP_BIN_MIX_RANGE(|, logic) +OP_BIN_MIX_RANGE(^, logic) +OP_BIN_MIX_RANGE(>>, arg1) +OP_BIN_MIX_RANGE(<<, arg1) +#undef OP_BIN_MIX_RANGE + +OP_ASSIGN_MIX_RANGE(+=) +OP_ASSIGN_MIX_RANGE(-=) +OP_ASSIGN_MIX_RANGE(*=) +OP_ASSIGN_MIX_RANGE(/=) +OP_ASSIGN_MIX_RANGE(%=) +OP_ASSIGN_MIX_RANGE(&=) +OP_ASSIGN_MIX_RANGE(|=) +OP_ASSIGN_MIX_RANGE(^=) +OP_ASSIGN_MIX_RANGE(>>=) +OP_ASSIGN_MIX_RANGE(<<=) +#undef OP_ASSIGN_MIX_RANGE + +OP_REL_MIX_RANGE(>) +OP_REL_MIX_RANGE(<) +OP_REL_MIX_RANGE(>=) +OP_REL_MIX_RANGE(<=) +OP_REL_MIX_RANGE(==) +OP_REL_MIX_RANGE(!=) +#undef OP_REL_MIX_RANGE + +#define OP_BIN_MIX_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<1, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_BIT(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<1, false>(op2)); \ + } \ + template \ + INLINE _private_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(ap_private<1, false>(op2)); \ + } + +OP_ASSIGN_MIX_BIT(+=) +OP_ASSIGN_MIX_BIT(-=) +OP_ASSIGN_MIX_BIT(*=) +OP_ASSIGN_MIX_BIT(/=) +OP_ASSIGN_MIX_BIT(%=) +OP_ASSIGN_MIX_BIT(&=) 
+OP_ASSIGN_MIX_BIT(|=) +OP_ASSIGN_MIX_BIT(^=) +OP_ASSIGN_MIX_BIT(>>=) +OP_ASSIGN_MIX_BIT(<<=) +#undef OP_ASSIGN_MIX_BIT + +OP_BIN_MIX_BIT(+, plus) +OP_BIN_MIX_BIT(-, minus) +OP_BIN_MIX_BIT(*, mult) +OP_BIN_MIX_BIT(/, div) +OP_BIN_MIX_BIT(%, mod) +OP_BIN_MIX_BIT(&, logic) +OP_BIN_MIX_BIT(|, logic) +OP_BIN_MIX_BIT(^, logic) +OP_BIN_MIX_BIT(>>, arg1) +OP_BIN_MIX_BIT(<<, arg1) +#undef OP_BIN_MIX_BIT + +OP_REL_MIX_BIT(>) +OP_REL_MIX_BIT(<) +OP_REL_MIX_BIT(<=) +OP_REL_MIX_BIT(>=) +OP_REL_MIX_BIT(==) +OP_REL_MIX_BIT(!=) +#undef OP_REL_MIX_BIT + +#define REF_REL_OP_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . \ + operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP( \ + ap_private<_AP_W, false>(op)); \ + } \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (bool(op))REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP(bool(op)); \ + } + +#define REF_REL_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_REL_OP_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_REL_MIX_INT(bool, 1, false) +REF_REL_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_REL_MIX_INT(signed char, 8, true) +REF_REL_MIX_INT(unsigned char, 8, false) +REF_REL_MIX_INT(short, sizeof(short) * 8, true) +REF_REL_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_REL_MIX_INT(int, sizeof(int) * 8, true) +REF_REL_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_REL_MIX_INT(long, sizeof(long) * 8, true) +REF_REL_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_REL_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_REL_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_REL_OP_MIX_INT +#undef REF_REL_MIX_INT + +#define REF_BIN_OP_MIX_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . 
\ + operator BIN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W2, _AP_S2>::template RType<_AP_W, false>::RTYPE \ + operator BIN_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator BIN_OP( \ + ap_private<_AP_W, false>(op)); \ + } + +#define REF_BIN_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_MIX_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_MIX_INT(bool, 1, false) +REF_BIN_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_BIN_MIX_INT(signed char, 8, true) +REF_BIN_MIX_INT(unsigned char, 8, false) +REF_BIN_MIX_INT(short, sizeof(short) * 8, true) +REF_BIN_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_BIN_MIX_INT(int, sizeof(int) * 8, true) +REF_BIN_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_BIN_MIX_INT(long, sizeof(long) * 8, true) +REF_BIN_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_BIN_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_BIN_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_BIN_OP_MIX_INT +#undef REF_BIN_MIX_INT + +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& lhs, \ + const _private_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return ap_private<_AP_W, false>(lhs).operator BIN_OP( \ + ap_private<_AP_W2, false>(rhs)); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) +#undef REF_BIN_OP + +//************************************************************************ +// Implement +// ap_private = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). +// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. 
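+//
+// Illustrative example (annotation added for clarity, not from the original
+// header): concatenating two 40-bit operands yields an 80-bit unsigned
+// value, so (a, b) + (c, d) must be evaluated at 80 bits or more; funneling
+// the operands through a 64-bit long long would silently drop the top 16
+// bits of each side, which is exactly the information loss these operator
+// overloads avoid.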
+//************************************************************************
+
+#endif // ifndef __AP_PRIVATE_H__
+
+// -*- cpp -*-
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_math.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_math.h
new file mode 100644
index 00000000..f1299714
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_math.h
@@ -0,0 +1,27 @@
+#ifndef X_HLS_MATH_H
+#define X_HLS_MATH_H
+
+#include <cmath>
+#include "ap_fixed.h"
+
+namespace hls {
+
+template <class T>
+static T exp(const T x) {
+    return (T) std::exp(x.to_double());
+}
+
+template <class T> T sin(T x) { return (T) std::sin(x.to_double()); };
+
+template <class T> T cos(T x) { return (T) std::cos(x.to_double()); };
+
+template <class T> T asin(T x) { return (T) std::asin(x.to_double()); };
+
+template <class T> T acos(T x) { return (T) std::acos(x.to_double()); };
+
+template <class T> T atan(T x) { return (T) std::atan(x.to_double()); };
+
+template <class T> T atan2(T x, T y) { return (T) std::atan2(x.to_double(), y.to_double()); };
+
+}
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_stream.h
new file mode 100644
index 00000000..f516c39e
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_stream.h
@@ -0,0 +1,263 @@
+/*
+#- (c) Copyright 2011-2018 Xilinx, Inc. All rights reserved.
+#-
+#- This file contains confidential and proprietary information
+#- of Xilinx, Inc. and is protected under U.S. and
+#- international copyright and other intellectual property
+#- laws.
+#-
+#- DISCLAIMER
+#- This disclaimer is not a license and does not grant any
+#- rights to the materials distributed herewith. Except as
+#- otherwise provided in a valid license issued to you by
+#- Xilinx, and to the maximum extent permitted by applicable
+#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+#- (2) Xilinx shall not be liable (whether in contract or tort,
+#- including negligence, or under any other theory of
+#- liability) for any loss or damage of any kind or nature
+#- related to, arising under or in connection with these
+#- materials, including for any direct, or any indirect,
+#- special, incidental, or consequential loss or damage
+#- (including loss of data, profits, goodwill, or any type of
+#- loss or damage suffered as a result of any action brought
+#- by a third party) even if such damage or loss was
+#- reasonably foreseeable or Xilinx had been advised of the
+#- possibility of the same.
+#-
+#- CRITICAL APPLICATIONS
+#- Xilinx products are not designed or intended to be fail-
+#- safe, or for use in any application requiring fail-safe
+#- performance, such as life-support or safety devices or
+#- systems, Class III medical devices, nuclear facilities,
+#- applications related to the deployment of airbags, or any
+#- other applications that could lead to death, personal
+#- injury, or severe property or environmental damage
+#- (individually and collectively, "Critical
+#- Applications").
Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef X_HLS_STREAM_SIM_H +#define X_HLS_STREAM_SIM_H + +/* + * This file contains a C++ model of hls::stream. + * It defines C simulation model. + */ +#ifndef __cplusplus + +#error C++ is required to include this header file + +#else + +////////////////////////////////////////////// +// C level simulation models for hls::stream +////////////////////////////////////////////// +#include +#include +#include +#include +#include + +#ifdef HLS_STREAM_THREAD_SAFE +#include +#include +#endif + +#ifndef _MSC_VER +#include +#include +#endif + +namespace hls { + +template +class stream +{ + protected: + std::string _name; + std::deque<__STREAM_T__> _data; // container for the elements +#ifdef HLS_STREAM_THREAD_SAFE + std::mutex _mutex; + std::condition_variable _condition_var; +#endif + + public: + /// Constructors + // Keep consistent with the synthesis model's constructors + stream() { + static unsigned _counter = 1; + std::stringstream ss; +#ifndef _MSC_VER + char* _demangle_name = abi::__cxa_demangle(typeid(*this).name(), 0, 0, 0); + if (_demangle_name) { + _name = _demangle_name; + free(_demangle_name); + } + else { + _name = "hls_stream"; + } +#else + _name = typeid(*this).name(); +#endif + + ss << _counter++; + _name += "." + ss.str(); + } + + stream(const std::string name) { + // default constructor, + // capacity set to predefined maximum + _name = name; + } + + /// Make copy constructor and assignment operator private + private: + stream(const stream< __STREAM_T__ >& chn): + _name(chn._name), _data(chn._data) { + } + + stream& operator = (const stream< __STREAM_T__ >& chn) { + _name = chn._name; + _data = chn._data; + return *this; + } + + public: + /// Overload >> and << operators to implement read() and write() + void operator >> (__STREAM_T__& rdata) { + read(rdata); + } + + void operator << (const __STREAM_T__& wdata) { + write(wdata); + } + + + public: + /// Destructor + /// Check status of the queue + virtual ~stream() { + if (!_data.empty()) + { + std::cout << "WARNING: Hls::stream '" + << _name + << "' contains leftover data," + << " which may result in RTL simulation hanging." 
+ << std::endl; + } + } + + /// Status of the queue + bool empty() { +#ifdef HLS_STREAM_THREAD_SAFE + std::lock_guard lg(_mutex); +#endif + return _data.empty(); + } + + bool full() const { return false; } + + /// Blocking read + void read(__STREAM_T__& head) { + head = read(); + } + +#ifdef HLS_STREAM_THREAD_SAFE + __STREAM_T__ read() { + std::unique_lock ul(_mutex); + while (_data.empty()) { + _condition_var.wait(ul); + } + + __STREAM_T__ elem; + elem = _data.front(); + _data.pop_front(); + return elem; + } +#else + __STREAM_T__ read() { + __STREAM_T__ elem; + if (_data.empty()) { + std::cout << "WARNING: Hls::stream '" + << _name + << "' is read while empty," + << " which may result in RTL simulation hanging." + << std::endl; + elem = __STREAM_T__(); + } else { + elem = _data.front(); + _data.pop_front(); + } + return elem; + } +#endif + + /// Blocking write + void write(const __STREAM_T__& tail) { +#ifdef HLS_STREAM_THREAD_SAFE + std::unique_lock ul(_mutex); +#endif + _data.push_back(tail); +#ifdef HLS_STREAM_THREAD_SAFE + _condition_var.notify_one(); +#endif + } + + /// Nonblocking read + bool read_nb(__STREAM_T__& head) { +#ifdef HLS_STREAM_THREAD_SAFE + std::lock_guard lg(_mutex); +#endif + bool is_empty = _data.empty(); + if (is_empty) { + head = __STREAM_T__(); + } else { + __STREAM_T__ elem(_data.front()); + _data.pop_front(); + head = elem; + } + return !is_empty; + } + + /// Nonblocking write + bool write_nb(const __STREAM_T__& tail) { + bool is_full = full(); + write(tail); + return !is_full; + } + + /// Fifo size + size_t size() { + return _data.size(); + } +}; + +} // namespace hls + +#endif // __cplusplus +#endif // X_HLS_STREAM_SIM_H + diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/utils/x_hls_utils.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/utils/x_hls_utils.h new file mode 100644 index 00000000..3e751c36 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/utils/x_hls_utils.h @@ -0,0 +1,80 @@ +#ifndef X_HLS_UTILS_H +#define X_HLS_UTILS_H +#include "ap_fixed.h" +#include + +namespace hls { + + template + class numeric_limits { + public: + static T max() { return std::numeric_limits::max(); } + static T min() { return std::numeric_limits::min(); } + static T epsilon() { return std::numeric_limits::epsilon(); } + }; + + template + class numeric_limits > { + public: + static ap_fixed max() { + ap_int m = ::hls::numeric_limits >::max(); + ap_fixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_fixed min() { + ap_int m = ::hls::numeric_limits >::min(); + ap_fixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_fixed epsilon() { + ap_fixed x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_ufixed max() { + ap_uint m = ::hls::numeric_limits >::max(); + ap_ufixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_ufixed min() { return 0; } + static ap_ufixed epsilon() { + ap_ufixed x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_int max() { ap_int m = min(); return ~m; } + static ap_int min() { ap_int m = 0; m[W-1] = 1; return m; } + static ap_int epsilon() { + ap_int x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_uint max() { ap_uint zero = 0; return ~zero; } + static ap_uint min() { return 0; } + static ap_uint epsilon() { + ap_uint x = 0; + x[0] = 1; + 
return x; + } + }; +} + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/defines.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/defines.h new file mode 100644 index 00000000..cf8d98c3 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/defines.h @@ -0,0 +1,67 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include "ap_fixed.h" +#include "ap_int.h" +#include "nnet_utils/nnet_types.h" +#include +#include + +// hls-fpga-machine-learning insert numbers +#define N_INPUT_1_1 100 +#define N_INPUT_1_2 100 +#define N_LAYER_1_3 100 +#define N_LAYER_2_3 2 +#define N_LAYER_1_4 100 +#define N_LAYER_2_4 2 +#define N_INPUT_1_5 100 +#define N_INPUT_2_5 4 +#define OUT_CONCAT_0_6 100 +#define OUT_CONCAT_1_6 4 +#define OUT_CONCAT_0_7 100 +#define OUT_CONCAT_1_7 8 +#define N_OUTPUTS_22 100 +#define N_FILT_22 12 +#define N_LAYER_1_8 100 +#define N_LAYER_2_8 12 +#define N_OUTPUTS_23 100 +#define N_FILT_23 36 +#define N_LAYER_1_12 100 +#define N_LAYER_2_12 36 +#define N_OUTPUTS_24 100 +#define N_FILT_24 1 +#define N_INPUT_1_19 100 +#define N_INPUT_2_19 2 +#define N_INPUT_1_19 100 +#define N_INPUT_2_19 2 +#define N_FILT_21 2 + +// hls-fpga-machine-learning insert layer-precision +typedef ap_uint<4> input_t; +typedef ap_uint<4> input2_t; +typedef ap_fixed<32,16> layer3_t; +typedef ap_fixed<32,16> embedding0_embeddings_t; +typedef ap_fixed<32,16> layer4_t; +typedef ap_fixed<32,16> embedding1_embeddings_t; +typedef ap_fixed<32,16> input5_t; +typedef ap_fixed<32,16> layer6_t; +typedef ap_fixed<32,16> layer7_t; +typedef ap_fixed<32,16> model_default_t; +typedef ap_fixed<32,16> layer22_t; +typedef ap_fixed<32,16> dense_weight_t; +typedef ap_fixed<32,16> dense_bias_t; +typedef ap_fixed<32,16> layer11_t; +typedef ap_fixed<18,8> activation_table_t; +typedef ap_fixed<32,16> layer23_t; +typedef ap_fixed<32,16> dense_1_weight_t; +typedef ap_fixed<32,16> dense_1_bias_t; +typedef ap_fixed<32,16> layer15_t; +typedef ap_fixed<18,8> activation_1_table_t; +typedef ap_fixed<32,16> layer24_t; +typedef ap_fixed<32,16> met_weight_weight_t; +typedef ap_fixed<32,16> met_weight_bias_t; +typedef ap_fixed<32,16> input19_t; +typedef ap_fixed<32,16> layer20_t; +typedef ap_fixed<32,16> result_t; + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 00000000..8baadf28 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,777 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "ap_fixed.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ap_fixed<18, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// 
+// *************************************************
+// RELU Activation
+// *************************************************
+template <class data_T, class res_T, typename CONFIG_T> void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    #pragma HLS PIPELINE
+
+    data_T datareg;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        if (datareg > 0)
+            res[ii] = datareg;
+        else
+            res[ii] = 0;
+    }
+}
+
+template <class data_T, class res_T, int MAX_INT, typename CONFIG_T>
+void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    #pragma HLS PIPELINE
+
+    data_T datareg;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        if (datareg < 0)
+            res[ii] = 0;
+        else if (datareg > MAX_INT)
+            res[ii] = MAX_INT;
+        else
+            res[ii] = datareg;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    relu_max<data_T, res_T, 6, CONFIG_T>(data, res);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    relu_max<data_T, res_T, 1, CONFIG_T>(data, res);
+}
+
+// *************************************************
+// Sigmoid Activation
+// *************************************************
+inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); }
+
+template <typename CONFIG_T, int N_TABLE> void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    // Default logistic sigmoid function:
+    //   result = 1/(1+e^(-x))
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to X-value (signed 8-bit, range -8 to +8)
+        float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
+        // Next, compute lookup table function
+        typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val);
+        // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
+        table_out[ii] = real_val;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    // Initialize the lookup table
+#ifdef __HLS_SYN__
+    bool initialized = false;
+    typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size];
+#else
+    static bool initialized = false;
+    static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size];
+#endif
+    if (!initialized) {
+        init_sigmoid_table<CONFIG_T, CONFIG_T::table_size>(sigmoid_table);
+        initialized = true;
+    }
+
+    #pragma HLS PIPELINE
+
+    // Index into the lookup table based on data
+    int data_round;
+    int index;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round = data[ii] * CONFIG_T::table_size / 16;
+        index = data_round + 8 * CONFIG_T::table_size / 16;
+        if (index < 0)
+            index = 0;
+        if (index > CONFIG_T::table_size - 1)
+            index = CONFIG_T::table_size - 1;
+        res[ii] = (res_T)sigmoid_table[index];
+    }
+}
+
+// *************************************************
+// Softmax Activation
+// *************************************************
+
+enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
+
+inline float exp_fcn_float(float input) { return std::exp(input); }
+
+template <class data_T, typename CONFIG_T> inline float softmax_real_val_from_idx(unsigned i) {
+    // Treat the index as the top N bits
+    static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table
+    data_T x(0);
+    x(x.width - 1, x.width - N) = i;
+    return (float)x;
+}
+
+template <class data_T, typename CONFIG_T> inline unsigned softmax_idx_from_real_val(data_T x) {
+    // Slice the top N bits to get an index into the table
+    static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table
+    ap_uint<N> y = x(x.width - 1, x.width - N); // slice the top N bits of input
+    return (unsigned)y(N - 1, 0);
+}
+
+template <class data_T, typename CONFIG_T>
+void init_exp_table(typename CONFIG_T::exp_table_t
table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; + } +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::inv_table_t inv_x = 1 / x; + table_out[i] = inv_x; + } +} + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
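+    // (Illustrative aside: reduce<> builds a balanced binary adder tree, e.g.
+    //  for n_in = 4 it computes (e0 + e1) + (e2 + e3) in log2(4) = 2 adder
+    //  stages rather than a serial chain of 3 dependent additions.)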
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ap_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
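+    // (Illustrative aside: because x_max was subtracted above, every d_xi_xmax
+    //  is <= 0, so each exp_res term lies in (0, 1] and exp_sum is bounded by
+    //  n_in -- this is what keeps the invert_table lookup in a safe range.)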
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +64) + float in_val = 64.0 * ii / float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision + data_T data_cache[CONFIG_T::n_in]; + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_cache[ii] = data[ii]; + exp_res[ii] = 0; + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) + exp_diff_res = 1; + else { + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + exp_res[ii] += exp_diff_res; + } + } + + // Second loop to invert + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; + } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS UNROLL + res[i] = (res_T)0; + } + + data_T maximum = data[0]; + int idx = 0; + + for (int i = 1; i < CONFIG_T::n_in; i++) { + #pragma HLS PIPELINE + 
if (data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } + + res[idx] = (res_T)1; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS inline + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; + } +} + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS PIPELINE + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// 
************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > 
CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + #pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + #pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// 
PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 00000000..b72809ef --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,777 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "ap_fixed.h" +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_stream.h" +#include "nnet_types.h" +#include + +namespace nnet { + +// ************************************************* +// LINEAR Activation +// ************************************************* +template void linear(hls::stream &data, hls::stream &res) { +LinearActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + LinearPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(hls::stream &data, hls::stream &res) { +ReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* + +template void sigmoid(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t 
sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = sigmoid_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +template +void softmax_latency(hls::stream &data, hls::stream &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); +SoftmaxExpLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxExpPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(in_pack[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
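+        // (Illustrative aside: with this build's reuse_factor = 1 -- see the
+        //  rf1 tag in the output directory name -- multiplier_limit =
+        //  DIV_ROUNDUP(data_T::size, 1) = data_T::size and ii = 1, i.e. fully
+        //  parallel multipliers accepting a new input pack every cycle; a larger
+        //  reuse_factor trades multipliers for initiation interval.)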
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_stable(hls::stream &data, hls::stream &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + typename data_T::value_type data_array[data_T::size]; +#pragma HLS ARRAY_PARTITION variable=data_array complete +SoftmaxArrayLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxArrayPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + typename data_T::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ap_fixed d_xi_xmax[data_T::size]; + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + #pragma HLS ARRAY_PARTITION variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
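+        // (Illustrative aside: softmax_idx_from_real_val keys the tables on the
+        //  top ceillog2(table_size) bits of the operand, so with the default
+        //  table_size = 1024 each lookup resolves 10 address bits and nearby
+        //  inputs share an entry -- table_size is the accuracy knob here.)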
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_legacy(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[data_T::size]; + typename CONFIG_T::table_t exp_diff_res; + typename data_T::value_type data_cache[data_T::size]; + +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + #pragma HLS PIPELINE + data_T in_pack = data.read(); + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + for (int i = 0; i < data_T::size; i++) { + #pragma HLS UNROLL + SoftmaxExpInner: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + + exp_res[i] += exp_diff_res; + } + } + + res_T out_pack; + PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + + int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; + } + res.write(out_pack); + } +} + +template +void softmax_argmax(hls::stream &data, hls::stream &res) { + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + res_T out_data; + + for (int i = 0; i < res_T::size; i++) { + #pragma HLS UNROLL + out_data[i] = (typename res_T::value_type)0; + } + + typename data_T::value_type maximum = in_data[0]; + int idx = 0; + + for (int i = 1; i < res_T::size; i++) { + #pragma HLS PIPELINE + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = (typename res_T::value_type)1; + res.write(out_data); + } +} + +template void softmax(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::axis == -1); + + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case 
softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* + +template void tanh(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 8; + int index = data_round + 4 * CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = tanh_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* + +template +void hard_sigmoid(hls::stream &data, hls::stream &res) { + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res.write(out_data); + } +} + +template void hard_tanh(hls::stream &data, hls::stream &res) { + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* + +template +void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { +LeakyReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + LeakyReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* + +template +void thresholded_relu(hls::stream &data, typename data_T::value_type theta, hls::stream &res) { +ThresholdedReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + 
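+        // (Illustrative note: this implements Keras' ThresholdedReLU,
+        //  f(x) = x for x > theta and 0 otherwise -- unlike leaky_relu above,
+        //  values at or below the threshold are zeroed rather than scaled.)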
ThresholdedReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* + +template void softplus(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* + +template void softsign(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softsign_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * 
elu_table[index]; + } + } + res.write(out_data); + } +} + +template void elu(hls::stream &data, hls::stream &res) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* + +template void selu(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = (typename data_T::value_type)1.0507009873554804934193349852946 * datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + res.write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* + +template +void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T::n_in], hls::stream &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(hls::stream &data, hls::stream &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; + } + res.write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(hls::stream &data, hls::stream &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; + } + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_array.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_array.h new file mode 
100644 index 00000000..d179102a --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_array.h @@ -0,0 +1,52 @@ +#ifndef NNET_ARRAY_H_ +#define NNET_ARRAY_H_ + +#include + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::height; i++) { + for (int j = 0; j < CONFIG_T::width; j++) { + data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + unsigned dims_t[3]; + dims_t[0] = dims[CONFIG_T::perm[0]]; + dims_t[1] = dims[CONFIG_T::perm[1]]; + dims_t[2] = dims[CONFIG_T::perm[2]]; + + int idx[3] = {0}, idx_t[3] = {0}; + for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { + for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { + for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { + idx_t[0] = idx[CONFIG_T::perm[0]]; + idx_t[1] = idx[CONFIG_T::perm[1]]; + idx_t[2] = idx[CONFIG_T::perm[2]]; + + data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = + data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm.h new file mode 100644 index 00000000..d8be45b7 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,124 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
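+    // (Illustrative note: at inference time batch normalization reduces to the
+    //  affine map y = s * x + b, where hls4ml precomputes s = gamma /
+    //  sqrt(var + eps) and b = beta - s * mean offline -- which is why this
+    //  layer only carries scale_t/bias_t arrays and no running statistics.)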
+ template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + data_T cache; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=scale,bias + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Calcuate result +Result: + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in], + data_T threshold[CONFIG_T::n_scale_bias]) { + #pragma HLS PIPELINE + #pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ap_uint<1> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS PIPELINE + #pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ap_int<2> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 00000000..a064677d --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,123 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +BatchNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormpack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold_hi complete + #pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_code_gen.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_code_gen.h new file mode 100644 index 00000000..5bffda3d --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_code_gen.h @@ -0,0 +1,1262 @@ +#ifndef NNET_INSTR_GEN_H_ +#define NNET_INSTR_GEN_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +// hls4ml insert code +template +class fill_buffer_22 : public FillConv1DBuffer { + public: + static void fill_buffer( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition + ) { + if (partition == 0) { + buffer[0][0] = data[0]; buffer[0][1] = data[1]; buffer[0][2] = data[2]; buffer[0][3] = data[3]; buffer[0][4] = data[4]; buffer[0][5] = data[5]; buffer[0][6] = data[6]; buffer[0][7] = data[7]; + + } + if (partition == 1) { + buffer[0][0] = data[8]; buffer[0][1] = data[9]; buffer[0][2] = data[10]; buffer[0][3] = data[11]; buffer[0][4] = data[12]; buffer[0][5] = data[13]; buffer[0][6] = data[14]; buffer[0][7] = data[15]; + + } + if (partition == 2) { + buffer[0][0] = data[16]; buffer[0][1] = data[17]; buffer[0][2] = data[18]; buffer[0][3] = data[19]; buffer[0][4] = data[20]; buffer[0][5] = data[21]; buffer[0][6] = data[22]; buffer[0][7] = data[23]; + + } + if (partition == 3) { + buffer[0][0] = data[24]; buffer[0][1] = data[25]; buffer[0][2] = data[26]; buffer[0][3] = data[27]; buffer[0][4] = data[28]; buffer[0][5] = data[29]; buffer[0][6] = data[30]; buffer[0][7] = data[31]; + + } + if (partition == 4) { + buffer[0][0] = data[32]; buffer[0][1] = data[33]; buffer[0][2] = data[34]; buffer[0][3] = data[35]; 
buffer[0][4] = data[36]; buffer[0][5] = data[37]; buffer[0][6] = data[38]; buffer[0][7] = data[39]; + + } + if (partition == 5) { + buffer[0][0] = data[40]; buffer[0][1] = data[41]; buffer[0][2] = data[42]; buffer[0][3] = data[43]; buffer[0][4] = data[44]; buffer[0][5] = data[45]; buffer[0][6] = data[46]; buffer[0][7] = data[47]; + + } + if (partition == 6) { + buffer[0][0] = data[48]; buffer[0][1] = data[49]; buffer[0][2] = data[50]; buffer[0][3] = data[51]; buffer[0][4] = data[52]; buffer[0][5] = data[53]; buffer[0][6] = data[54]; buffer[0][7] = data[55]; + + } + if (partition == 7) { + buffer[0][0] = data[56]; buffer[0][1] = data[57]; buffer[0][2] = data[58]; buffer[0][3] = data[59]; buffer[0][4] = data[60]; buffer[0][5] = data[61]; buffer[0][6] = data[62]; buffer[0][7] = data[63]; + + } + if (partition == 8) { + buffer[0][0] = data[64]; buffer[0][1] = data[65]; buffer[0][2] = data[66]; buffer[0][3] = data[67]; buffer[0][4] = data[68]; buffer[0][5] = data[69]; buffer[0][6] = data[70]; buffer[0][7] = data[71]; + + } + if (partition == 9) { + buffer[0][0] = data[72]; buffer[0][1] = data[73]; buffer[0][2] = data[74]; buffer[0][3] = data[75]; buffer[0][4] = data[76]; buffer[0][5] = data[77]; buffer[0][6] = data[78]; buffer[0][7] = data[79]; + + } + if (partition == 10) { + buffer[0][0] = data[80]; buffer[0][1] = data[81]; buffer[0][2] = data[82]; buffer[0][3] = data[83]; buffer[0][4] = data[84]; buffer[0][5] = data[85]; buffer[0][6] = data[86]; buffer[0][7] = data[87]; + + } + if (partition == 11) { + buffer[0][0] = data[88]; buffer[0][1] = data[89]; buffer[0][2] = data[90]; buffer[0][3] = data[91]; buffer[0][4] = data[92]; buffer[0][5] = data[93]; buffer[0][6] = data[94]; buffer[0][7] = data[95]; + + } + if (partition == 12) { + buffer[0][0] = data[96]; buffer[0][1] = data[97]; buffer[0][2] = data[98]; buffer[0][3] = data[99]; buffer[0][4] = data[100]; buffer[0][5] = data[101]; buffer[0][6] = data[102]; buffer[0][7] = data[103]; + + } + if (partition == 13) { + buffer[0][0] = data[104]; buffer[0][1] = data[105]; buffer[0][2] = data[106]; buffer[0][3] = data[107]; buffer[0][4] = data[108]; buffer[0][5] = data[109]; buffer[0][6] = data[110]; buffer[0][7] = data[111]; + + } + if (partition == 14) { + buffer[0][0] = data[112]; buffer[0][1] = data[113]; buffer[0][2] = data[114]; buffer[0][3] = data[115]; buffer[0][4] = data[116]; buffer[0][5] = data[117]; buffer[0][6] = data[118]; buffer[0][7] = data[119]; + + } + if (partition == 15) { + buffer[0][0] = data[120]; buffer[0][1] = data[121]; buffer[0][2] = data[122]; buffer[0][3] = data[123]; buffer[0][4] = data[124]; buffer[0][5] = data[125]; buffer[0][6] = data[126]; buffer[0][7] = data[127]; + + } + if (partition == 16) { + buffer[0][0] = data[128]; buffer[0][1] = data[129]; buffer[0][2] = data[130]; buffer[0][3] = data[131]; buffer[0][4] = data[132]; buffer[0][5] = data[133]; buffer[0][6] = data[134]; buffer[0][7] = data[135]; + + } + if (partition == 17) { + buffer[0][0] = data[136]; buffer[0][1] = data[137]; buffer[0][2] = data[138]; buffer[0][3] = data[139]; buffer[0][4] = data[140]; buffer[0][5] = data[141]; buffer[0][6] = data[142]; buffer[0][7] = data[143]; + + } + if (partition == 18) { + buffer[0][0] = data[144]; buffer[0][1] = data[145]; buffer[0][2] = data[146]; buffer[0][3] = data[147]; buffer[0][4] = data[148]; buffer[0][5] = data[149]; buffer[0][6] = data[150]; buffer[0][7] = data[151]; + + } + if (partition == 19) { + buffer[0][0] = data[152]; buffer[0][1] = data[153]; buffer[0][2] = data[154]; buffer[0][3] = data[155]; 
buffer[0][4] = data[156]; buffer[0][5] = data[157]; buffer[0][6] = data[158]; buffer[0][7] = data[159]; + + } + if (partition == 20) { + buffer[0][0] = data[160]; buffer[0][1] = data[161]; buffer[0][2] = data[162]; buffer[0][3] = data[163]; buffer[0][4] = data[164]; buffer[0][5] = data[165]; buffer[0][6] = data[166]; buffer[0][7] = data[167]; + + } + if (partition == 21) { + buffer[0][0] = data[168]; buffer[0][1] = data[169]; buffer[0][2] = data[170]; buffer[0][3] = data[171]; buffer[0][4] = data[172]; buffer[0][5] = data[173]; buffer[0][6] = data[174]; buffer[0][7] = data[175]; + + } + if (partition == 22) { + buffer[0][0] = data[176]; buffer[0][1] = data[177]; buffer[0][2] = data[178]; buffer[0][3] = data[179]; buffer[0][4] = data[180]; buffer[0][5] = data[181]; buffer[0][6] = data[182]; buffer[0][7] = data[183]; + + } + if (partition == 23) { + buffer[0][0] = data[184]; buffer[0][1] = data[185]; buffer[0][2] = data[186]; buffer[0][3] = data[187]; buffer[0][4] = data[188]; buffer[0][5] = data[189]; buffer[0][6] = data[190]; buffer[0][7] = data[191]; + + } + if (partition == 24) { + buffer[0][0] = data[192]; buffer[0][1] = data[193]; buffer[0][2] = data[194]; buffer[0][3] = data[195]; buffer[0][4] = data[196]; buffer[0][5] = data[197]; buffer[0][6] = data[198]; buffer[0][7] = data[199]; + + } + if (partition == 25) { + buffer[0][0] = data[200]; buffer[0][1] = data[201]; buffer[0][2] = data[202]; buffer[0][3] = data[203]; buffer[0][4] = data[204]; buffer[0][5] = data[205]; buffer[0][6] = data[206]; buffer[0][7] = data[207]; + + } + if (partition == 26) { + buffer[0][0] = data[208]; buffer[0][1] = data[209]; buffer[0][2] = data[210]; buffer[0][3] = data[211]; buffer[0][4] = data[212]; buffer[0][5] = data[213]; buffer[0][6] = data[214]; buffer[0][7] = data[215]; + + } + if (partition == 27) { + buffer[0][0] = data[216]; buffer[0][1] = data[217]; buffer[0][2] = data[218]; buffer[0][3] = data[219]; buffer[0][4] = data[220]; buffer[0][5] = data[221]; buffer[0][6] = data[222]; buffer[0][7] = data[223]; + + } + if (partition == 28) { + buffer[0][0] = data[224]; buffer[0][1] = data[225]; buffer[0][2] = data[226]; buffer[0][3] = data[227]; buffer[0][4] = data[228]; buffer[0][5] = data[229]; buffer[0][6] = data[230]; buffer[0][7] = data[231]; + + } + if (partition == 29) { + buffer[0][0] = data[232]; buffer[0][1] = data[233]; buffer[0][2] = data[234]; buffer[0][3] = data[235]; buffer[0][4] = data[236]; buffer[0][5] = data[237]; buffer[0][6] = data[238]; buffer[0][7] = data[239]; + + } + if (partition == 30) { + buffer[0][0] = data[240]; buffer[0][1] = data[241]; buffer[0][2] = data[242]; buffer[0][3] = data[243]; buffer[0][4] = data[244]; buffer[0][5] = data[245]; buffer[0][6] = data[246]; buffer[0][7] = data[247]; + + } + if (partition == 31) { + buffer[0][0] = data[248]; buffer[0][1] = data[249]; buffer[0][2] = data[250]; buffer[0][3] = data[251]; buffer[0][4] = data[252]; buffer[0][5] = data[253]; buffer[0][6] = data[254]; buffer[0][7] = data[255]; + + } + if (partition == 32) { + buffer[0][0] = data[256]; buffer[0][1] = data[257]; buffer[0][2] = data[258]; buffer[0][3] = data[259]; buffer[0][4] = data[260]; buffer[0][5] = data[261]; buffer[0][6] = data[262]; buffer[0][7] = data[263]; + + } + if (partition == 33) { + buffer[0][0] = data[264]; buffer[0][1] = data[265]; buffer[0][2] = data[266]; buffer[0][3] = data[267]; buffer[0][4] = data[268]; buffer[0][5] = data[269]; buffer[0][6] = data[270]; buffer[0][7] = data[271]; + + } + if (partition == 34) { + buffer[0][0] = data[272]; buffer[0][1] = 
data[273]; buffer[0][2] = data[274]; buffer[0][3] = data[275]; buffer[0][4] = data[276]; buffer[0][5] = data[277]; buffer[0][6] = data[278]; buffer[0][7] = data[279]; + + } + if (partition == 35) { + buffer[0][0] = data[280]; buffer[0][1] = data[281]; buffer[0][2] = data[282]; buffer[0][3] = data[283]; buffer[0][4] = data[284]; buffer[0][5] = data[285]; buffer[0][6] = data[286]; buffer[0][7] = data[287]; + + } + if (partition == 36) { + buffer[0][0] = data[288]; buffer[0][1] = data[289]; buffer[0][2] = data[290]; buffer[0][3] = data[291]; buffer[0][4] = data[292]; buffer[0][5] = data[293]; buffer[0][6] = data[294]; buffer[0][7] = data[295]; + + } + if (partition == 37) { + buffer[0][0] = data[296]; buffer[0][1] = data[297]; buffer[0][2] = data[298]; buffer[0][3] = data[299]; buffer[0][4] = data[300]; buffer[0][5] = data[301]; buffer[0][6] = data[302]; buffer[0][7] = data[303]; + + } + if (partition == 38) { + buffer[0][0] = data[304]; buffer[0][1] = data[305]; buffer[0][2] = data[306]; buffer[0][3] = data[307]; buffer[0][4] = data[308]; buffer[0][5] = data[309]; buffer[0][6] = data[310]; buffer[0][7] = data[311]; + + } + if (partition == 39) { + buffer[0][0] = data[312]; buffer[0][1] = data[313]; buffer[0][2] = data[314]; buffer[0][3] = data[315]; buffer[0][4] = data[316]; buffer[0][5] = data[317]; buffer[0][6] = data[318]; buffer[0][7] = data[319]; + + } + if (partition == 40) { + buffer[0][0] = data[320]; buffer[0][1] = data[321]; buffer[0][2] = data[322]; buffer[0][3] = data[323]; buffer[0][4] = data[324]; buffer[0][5] = data[325]; buffer[0][6] = data[326]; buffer[0][7] = data[327]; + + } + if (partition == 41) { + buffer[0][0] = data[328]; buffer[0][1] = data[329]; buffer[0][2] = data[330]; buffer[0][3] = data[331]; buffer[0][4] = data[332]; buffer[0][5] = data[333]; buffer[0][6] = data[334]; buffer[0][7] = data[335]; + + } + if (partition == 42) { + buffer[0][0] = data[336]; buffer[0][1] = data[337]; buffer[0][2] = data[338]; buffer[0][3] = data[339]; buffer[0][4] = data[340]; buffer[0][5] = data[341]; buffer[0][6] = data[342]; buffer[0][7] = data[343]; + + } + if (partition == 43) { + buffer[0][0] = data[344]; buffer[0][1] = data[345]; buffer[0][2] = data[346]; buffer[0][3] = data[347]; buffer[0][4] = data[348]; buffer[0][5] = data[349]; buffer[0][6] = data[350]; buffer[0][7] = data[351]; + + } + if (partition == 44) { + buffer[0][0] = data[352]; buffer[0][1] = data[353]; buffer[0][2] = data[354]; buffer[0][3] = data[355]; buffer[0][4] = data[356]; buffer[0][5] = data[357]; buffer[0][6] = data[358]; buffer[0][7] = data[359]; + + } + if (partition == 45) { + buffer[0][0] = data[360]; buffer[0][1] = data[361]; buffer[0][2] = data[362]; buffer[0][3] = data[363]; buffer[0][4] = data[364]; buffer[0][5] = data[365]; buffer[0][6] = data[366]; buffer[0][7] = data[367]; + + } + if (partition == 46) { + buffer[0][0] = data[368]; buffer[0][1] = data[369]; buffer[0][2] = data[370]; buffer[0][3] = data[371]; buffer[0][4] = data[372]; buffer[0][5] = data[373]; buffer[0][6] = data[374]; buffer[0][7] = data[375]; + + } + if (partition == 47) { + buffer[0][0] = data[376]; buffer[0][1] = data[377]; buffer[0][2] = data[378]; buffer[0][3] = data[379]; buffer[0][4] = data[380]; buffer[0][5] = data[381]; buffer[0][6] = data[382]; buffer[0][7] = data[383]; + + } + if (partition == 48) { + buffer[0][0] = data[384]; buffer[0][1] = data[385]; buffer[0][2] = data[386]; buffer[0][3] = data[387]; buffer[0][4] = data[388]; buffer[0][5] = data[389]; buffer[0][6] = data[390]; buffer[0][7] = data[391]; + + } + if 
(partition == 49) { + buffer[0][0] = data[392]; buffer[0][1] = data[393]; buffer[0][2] = data[394]; buffer[0][3] = data[395]; buffer[0][4] = data[396]; buffer[0][5] = data[397]; buffer[0][6] = data[398]; buffer[0][7] = data[399]; + + } + if (partition == 50) { + buffer[0][0] = data[400]; buffer[0][1] = data[401]; buffer[0][2] = data[402]; buffer[0][3] = data[403]; buffer[0][4] = data[404]; buffer[0][5] = data[405]; buffer[0][6] = data[406]; buffer[0][7] = data[407]; + + } + if (partition == 51) { + buffer[0][0] = data[408]; buffer[0][1] = data[409]; buffer[0][2] = data[410]; buffer[0][3] = data[411]; buffer[0][4] = data[412]; buffer[0][5] = data[413]; buffer[0][6] = data[414]; buffer[0][7] = data[415]; + + } + if (partition == 52) { + buffer[0][0] = data[416]; buffer[0][1] = data[417]; buffer[0][2] = data[418]; buffer[0][3] = data[419]; buffer[0][4] = data[420]; buffer[0][5] = data[421]; buffer[0][6] = data[422]; buffer[0][7] = data[423]; + + } + if (partition == 53) { + buffer[0][0] = data[424]; buffer[0][1] = data[425]; buffer[0][2] = data[426]; buffer[0][3] = data[427]; buffer[0][4] = data[428]; buffer[0][5] = data[429]; buffer[0][6] = data[430]; buffer[0][7] = data[431]; + + } + if (partition == 54) { + buffer[0][0] = data[432]; buffer[0][1] = data[433]; buffer[0][2] = data[434]; buffer[0][3] = data[435]; buffer[0][4] = data[436]; buffer[0][5] = data[437]; buffer[0][6] = data[438]; buffer[0][7] = data[439]; + + } + if (partition == 55) { + buffer[0][0] = data[440]; buffer[0][1] = data[441]; buffer[0][2] = data[442]; buffer[0][3] = data[443]; buffer[0][4] = data[444]; buffer[0][5] = data[445]; buffer[0][6] = data[446]; buffer[0][7] = data[447]; + + } + if (partition == 56) { + buffer[0][0] = data[448]; buffer[0][1] = data[449]; buffer[0][2] = data[450]; buffer[0][3] = data[451]; buffer[0][4] = data[452]; buffer[0][5] = data[453]; buffer[0][6] = data[454]; buffer[0][7] = data[455]; + + } + if (partition == 57) { + buffer[0][0] = data[456]; buffer[0][1] = data[457]; buffer[0][2] = data[458]; buffer[0][3] = data[459]; buffer[0][4] = data[460]; buffer[0][5] = data[461]; buffer[0][6] = data[462]; buffer[0][7] = data[463]; + + } + if (partition == 58) { + buffer[0][0] = data[464]; buffer[0][1] = data[465]; buffer[0][2] = data[466]; buffer[0][3] = data[467]; buffer[0][4] = data[468]; buffer[0][5] = data[469]; buffer[0][6] = data[470]; buffer[0][7] = data[471]; + + } + if (partition == 59) { + buffer[0][0] = data[472]; buffer[0][1] = data[473]; buffer[0][2] = data[474]; buffer[0][3] = data[475]; buffer[0][4] = data[476]; buffer[0][5] = data[477]; buffer[0][6] = data[478]; buffer[0][7] = data[479]; + + } + if (partition == 60) { + buffer[0][0] = data[480]; buffer[0][1] = data[481]; buffer[0][2] = data[482]; buffer[0][3] = data[483]; buffer[0][4] = data[484]; buffer[0][5] = data[485]; buffer[0][6] = data[486]; buffer[0][7] = data[487]; + + } + if (partition == 61) { + buffer[0][0] = data[488]; buffer[0][1] = data[489]; buffer[0][2] = data[490]; buffer[0][3] = data[491]; buffer[0][4] = data[492]; buffer[0][5] = data[493]; buffer[0][6] = data[494]; buffer[0][7] = data[495]; + + } + if (partition == 62) { + buffer[0][0] = data[496]; buffer[0][1] = data[497]; buffer[0][2] = data[498]; buffer[0][3] = data[499]; buffer[0][4] = data[500]; buffer[0][5] = data[501]; buffer[0][6] = data[502]; buffer[0][7] = data[503]; + + } + if (partition == 63) { + buffer[0][0] = data[504]; buffer[0][1] = data[505]; buffer[0][2] = data[506]; buffer[0][3] = data[507]; buffer[0][4] = data[508]; buffer[0][5] = data[509]; 
buffer[0][6] = data[510]; buffer[0][7] = data[511]; + + } + if (partition == 64) { + buffer[0][0] = data[512]; buffer[0][1] = data[513]; buffer[0][2] = data[514]; buffer[0][3] = data[515]; buffer[0][4] = data[516]; buffer[0][5] = data[517]; buffer[0][6] = data[518]; buffer[0][7] = data[519]; + + } + if (partition == 65) { + buffer[0][0] = data[520]; buffer[0][1] = data[521]; buffer[0][2] = data[522]; buffer[0][3] = data[523]; buffer[0][4] = data[524]; buffer[0][5] = data[525]; buffer[0][6] = data[526]; buffer[0][7] = data[527]; + + } + if (partition == 66) { + buffer[0][0] = data[528]; buffer[0][1] = data[529]; buffer[0][2] = data[530]; buffer[0][3] = data[531]; buffer[0][4] = data[532]; buffer[0][5] = data[533]; buffer[0][6] = data[534]; buffer[0][7] = data[535]; + + } + if (partition == 67) { + buffer[0][0] = data[536]; buffer[0][1] = data[537]; buffer[0][2] = data[538]; buffer[0][3] = data[539]; buffer[0][4] = data[540]; buffer[0][5] = data[541]; buffer[0][6] = data[542]; buffer[0][7] = data[543]; + + } + if (partition == 68) { + buffer[0][0] = data[544]; buffer[0][1] = data[545]; buffer[0][2] = data[546]; buffer[0][3] = data[547]; buffer[0][4] = data[548]; buffer[0][5] = data[549]; buffer[0][6] = data[550]; buffer[0][7] = data[551]; + + } + if (partition == 69) { + buffer[0][0] = data[552]; buffer[0][1] = data[553]; buffer[0][2] = data[554]; buffer[0][3] = data[555]; buffer[0][4] = data[556]; buffer[0][5] = data[557]; buffer[0][6] = data[558]; buffer[0][7] = data[559]; + + } + if (partition == 70) { + buffer[0][0] = data[560]; buffer[0][1] = data[561]; buffer[0][2] = data[562]; buffer[0][3] = data[563]; buffer[0][4] = data[564]; buffer[0][5] = data[565]; buffer[0][6] = data[566]; buffer[0][7] = data[567]; + + } + if (partition == 71) { + buffer[0][0] = data[568]; buffer[0][1] = data[569]; buffer[0][2] = data[570]; buffer[0][3] = data[571]; buffer[0][4] = data[572]; buffer[0][5] = data[573]; buffer[0][6] = data[574]; buffer[0][7] = data[575]; + + } + if (partition == 72) { + buffer[0][0] = data[576]; buffer[0][1] = data[577]; buffer[0][2] = data[578]; buffer[0][3] = data[579]; buffer[0][4] = data[580]; buffer[0][5] = data[581]; buffer[0][6] = data[582]; buffer[0][7] = data[583]; + + } + if (partition == 73) { + buffer[0][0] = data[584]; buffer[0][1] = data[585]; buffer[0][2] = data[586]; buffer[0][3] = data[587]; buffer[0][4] = data[588]; buffer[0][5] = data[589]; buffer[0][6] = data[590]; buffer[0][7] = data[591]; + + } + if (partition == 74) { + buffer[0][0] = data[592]; buffer[0][1] = data[593]; buffer[0][2] = data[594]; buffer[0][3] = data[595]; buffer[0][4] = data[596]; buffer[0][5] = data[597]; buffer[0][6] = data[598]; buffer[0][7] = data[599]; + + } + if (partition == 75) { + buffer[0][0] = data[600]; buffer[0][1] = data[601]; buffer[0][2] = data[602]; buffer[0][3] = data[603]; buffer[0][4] = data[604]; buffer[0][5] = data[605]; buffer[0][6] = data[606]; buffer[0][7] = data[607]; + + } + if (partition == 76) { + buffer[0][0] = data[608]; buffer[0][1] = data[609]; buffer[0][2] = data[610]; buffer[0][3] = data[611]; buffer[0][4] = data[612]; buffer[0][5] = data[613]; buffer[0][6] = data[614]; buffer[0][7] = data[615]; + + } + if (partition == 77) { + buffer[0][0] = data[616]; buffer[0][1] = data[617]; buffer[0][2] = data[618]; buffer[0][3] = data[619]; buffer[0][4] = data[620]; buffer[0][5] = data[621]; buffer[0][6] = data[622]; buffer[0][7] = data[623]; + + } + if (partition == 78) { + buffer[0][0] = data[624]; buffer[0][1] = data[625]; buffer[0][2] = data[626]; buffer[0][3] = 
data[627]; buffer[0][4] = data[628]; buffer[0][5] = data[629]; buffer[0][6] = data[630]; buffer[0][7] = data[631]; + + } + if (partition == 79) { + buffer[0][0] = data[632]; buffer[0][1] = data[633]; buffer[0][2] = data[634]; buffer[0][3] = data[635]; buffer[0][4] = data[636]; buffer[0][5] = data[637]; buffer[0][6] = data[638]; buffer[0][7] = data[639]; + + } + if (partition == 80) { + buffer[0][0] = data[640]; buffer[0][1] = data[641]; buffer[0][2] = data[642]; buffer[0][3] = data[643]; buffer[0][4] = data[644]; buffer[0][5] = data[645]; buffer[0][6] = data[646]; buffer[0][7] = data[647]; + + } + if (partition == 81) { + buffer[0][0] = data[648]; buffer[0][1] = data[649]; buffer[0][2] = data[650]; buffer[0][3] = data[651]; buffer[0][4] = data[652]; buffer[0][5] = data[653]; buffer[0][6] = data[654]; buffer[0][7] = data[655]; + + } + if (partition == 82) { + buffer[0][0] = data[656]; buffer[0][1] = data[657]; buffer[0][2] = data[658]; buffer[0][3] = data[659]; buffer[0][4] = data[660]; buffer[0][5] = data[661]; buffer[0][6] = data[662]; buffer[0][7] = data[663]; + + } + if (partition == 83) { + buffer[0][0] = data[664]; buffer[0][1] = data[665]; buffer[0][2] = data[666]; buffer[0][3] = data[667]; buffer[0][4] = data[668]; buffer[0][5] = data[669]; buffer[0][6] = data[670]; buffer[0][7] = data[671]; + + } + if (partition == 84) { + buffer[0][0] = data[672]; buffer[0][1] = data[673]; buffer[0][2] = data[674]; buffer[0][3] = data[675]; buffer[0][4] = data[676]; buffer[0][5] = data[677]; buffer[0][6] = data[678]; buffer[0][7] = data[679]; + + } + if (partition == 85) { + buffer[0][0] = data[680]; buffer[0][1] = data[681]; buffer[0][2] = data[682]; buffer[0][3] = data[683]; buffer[0][4] = data[684]; buffer[0][5] = data[685]; buffer[0][6] = data[686]; buffer[0][7] = data[687]; + + } + if (partition == 86) { + buffer[0][0] = data[688]; buffer[0][1] = data[689]; buffer[0][2] = data[690]; buffer[0][3] = data[691]; buffer[0][4] = data[692]; buffer[0][5] = data[693]; buffer[0][6] = data[694]; buffer[0][7] = data[695]; + + } + if (partition == 87) { + buffer[0][0] = data[696]; buffer[0][1] = data[697]; buffer[0][2] = data[698]; buffer[0][3] = data[699]; buffer[0][4] = data[700]; buffer[0][5] = data[701]; buffer[0][6] = data[702]; buffer[0][7] = data[703]; + + } + if (partition == 88) { + buffer[0][0] = data[704]; buffer[0][1] = data[705]; buffer[0][2] = data[706]; buffer[0][3] = data[707]; buffer[0][4] = data[708]; buffer[0][5] = data[709]; buffer[0][6] = data[710]; buffer[0][7] = data[711]; + + } + if (partition == 89) { + buffer[0][0] = data[712]; buffer[0][1] = data[713]; buffer[0][2] = data[714]; buffer[0][3] = data[715]; buffer[0][4] = data[716]; buffer[0][5] = data[717]; buffer[0][6] = data[718]; buffer[0][7] = data[719]; + + } + if (partition == 90) { + buffer[0][0] = data[720]; buffer[0][1] = data[721]; buffer[0][2] = data[722]; buffer[0][3] = data[723]; buffer[0][4] = data[724]; buffer[0][5] = data[725]; buffer[0][6] = data[726]; buffer[0][7] = data[727]; + + } + if (partition == 91) { + buffer[0][0] = data[728]; buffer[0][1] = data[729]; buffer[0][2] = data[730]; buffer[0][3] = data[731]; buffer[0][4] = data[732]; buffer[0][5] = data[733]; buffer[0][6] = data[734]; buffer[0][7] = data[735]; + + } + if (partition == 92) { + buffer[0][0] = data[736]; buffer[0][1] = data[737]; buffer[0][2] = data[738]; buffer[0][3] = data[739]; buffer[0][4] = data[740]; buffer[0][5] = data[741]; buffer[0][6] = data[742]; buffer[0][7] = data[743]; + + } + if (partition == 93) { + buffer[0][0] = data[744]; 
buffer[0][1] = data[745]; buffer[0][2] = data[746]; buffer[0][3] = data[747]; buffer[0][4] = data[748]; buffer[0][5] = data[749]; buffer[0][6] = data[750]; buffer[0][7] = data[751];
+
+        }
+        if (partition == 94) {
+            buffer[0][0] = data[752]; buffer[0][1] = data[753]; buffer[0][2] = data[754]; buffer[0][3] = data[755]; buffer[0][4] = data[756]; buffer[0][5] = data[757]; buffer[0][6] = data[758]; buffer[0][7] = data[759];
+
+        }
+        if (partition == 95) {
+            buffer[0][0] = data[760]; buffer[0][1] = data[761]; buffer[0][2] = data[762]; buffer[0][3] = data[763]; buffer[0][4] = data[764]; buffer[0][5] = data[765]; buffer[0][6] = data[766]; buffer[0][7] = data[767];
+
+        }
+        if (partition == 96) {
+            buffer[0][0] = data[768]; buffer[0][1] = data[769]; buffer[0][2] = data[770]; buffer[0][3] = data[771]; buffer[0][4] = data[772]; buffer[0][5] = data[773]; buffer[0][6] = data[774]; buffer[0][7] = data[775];
+
+        }
+        if (partition == 97) {
+            buffer[0][0] = data[776]; buffer[0][1] = data[777]; buffer[0][2] = data[778]; buffer[0][3] = data[779]; buffer[0][4] = data[780]; buffer[0][5] = data[781]; buffer[0][6] = data[782]; buffer[0][7] = data[783];
+
+        }
+        if (partition == 98) {
+            buffer[0][0] = data[784]; buffer[0][1] = data[785]; buffer[0][2] = data[786]; buffer[0][3] = data[787]; buffer[0][4] = data[788]; buffer[0][5] = data[789]; buffer[0][6] = data[790]; buffer[0][7] = data[791];
+
+        }
+        if (partition == 99) {
+            buffer[0][0] = data[792]; buffer[0][1] = data[793]; buffer[0][2] = data[794]; buffer[0][3] = data[795]; buffer[0][4] = data[796]; buffer[0][5] = data[797]; buffer[0][6] = data[798]; buffer[0][7] = data[799];
+
+        }
+    }
+};
+template <class data_T, typename CONFIG_T>
+class fill_buffer_23 : public FillConv1DBuffer<data_T, CONFIG_T> {
+  public:
+    static void fill_buffer(
+        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
+        const unsigned partition
+    ) {
+        if (partition == 0) {
+            buffer[0][0] = data[0]; buffer[0][1] = data[1]; buffer[0][2] = data[2]; buffer[0][3] = data[3]; buffer[0][4] = data[4]; buffer[0][5] = data[5]; buffer[0][6] = data[6]; buffer[0][7] = data[7]; buffer[0][8] = data[8]; buffer[0][9] = data[9]; buffer[0][10] = data[10]; buffer[0][11] = data[11];
+
+        }
+        if (partition == 1) {
+            buffer[0][0] = data[12]; buffer[0][1] = data[13]; buffer[0][2] = data[14]; buffer[0][3] = data[15]; buffer[0][4] = data[16]; buffer[0][5] = data[17]; buffer[0][6] = data[18]; buffer[0][7] = data[19]; buffer[0][8] = data[20]; buffer[0][9] = data[21]; buffer[0][10] = data[22]; buffer[0][11] = data[23];
+
+        }
+        if (partition == 2) {
+            buffer[0][0] = data[24]; buffer[0][1] = data[25]; buffer[0][2] = data[26]; buffer[0][3] = data[27]; buffer[0][4] = data[28]; buffer[0][5] = data[29]; buffer[0][6] = data[30]; buffer[0][7] = data[31]; buffer[0][8] = data[32]; buffer[0][9] = data[33]; buffer[0][10] = data[34]; buffer[0][11] = data[35];
+
+        }
+        if (partition == 3) {
+            buffer[0][0] = data[36]; buffer[0][1] = data[37]; buffer[0][2] = data[38]; buffer[0][3] = data[39]; buffer[0][4] = data[40]; buffer[0][5] = data[41]; buffer[0][6] = data[42]; buffer[0][7] = data[43]; buffer[0][8] = data[44]; buffer[0][9] = data[45]; buffer[0][10] = data[46]; buffer[0][11] = data[47];
+
+        }
+        if (partition == 4) {
+            buffer[0][0] = data[48]; buffer[0][1] = data[49]; buffer[0][2] = data[50]; buffer[0][3] = data[51]; buffer[0][4] = data[52]; buffer[0][5] = data[53]; buffer[0][6] = data[54]; buffer[0][7] = data[55]; buffer[0][8] = data[56]; buffer[0][9] = data[57]; buffer[0][10] = data[58]; buffer[0][11] =
data[59]; + + } + if (partition == 5) { + buffer[0][0] = data[60]; buffer[0][1] = data[61]; buffer[0][2] = data[62]; buffer[0][3] = data[63]; buffer[0][4] = data[64]; buffer[0][5] = data[65]; buffer[0][6] = data[66]; buffer[0][7] = data[67]; buffer[0][8] = data[68]; buffer[0][9] = data[69]; buffer[0][10] = data[70]; buffer[0][11] = data[71]; + + } + if (partition == 6) { + buffer[0][0] = data[72]; buffer[0][1] = data[73]; buffer[0][2] = data[74]; buffer[0][3] = data[75]; buffer[0][4] = data[76]; buffer[0][5] = data[77]; buffer[0][6] = data[78]; buffer[0][7] = data[79]; buffer[0][8] = data[80]; buffer[0][9] = data[81]; buffer[0][10] = data[82]; buffer[0][11] = data[83]; + + } + if (partition == 7) { + buffer[0][0] = data[84]; buffer[0][1] = data[85]; buffer[0][2] = data[86]; buffer[0][3] = data[87]; buffer[0][4] = data[88]; buffer[0][5] = data[89]; buffer[0][6] = data[90]; buffer[0][7] = data[91]; buffer[0][8] = data[92]; buffer[0][9] = data[93]; buffer[0][10] = data[94]; buffer[0][11] = data[95]; + + } + if (partition == 8) { + buffer[0][0] = data[96]; buffer[0][1] = data[97]; buffer[0][2] = data[98]; buffer[0][3] = data[99]; buffer[0][4] = data[100]; buffer[0][5] = data[101]; buffer[0][6] = data[102]; buffer[0][7] = data[103]; buffer[0][8] = data[104]; buffer[0][9] = data[105]; buffer[0][10] = data[106]; buffer[0][11] = data[107]; + + } + if (partition == 9) { + buffer[0][0] = data[108]; buffer[0][1] = data[109]; buffer[0][2] = data[110]; buffer[0][3] = data[111]; buffer[0][4] = data[112]; buffer[0][5] = data[113]; buffer[0][6] = data[114]; buffer[0][7] = data[115]; buffer[0][8] = data[116]; buffer[0][9] = data[117]; buffer[0][10] = data[118]; buffer[0][11] = data[119]; + + } + if (partition == 10) { + buffer[0][0] = data[120]; buffer[0][1] = data[121]; buffer[0][2] = data[122]; buffer[0][3] = data[123]; buffer[0][4] = data[124]; buffer[0][5] = data[125]; buffer[0][6] = data[126]; buffer[0][7] = data[127]; buffer[0][8] = data[128]; buffer[0][9] = data[129]; buffer[0][10] = data[130]; buffer[0][11] = data[131]; + + } + if (partition == 11) { + buffer[0][0] = data[132]; buffer[0][1] = data[133]; buffer[0][2] = data[134]; buffer[0][3] = data[135]; buffer[0][4] = data[136]; buffer[0][5] = data[137]; buffer[0][6] = data[138]; buffer[0][7] = data[139]; buffer[0][8] = data[140]; buffer[0][9] = data[141]; buffer[0][10] = data[142]; buffer[0][11] = data[143]; + + } + if (partition == 12) { + buffer[0][0] = data[144]; buffer[0][1] = data[145]; buffer[0][2] = data[146]; buffer[0][3] = data[147]; buffer[0][4] = data[148]; buffer[0][5] = data[149]; buffer[0][6] = data[150]; buffer[0][7] = data[151]; buffer[0][8] = data[152]; buffer[0][9] = data[153]; buffer[0][10] = data[154]; buffer[0][11] = data[155]; + + } + if (partition == 13) { + buffer[0][0] = data[156]; buffer[0][1] = data[157]; buffer[0][2] = data[158]; buffer[0][3] = data[159]; buffer[0][4] = data[160]; buffer[0][5] = data[161]; buffer[0][6] = data[162]; buffer[0][7] = data[163]; buffer[0][8] = data[164]; buffer[0][9] = data[165]; buffer[0][10] = data[166]; buffer[0][11] = data[167]; + + } + if (partition == 14) { + buffer[0][0] = data[168]; buffer[0][1] = data[169]; buffer[0][2] = data[170]; buffer[0][3] = data[171]; buffer[0][4] = data[172]; buffer[0][5] = data[173]; buffer[0][6] = data[174]; buffer[0][7] = data[175]; buffer[0][8] = data[176]; buffer[0][9] = data[177]; buffer[0][10] = data[178]; buffer[0][11] = data[179]; + + } + if (partition == 15) { + buffer[0][0] = data[180]; buffer[0][1] = data[181]; buffer[0][2] = data[182]; 
buffer[0][3] = data[183]; buffer[0][4] = data[184]; buffer[0][5] = data[185]; buffer[0][6] = data[186]; buffer[0][7] = data[187]; buffer[0][8] = data[188]; buffer[0][9] = data[189]; buffer[0][10] = data[190]; buffer[0][11] = data[191]; + + } + if (partition == 16) { + buffer[0][0] = data[192]; buffer[0][1] = data[193]; buffer[0][2] = data[194]; buffer[0][3] = data[195]; buffer[0][4] = data[196]; buffer[0][5] = data[197]; buffer[0][6] = data[198]; buffer[0][7] = data[199]; buffer[0][8] = data[200]; buffer[0][9] = data[201]; buffer[0][10] = data[202]; buffer[0][11] = data[203]; + + } + if (partition == 17) { + buffer[0][0] = data[204]; buffer[0][1] = data[205]; buffer[0][2] = data[206]; buffer[0][3] = data[207]; buffer[0][4] = data[208]; buffer[0][5] = data[209]; buffer[0][6] = data[210]; buffer[0][7] = data[211]; buffer[0][8] = data[212]; buffer[0][9] = data[213]; buffer[0][10] = data[214]; buffer[0][11] = data[215]; + + } + if (partition == 18) { + buffer[0][0] = data[216]; buffer[0][1] = data[217]; buffer[0][2] = data[218]; buffer[0][3] = data[219]; buffer[0][4] = data[220]; buffer[0][5] = data[221]; buffer[0][6] = data[222]; buffer[0][7] = data[223]; buffer[0][8] = data[224]; buffer[0][9] = data[225]; buffer[0][10] = data[226]; buffer[0][11] = data[227]; + + } + if (partition == 19) { + buffer[0][0] = data[228]; buffer[0][1] = data[229]; buffer[0][2] = data[230]; buffer[0][3] = data[231]; buffer[0][4] = data[232]; buffer[0][5] = data[233]; buffer[0][6] = data[234]; buffer[0][7] = data[235]; buffer[0][8] = data[236]; buffer[0][9] = data[237]; buffer[0][10] = data[238]; buffer[0][11] = data[239]; + + } + if (partition == 20) { + buffer[0][0] = data[240]; buffer[0][1] = data[241]; buffer[0][2] = data[242]; buffer[0][3] = data[243]; buffer[0][4] = data[244]; buffer[0][5] = data[245]; buffer[0][6] = data[246]; buffer[0][7] = data[247]; buffer[0][8] = data[248]; buffer[0][9] = data[249]; buffer[0][10] = data[250]; buffer[0][11] = data[251]; + + } + if (partition == 21) { + buffer[0][0] = data[252]; buffer[0][1] = data[253]; buffer[0][2] = data[254]; buffer[0][3] = data[255]; buffer[0][4] = data[256]; buffer[0][5] = data[257]; buffer[0][6] = data[258]; buffer[0][7] = data[259]; buffer[0][8] = data[260]; buffer[0][9] = data[261]; buffer[0][10] = data[262]; buffer[0][11] = data[263]; + + } + if (partition == 22) { + buffer[0][0] = data[264]; buffer[0][1] = data[265]; buffer[0][2] = data[266]; buffer[0][3] = data[267]; buffer[0][4] = data[268]; buffer[0][5] = data[269]; buffer[0][6] = data[270]; buffer[0][7] = data[271]; buffer[0][8] = data[272]; buffer[0][9] = data[273]; buffer[0][10] = data[274]; buffer[0][11] = data[275]; + + } + if (partition == 23) { + buffer[0][0] = data[276]; buffer[0][1] = data[277]; buffer[0][2] = data[278]; buffer[0][3] = data[279]; buffer[0][4] = data[280]; buffer[0][5] = data[281]; buffer[0][6] = data[282]; buffer[0][7] = data[283]; buffer[0][8] = data[284]; buffer[0][9] = data[285]; buffer[0][10] = data[286]; buffer[0][11] = data[287]; + + } + if (partition == 24) { + buffer[0][0] = data[288]; buffer[0][1] = data[289]; buffer[0][2] = data[290]; buffer[0][3] = data[291]; buffer[0][4] = data[292]; buffer[0][5] = data[293]; buffer[0][6] = data[294]; buffer[0][7] = data[295]; buffer[0][8] = data[296]; buffer[0][9] = data[297]; buffer[0][10] = data[298]; buffer[0][11] = data[299]; + + } + if (partition == 25) { + buffer[0][0] = data[300]; buffer[0][1] = data[301]; buffer[0][2] = data[302]; buffer[0][3] = data[303]; buffer[0][4] = data[304]; buffer[0][5] = data[305]; 
buffer[0][6] = data[306]; buffer[0][7] = data[307]; buffer[0][8] = data[308]; buffer[0][9] = data[309]; buffer[0][10] = data[310]; buffer[0][11] = data[311]; + + } + if (partition == 26) { + buffer[0][0] = data[312]; buffer[0][1] = data[313]; buffer[0][2] = data[314]; buffer[0][3] = data[315]; buffer[0][4] = data[316]; buffer[0][5] = data[317]; buffer[0][6] = data[318]; buffer[0][7] = data[319]; buffer[0][8] = data[320]; buffer[0][9] = data[321]; buffer[0][10] = data[322]; buffer[0][11] = data[323]; + + } + if (partition == 27) { + buffer[0][0] = data[324]; buffer[0][1] = data[325]; buffer[0][2] = data[326]; buffer[0][3] = data[327]; buffer[0][4] = data[328]; buffer[0][5] = data[329]; buffer[0][6] = data[330]; buffer[0][7] = data[331]; buffer[0][8] = data[332]; buffer[0][9] = data[333]; buffer[0][10] = data[334]; buffer[0][11] = data[335]; + + } + if (partition == 28) { + buffer[0][0] = data[336]; buffer[0][1] = data[337]; buffer[0][2] = data[338]; buffer[0][3] = data[339]; buffer[0][4] = data[340]; buffer[0][5] = data[341]; buffer[0][6] = data[342]; buffer[0][7] = data[343]; buffer[0][8] = data[344]; buffer[0][9] = data[345]; buffer[0][10] = data[346]; buffer[0][11] = data[347]; + + } + if (partition == 29) { + buffer[0][0] = data[348]; buffer[0][1] = data[349]; buffer[0][2] = data[350]; buffer[0][3] = data[351]; buffer[0][4] = data[352]; buffer[0][5] = data[353]; buffer[0][6] = data[354]; buffer[0][7] = data[355]; buffer[0][8] = data[356]; buffer[0][9] = data[357]; buffer[0][10] = data[358]; buffer[0][11] = data[359]; + + } + if (partition == 30) { + buffer[0][0] = data[360]; buffer[0][1] = data[361]; buffer[0][2] = data[362]; buffer[0][3] = data[363]; buffer[0][4] = data[364]; buffer[0][5] = data[365]; buffer[0][6] = data[366]; buffer[0][7] = data[367]; buffer[0][8] = data[368]; buffer[0][9] = data[369]; buffer[0][10] = data[370]; buffer[0][11] = data[371]; + + } + if (partition == 31) { + buffer[0][0] = data[372]; buffer[0][1] = data[373]; buffer[0][2] = data[374]; buffer[0][3] = data[375]; buffer[0][4] = data[376]; buffer[0][5] = data[377]; buffer[0][6] = data[378]; buffer[0][7] = data[379]; buffer[0][8] = data[380]; buffer[0][9] = data[381]; buffer[0][10] = data[382]; buffer[0][11] = data[383]; + + } + if (partition == 32) { + buffer[0][0] = data[384]; buffer[0][1] = data[385]; buffer[0][2] = data[386]; buffer[0][3] = data[387]; buffer[0][4] = data[388]; buffer[0][5] = data[389]; buffer[0][6] = data[390]; buffer[0][7] = data[391]; buffer[0][8] = data[392]; buffer[0][9] = data[393]; buffer[0][10] = data[394]; buffer[0][11] = data[395]; + + } + if (partition == 33) { + buffer[0][0] = data[396]; buffer[0][1] = data[397]; buffer[0][2] = data[398]; buffer[0][3] = data[399]; buffer[0][4] = data[400]; buffer[0][5] = data[401]; buffer[0][6] = data[402]; buffer[0][7] = data[403]; buffer[0][8] = data[404]; buffer[0][9] = data[405]; buffer[0][10] = data[406]; buffer[0][11] = data[407]; + + } + if (partition == 34) { + buffer[0][0] = data[408]; buffer[0][1] = data[409]; buffer[0][2] = data[410]; buffer[0][3] = data[411]; buffer[0][4] = data[412]; buffer[0][5] = data[413]; buffer[0][6] = data[414]; buffer[0][7] = data[415]; buffer[0][8] = data[416]; buffer[0][9] = data[417]; buffer[0][10] = data[418]; buffer[0][11] = data[419]; + + } + if (partition == 35) { + buffer[0][0] = data[420]; buffer[0][1] = data[421]; buffer[0][2] = data[422]; buffer[0][3] = data[423]; buffer[0][4] = data[424]; buffer[0][5] = data[425]; buffer[0][6] = data[426]; buffer[0][7] = data[427]; buffer[0][8] = data[428]; 
buffer[0][9] = data[429]; buffer[0][10] = data[430]; buffer[0][11] = data[431]; + + } + if (partition == 36) { + buffer[0][0] = data[432]; buffer[0][1] = data[433]; buffer[0][2] = data[434]; buffer[0][3] = data[435]; buffer[0][4] = data[436]; buffer[0][5] = data[437]; buffer[0][6] = data[438]; buffer[0][7] = data[439]; buffer[0][8] = data[440]; buffer[0][9] = data[441]; buffer[0][10] = data[442]; buffer[0][11] = data[443]; + + } + if (partition == 37) { + buffer[0][0] = data[444]; buffer[0][1] = data[445]; buffer[0][2] = data[446]; buffer[0][3] = data[447]; buffer[0][4] = data[448]; buffer[0][5] = data[449]; buffer[0][6] = data[450]; buffer[0][7] = data[451]; buffer[0][8] = data[452]; buffer[0][9] = data[453]; buffer[0][10] = data[454]; buffer[0][11] = data[455]; + + } + if (partition == 38) { + buffer[0][0] = data[456]; buffer[0][1] = data[457]; buffer[0][2] = data[458]; buffer[0][3] = data[459]; buffer[0][4] = data[460]; buffer[0][5] = data[461]; buffer[0][6] = data[462]; buffer[0][7] = data[463]; buffer[0][8] = data[464]; buffer[0][9] = data[465]; buffer[0][10] = data[466]; buffer[0][11] = data[467]; + + } + if (partition == 39) { + buffer[0][0] = data[468]; buffer[0][1] = data[469]; buffer[0][2] = data[470]; buffer[0][3] = data[471]; buffer[0][4] = data[472]; buffer[0][5] = data[473]; buffer[0][6] = data[474]; buffer[0][7] = data[475]; buffer[0][8] = data[476]; buffer[0][9] = data[477]; buffer[0][10] = data[478]; buffer[0][11] = data[479]; + + } + if (partition == 40) { + buffer[0][0] = data[480]; buffer[0][1] = data[481]; buffer[0][2] = data[482]; buffer[0][3] = data[483]; buffer[0][4] = data[484]; buffer[0][5] = data[485]; buffer[0][6] = data[486]; buffer[0][7] = data[487]; buffer[0][8] = data[488]; buffer[0][9] = data[489]; buffer[0][10] = data[490]; buffer[0][11] = data[491]; + + } + if (partition == 41) { + buffer[0][0] = data[492]; buffer[0][1] = data[493]; buffer[0][2] = data[494]; buffer[0][3] = data[495]; buffer[0][4] = data[496]; buffer[0][5] = data[497]; buffer[0][6] = data[498]; buffer[0][7] = data[499]; buffer[0][8] = data[500]; buffer[0][9] = data[501]; buffer[0][10] = data[502]; buffer[0][11] = data[503]; + + } + if (partition == 42) { + buffer[0][0] = data[504]; buffer[0][1] = data[505]; buffer[0][2] = data[506]; buffer[0][3] = data[507]; buffer[0][4] = data[508]; buffer[0][5] = data[509]; buffer[0][6] = data[510]; buffer[0][7] = data[511]; buffer[0][8] = data[512]; buffer[0][9] = data[513]; buffer[0][10] = data[514]; buffer[0][11] = data[515]; + + } + if (partition == 43) { + buffer[0][0] = data[516]; buffer[0][1] = data[517]; buffer[0][2] = data[518]; buffer[0][3] = data[519]; buffer[0][4] = data[520]; buffer[0][5] = data[521]; buffer[0][6] = data[522]; buffer[0][7] = data[523]; buffer[0][8] = data[524]; buffer[0][9] = data[525]; buffer[0][10] = data[526]; buffer[0][11] = data[527]; + + } + if (partition == 44) { + buffer[0][0] = data[528]; buffer[0][1] = data[529]; buffer[0][2] = data[530]; buffer[0][3] = data[531]; buffer[0][4] = data[532]; buffer[0][5] = data[533]; buffer[0][6] = data[534]; buffer[0][7] = data[535]; buffer[0][8] = data[536]; buffer[0][9] = data[537]; buffer[0][10] = data[538]; buffer[0][11] = data[539]; + + } + if (partition == 45) { + buffer[0][0] = data[540]; buffer[0][1] = data[541]; buffer[0][2] = data[542]; buffer[0][3] = data[543]; buffer[0][4] = data[544]; buffer[0][5] = data[545]; buffer[0][6] = data[546]; buffer[0][7] = data[547]; buffer[0][8] = data[548]; buffer[0][9] = data[549]; buffer[0][10] = data[550]; buffer[0][11] = data[551]; + + 
} + if (partition == 46) { + buffer[0][0] = data[552]; buffer[0][1] = data[553]; buffer[0][2] = data[554]; buffer[0][3] = data[555]; buffer[0][4] = data[556]; buffer[0][5] = data[557]; buffer[0][6] = data[558]; buffer[0][7] = data[559]; buffer[0][8] = data[560]; buffer[0][9] = data[561]; buffer[0][10] = data[562]; buffer[0][11] = data[563]; + + } + if (partition == 47) { + buffer[0][0] = data[564]; buffer[0][1] = data[565]; buffer[0][2] = data[566]; buffer[0][3] = data[567]; buffer[0][4] = data[568]; buffer[0][5] = data[569]; buffer[0][6] = data[570]; buffer[0][7] = data[571]; buffer[0][8] = data[572]; buffer[0][9] = data[573]; buffer[0][10] = data[574]; buffer[0][11] = data[575]; + + } + if (partition == 48) { + buffer[0][0] = data[576]; buffer[0][1] = data[577]; buffer[0][2] = data[578]; buffer[0][3] = data[579]; buffer[0][4] = data[580]; buffer[0][5] = data[581]; buffer[0][6] = data[582]; buffer[0][7] = data[583]; buffer[0][8] = data[584]; buffer[0][9] = data[585]; buffer[0][10] = data[586]; buffer[0][11] = data[587]; + + } + if (partition == 49) { + buffer[0][0] = data[588]; buffer[0][1] = data[589]; buffer[0][2] = data[590]; buffer[0][3] = data[591]; buffer[0][4] = data[592]; buffer[0][5] = data[593]; buffer[0][6] = data[594]; buffer[0][7] = data[595]; buffer[0][8] = data[596]; buffer[0][9] = data[597]; buffer[0][10] = data[598]; buffer[0][11] = data[599]; + + } + if (partition == 50) { + buffer[0][0] = data[600]; buffer[0][1] = data[601]; buffer[0][2] = data[602]; buffer[0][3] = data[603]; buffer[0][4] = data[604]; buffer[0][5] = data[605]; buffer[0][6] = data[606]; buffer[0][7] = data[607]; buffer[0][8] = data[608]; buffer[0][9] = data[609]; buffer[0][10] = data[610]; buffer[0][11] = data[611]; + + } + if (partition == 51) { + buffer[0][0] = data[612]; buffer[0][1] = data[613]; buffer[0][2] = data[614]; buffer[0][3] = data[615]; buffer[0][4] = data[616]; buffer[0][5] = data[617]; buffer[0][6] = data[618]; buffer[0][7] = data[619]; buffer[0][8] = data[620]; buffer[0][9] = data[621]; buffer[0][10] = data[622]; buffer[0][11] = data[623]; + + } + if (partition == 52) { + buffer[0][0] = data[624]; buffer[0][1] = data[625]; buffer[0][2] = data[626]; buffer[0][3] = data[627]; buffer[0][4] = data[628]; buffer[0][5] = data[629]; buffer[0][6] = data[630]; buffer[0][7] = data[631]; buffer[0][8] = data[632]; buffer[0][9] = data[633]; buffer[0][10] = data[634]; buffer[0][11] = data[635]; + + } + if (partition == 53) { + buffer[0][0] = data[636]; buffer[0][1] = data[637]; buffer[0][2] = data[638]; buffer[0][3] = data[639]; buffer[0][4] = data[640]; buffer[0][5] = data[641]; buffer[0][6] = data[642]; buffer[0][7] = data[643]; buffer[0][8] = data[644]; buffer[0][9] = data[645]; buffer[0][10] = data[646]; buffer[0][11] = data[647]; + + } + if (partition == 54) { + buffer[0][0] = data[648]; buffer[0][1] = data[649]; buffer[0][2] = data[650]; buffer[0][3] = data[651]; buffer[0][4] = data[652]; buffer[0][5] = data[653]; buffer[0][6] = data[654]; buffer[0][7] = data[655]; buffer[0][8] = data[656]; buffer[0][9] = data[657]; buffer[0][10] = data[658]; buffer[0][11] = data[659]; + + } + if (partition == 55) { + buffer[0][0] = data[660]; buffer[0][1] = data[661]; buffer[0][2] = data[662]; buffer[0][3] = data[663]; buffer[0][4] = data[664]; buffer[0][5] = data[665]; buffer[0][6] = data[666]; buffer[0][7] = data[667]; buffer[0][8] = data[668]; buffer[0][9] = data[669]; buffer[0][10] = data[670]; buffer[0][11] = data[671]; + + } + if (partition == 56) { + buffer[0][0] = data[672]; buffer[0][1] = data[673]; 
buffer[0][2] = data[674]; buffer[0][3] = data[675]; buffer[0][4] = data[676]; buffer[0][5] = data[677]; buffer[0][6] = data[678]; buffer[0][7] = data[679]; buffer[0][8] = data[680]; buffer[0][9] = data[681]; buffer[0][10] = data[682]; buffer[0][11] = data[683]; + + } + if (partition == 57) { + buffer[0][0] = data[684]; buffer[0][1] = data[685]; buffer[0][2] = data[686]; buffer[0][3] = data[687]; buffer[0][4] = data[688]; buffer[0][5] = data[689]; buffer[0][6] = data[690]; buffer[0][7] = data[691]; buffer[0][8] = data[692]; buffer[0][9] = data[693]; buffer[0][10] = data[694]; buffer[0][11] = data[695]; + + } + if (partition == 58) { + buffer[0][0] = data[696]; buffer[0][1] = data[697]; buffer[0][2] = data[698]; buffer[0][3] = data[699]; buffer[0][4] = data[700]; buffer[0][5] = data[701]; buffer[0][6] = data[702]; buffer[0][7] = data[703]; buffer[0][8] = data[704]; buffer[0][9] = data[705]; buffer[0][10] = data[706]; buffer[0][11] = data[707]; + + } + if (partition == 59) { + buffer[0][0] = data[708]; buffer[0][1] = data[709]; buffer[0][2] = data[710]; buffer[0][3] = data[711]; buffer[0][4] = data[712]; buffer[0][5] = data[713]; buffer[0][6] = data[714]; buffer[0][7] = data[715]; buffer[0][8] = data[716]; buffer[0][9] = data[717]; buffer[0][10] = data[718]; buffer[0][11] = data[719]; + + } + if (partition == 60) { + buffer[0][0] = data[720]; buffer[0][1] = data[721]; buffer[0][2] = data[722]; buffer[0][3] = data[723]; buffer[0][4] = data[724]; buffer[0][5] = data[725]; buffer[0][6] = data[726]; buffer[0][7] = data[727]; buffer[0][8] = data[728]; buffer[0][9] = data[729]; buffer[0][10] = data[730]; buffer[0][11] = data[731]; + + } + if (partition == 61) { + buffer[0][0] = data[732]; buffer[0][1] = data[733]; buffer[0][2] = data[734]; buffer[0][3] = data[735]; buffer[0][4] = data[736]; buffer[0][5] = data[737]; buffer[0][6] = data[738]; buffer[0][7] = data[739]; buffer[0][8] = data[740]; buffer[0][9] = data[741]; buffer[0][10] = data[742]; buffer[0][11] = data[743]; + + } + if (partition == 62) { + buffer[0][0] = data[744]; buffer[0][1] = data[745]; buffer[0][2] = data[746]; buffer[0][3] = data[747]; buffer[0][4] = data[748]; buffer[0][5] = data[749]; buffer[0][6] = data[750]; buffer[0][7] = data[751]; buffer[0][8] = data[752]; buffer[0][9] = data[753]; buffer[0][10] = data[754]; buffer[0][11] = data[755]; + + } + if (partition == 63) { + buffer[0][0] = data[756]; buffer[0][1] = data[757]; buffer[0][2] = data[758]; buffer[0][3] = data[759]; buffer[0][4] = data[760]; buffer[0][5] = data[761]; buffer[0][6] = data[762]; buffer[0][7] = data[763]; buffer[0][8] = data[764]; buffer[0][9] = data[765]; buffer[0][10] = data[766]; buffer[0][11] = data[767]; + + } + if (partition == 64) { + buffer[0][0] = data[768]; buffer[0][1] = data[769]; buffer[0][2] = data[770]; buffer[0][3] = data[771]; buffer[0][4] = data[772]; buffer[0][5] = data[773]; buffer[0][6] = data[774]; buffer[0][7] = data[775]; buffer[0][8] = data[776]; buffer[0][9] = data[777]; buffer[0][10] = data[778]; buffer[0][11] = data[779]; + + } + if (partition == 65) { + buffer[0][0] = data[780]; buffer[0][1] = data[781]; buffer[0][2] = data[782]; buffer[0][3] = data[783]; buffer[0][4] = data[784]; buffer[0][5] = data[785]; buffer[0][6] = data[786]; buffer[0][7] = data[787]; buffer[0][8] = data[788]; buffer[0][9] = data[789]; buffer[0][10] = data[790]; buffer[0][11] = data[791]; + + } + if (partition == 66) { + buffer[0][0] = data[792]; buffer[0][1] = data[793]; buffer[0][2] = data[794]; buffer[0][3] = data[795]; buffer[0][4] = data[796]; 
buffer[0][5] = data[797]; buffer[0][6] = data[798]; buffer[0][7] = data[799]; buffer[0][8] = data[800]; buffer[0][9] = data[801]; buffer[0][10] = data[802]; buffer[0][11] = data[803]; + + } + if (partition == 67) { + buffer[0][0] = data[804]; buffer[0][1] = data[805]; buffer[0][2] = data[806]; buffer[0][3] = data[807]; buffer[0][4] = data[808]; buffer[0][5] = data[809]; buffer[0][6] = data[810]; buffer[0][7] = data[811]; buffer[0][8] = data[812]; buffer[0][9] = data[813]; buffer[0][10] = data[814]; buffer[0][11] = data[815]; + + } + if (partition == 68) { + buffer[0][0] = data[816]; buffer[0][1] = data[817]; buffer[0][2] = data[818]; buffer[0][3] = data[819]; buffer[0][4] = data[820]; buffer[0][5] = data[821]; buffer[0][6] = data[822]; buffer[0][7] = data[823]; buffer[0][8] = data[824]; buffer[0][9] = data[825]; buffer[0][10] = data[826]; buffer[0][11] = data[827]; + + } + if (partition == 69) { + buffer[0][0] = data[828]; buffer[0][1] = data[829]; buffer[0][2] = data[830]; buffer[0][3] = data[831]; buffer[0][4] = data[832]; buffer[0][5] = data[833]; buffer[0][6] = data[834]; buffer[0][7] = data[835]; buffer[0][8] = data[836]; buffer[0][9] = data[837]; buffer[0][10] = data[838]; buffer[0][11] = data[839]; + + } + if (partition == 70) { + buffer[0][0] = data[840]; buffer[0][1] = data[841]; buffer[0][2] = data[842]; buffer[0][3] = data[843]; buffer[0][4] = data[844]; buffer[0][5] = data[845]; buffer[0][6] = data[846]; buffer[0][7] = data[847]; buffer[0][8] = data[848]; buffer[0][9] = data[849]; buffer[0][10] = data[850]; buffer[0][11] = data[851]; + + } + if (partition == 71) { + buffer[0][0] = data[852]; buffer[0][1] = data[853]; buffer[0][2] = data[854]; buffer[0][3] = data[855]; buffer[0][4] = data[856]; buffer[0][5] = data[857]; buffer[0][6] = data[858]; buffer[0][7] = data[859]; buffer[0][8] = data[860]; buffer[0][9] = data[861]; buffer[0][10] = data[862]; buffer[0][11] = data[863]; + + } + if (partition == 72) { + buffer[0][0] = data[864]; buffer[0][1] = data[865]; buffer[0][2] = data[866]; buffer[0][3] = data[867]; buffer[0][4] = data[868]; buffer[0][5] = data[869]; buffer[0][6] = data[870]; buffer[0][7] = data[871]; buffer[0][8] = data[872]; buffer[0][9] = data[873]; buffer[0][10] = data[874]; buffer[0][11] = data[875]; + + } + if (partition == 73) { + buffer[0][0] = data[876]; buffer[0][1] = data[877]; buffer[0][2] = data[878]; buffer[0][3] = data[879]; buffer[0][4] = data[880]; buffer[0][5] = data[881]; buffer[0][6] = data[882]; buffer[0][7] = data[883]; buffer[0][8] = data[884]; buffer[0][9] = data[885]; buffer[0][10] = data[886]; buffer[0][11] = data[887]; + + } + if (partition == 74) { + buffer[0][0] = data[888]; buffer[0][1] = data[889]; buffer[0][2] = data[890]; buffer[0][3] = data[891]; buffer[0][4] = data[892]; buffer[0][5] = data[893]; buffer[0][6] = data[894]; buffer[0][7] = data[895]; buffer[0][8] = data[896]; buffer[0][9] = data[897]; buffer[0][10] = data[898]; buffer[0][11] = data[899]; + + } + if (partition == 75) { + buffer[0][0] = data[900]; buffer[0][1] = data[901]; buffer[0][2] = data[902]; buffer[0][3] = data[903]; buffer[0][4] = data[904]; buffer[0][5] = data[905]; buffer[0][6] = data[906]; buffer[0][7] = data[907]; buffer[0][8] = data[908]; buffer[0][9] = data[909]; buffer[0][10] = data[910]; buffer[0][11] = data[911]; + + } + if (partition == 76) { + buffer[0][0] = data[912]; buffer[0][1] = data[913]; buffer[0][2] = data[914]; buffer[0][3] = data[915]; buffer[0][4] = data[916]; buffer[0][5] = data[917]; buffer[0][6] = data[918]; buffer[0][7] = data[919]; 
buffer[0][8] = data[920]; buffer[0][9] = data[921]; buffer[0][10] = data[922]; buffer[0][11] = data[923]; + + } + if (partition == 77) { + buffer[0][0] = data[924]; buffer[0][1] = data[925]; buffer[0][2] = data[926]; buffer[0][3] = data[927]; buffer[0][4] = data[928]; buffer[0][5] = data[929]; buffer[0][6] = data[930]; buffer[0][7] = data[931]; buffer[0][8] = data[932]; buffer[0][9] = data[933]; buffer[0][10] = data[934]; buffer[0][11] = data[935]; + + } + if (partition == 78) { + buffer[0][0] = data[936]; buffer[0][1] = data[937]; buffer[0][2] = data[938]; buffer[0][3] = data[939]; buffer[0][4] = data[940]; buffer[0][5] = data[941]; buffer[0][6] = data[942]; buffer[0][7] = data[943]; buffer[0][8] = data[944]; buffer[0][9] = data[945]; buffer[0][10] = data[946]; buffer[0][11] = data[947]; + + } + if (partition == 79) { + buffer[0][0] = data[948]; buffer[0][1] = data[949]; buffer[0][2] = data[950]; buffer[0][3] = data[951]; buffer[0][4] = data[952]; buffer[0][5] = data[953]; buffer[0][6] = data[954]; buffer[0][7] = data[955]; buffer[0][8] = data[956]; buffer[0][9] = data[957]; buffer[0][10] = data[958]; buffer[0][11] = data[959]; + + } + if (partition == 80) { + buffer[0][0] = data[960]; buffer[0][1] = data[961]; buffer[0][2] = data[962]; buffer[0][3] = data[963]; buffer[0][4] = data[964]; buffer[0][5] = data[965]; buffer[0][6] = data[966]; buffer[0][7] = data[967]; buffer[0][8] = data[968]; buffer[0][9] = data[969]; buffer[0][10] = data[970]; buffer[0][11] = data[971]; + + } + if (partition == 81) { + buffer[0][0] = data[972]; buffer[0][1] = data[973]; buffer[0][2] = data[974]; buffer[0][3] = data[975]; buffer[0][4] = data[976]; buffer[0][5] = data[977]; buffer[0][6] = data[978]; buffer[0][7] = data[979]; buffer[0][8] = data[980]; buffer[0][9] = data[981]; buffer[0][10] = data[982]; buffer[0][11] = data[983]; + + } + if (partition == 82) { + buffer[0][0] = data[984]; buffer[0][1] = data[985]; buffer[0][2] = data[986]; buffer[0][3] = data[987]; buffer[0][4] = data[988]; buffer[0][5] = data[989]; buffer[0][6] = data[990]; buffer[0][7] = data[991]; buffer[0][8] = data[992]; buffer[0][9] = data[993]; buffer[0][10] = data[994]; buffer[0][11] = data[995]; + + } + if (partition == 83) { + buffer[0][0] = data[996]; buffer[0][1] = data[997]; buffer[0][2] = data[998]; buffer[0][3] = data[999]; buffer[0][4] = data[1000]; buffer[0][5] = data[1001]; buffer[0][6] = data[1002]; buffer[0][7] = data[1003]; buffer[0][8] = data[1004]; buffer[0][9] = data[1005]; buffer[0][10] = data[1006]; buffer[0][11] = data[1007]; + + } + if (partition == 84) { + buffer[0][0] = data[1008]; buffer[0][1] = data[1009]; buffer[0][2] = data[1010]; buffer[0][3] = data[1011]; buffer[0][4] = data[1012]; buffer[0][5] = data[1013]; buffer[0][6] = data[1014]; buffer[0][7] = data[1015]; buffer[0][8] = data[1016]; buffer[0][9] = data[1017]; buffer[0][10] = data[1018]; buffer[0][11] = data[1019]; + + } + if (partition == 85) { + buffer[0][0] = data[1020]; buffer[0][1] = data[1021]; buffer[0][2] = data[1022]; buffer[0][3] = data[1023]; buffer[0][4] = data[1024]; buffer[0][5] = data[1025]; buffer[0][6] = data[1026]; buffer[0][7] = data[1027]; buffer[0][8] = data[1028]; buffer[0][9] = data[1029]; buffer[0][10] = data[1030]; buffer[0][11] = data[1031]; + + } + if (partition == 86) { + buffer[0][0] = data[1032]; buffer[0][1] = data[1033]; buffer[0][2] = data[1034]; buffer[0][3] = data[1035]; buffer[0][4] = data[1036]; buffer[0][5] = data[1037]; buffer[0][6] = data[1038]; buffer[0][7] = data[1039]; buffer[0][8] = data[1040]; buffer[0][9] = 
data[1041]; buffer[0][10] = data[1042]; buffer[0][11] = data[1043]; + + } + if (partition == 87) { + buffer[0][0] = data[1044]; buffer[0][1] = data[1045]; buffer[0][2] = data[1046]; buffer[0][3] = data[1047]; buffer[0][4] = data[1048]; buffer[0][5] = data[1049]; buffer[0][6] = data[1050]; buffer[0][7] = data[1051]; buffer[0][8] = data[1052]; buffer[0][9] = data[1053]; buffer[0][10] = data[1054]; buffer[0][11] = data[1055]; + + } + if (partition == 88) { + buffer[0][0] = data[1056]; buffer[0][1] = data[1057]; buffer[0][2] = data[1058]; buffer[0][3] = data[1059]; buffer[0][4] = data[1060]; buffer[0][5] = data[1061]; buffer[0][6] = data[1062]; buffer[0][7] = data[1063]; buffer[0][8] = data[1064]; buffer[0][9] = data[1065]; buffer[0][10] = data[1066]; buffer[0][11] = data[1067]; + + } + if (partition == 89) { + buffer[0][0] = data[1068]; buffer[0][1] = data[1069]; buffer[0][2] = data[1070]; buffer[0][3] = data[1071]; buffer[0][4] = data[1072]; buffer[0][5] = data[1073]; buffer[0][6] = data[1074]; buffer[0][7] = data[1075]; buffer[0][8] = data[1076]; buffer[0][9] = data[1077]; buffer[0][10] = data[1078]; buffer[0][11] = data[1079]; + + } + if (partition == 90) { + buffer[0][0] = data[1080]; buffer[0][1] = data[1081]; buffer[0][2] = data[1082]; buffer[0][3] = data[1083]; buffer[0][4] = data[1084]; buffer[0][5] = data[1085]; buffer[0][6] = data[1086]; buffer[0][7] = data[1087]; buffer[0][8] = data[1088]; buffer[0][9] = data[1089]; buffer[0][10] = data[1090]; buffer[0][11] = data[1091]; + + } + if (partition == 91) { + buffer[0][0] = data[1092]; buffer[0][1] = data[1093]; buffer[0][2] = data[1094]; buffer[0][3] = data[1095]; buffer[0][4] = data[1096]; buffer[0][5] = data[1097]; buffer[0][6] = data[1098]; buffer[0][7] = data[1099]; buffer[0][8] = data[1100]; buffer[0][9] = data[1101]; buffer[0][10] = data[1102]; buffer[0][11] = data[1103]; + + } + if (partition == 92) { + buffer[0][0] = data[1104]; buffer[0][1] = data[1105]; buffer[0][2] = data[1106]; buffer[0][3] = data[1107]; buffer[0][4] = data[1108]; buffer[0][5] = data[1109]; buffer[0][6] = data[1110]; buffer[0][7] = data[1111]; buffer[0][8] = data[1112]; buffer[0][9] = data[1113]; buffer[0][10] = data[1114]; buffer[0][11] = data[1115]; + + } + if (partition == 93) { + buffer[0][0] = data[1116]; buffer[0][1] = data[1117]; buffer[0][2] = data[1118]; buffer[0][3] = data[1119]; buffer[0][4] = data[1120]; buffer[0][5] = data[1121]; buffer[0][6] = data[1122]; buffer[0][7] = data[1123]; buffer[0][8] = data[1124]; buffer[0][9] = data[1125]; buffer[0][10] = data[1126]; buffer[0][11] = data[1127]; + + } + if (partition == 94) { + buffer[0][0] = data[1128]; buffer[0][1] = data[1129]; buffer[0][2] = data[1130]; buffer[0][3] = data[1131]; buffer[0][4] = data[1132]; buffer[0][5] = data[1133]; buffer[0][6] = data[1134]; buffer[0][7] = data[1135]; buffer[0][8] = data[1136]; buffer[0][9] = data[1137]; buffer[0][10] = data[1138]; buffer[0][11] = data[1139]; + + } + if (partition == 95) { + buffer[0][0] = data[1140]; buffer[0][1] = data[1141]; buffer[0][2] = data[1142]; buffer[0][3] = data[1143]; buffer[0][4] = data[1144]; buffer[0][5] = data[1145]; buffer[0][6] = data[1146]; buffer[0][7] = data[1147]; buffer[0][8] = data[1148]; buffer[0][9] = data[1149]; buffer[0][10] = data[1150]; buffer[0][11] = data[1151]; + + } + if (partition == 96) { + buffer[0][0] = data[1152]; buffer[0][1] = data[1153]; buffer[0][2] = data[1154]; buffer[0][3] = data[1155]; buffer[0][4] = data[1156]; buffer[0][5] = data[1157]; buffer[0][6] = data[1158]; buffer[0][7] = data[1159]; 
buffer[0][8] = data[1160]; buffer[0][9] = data[1161]; buffer[0][10] = data[1162]; buffer[0][11] = data[1163];
+
+        }
+        if (partition == 97) {
+            buffer[0][0] = data[1164]; buffer[0][1] = data[1165]; buffer[0][2] = data[1166]; buffer[0][3] = data[1167]; buffer[0][4] = data[1168]; buffer[0][5] = data[1169]; buffer[0][6] = data[1170]; buffer[0][7] = data[1171]; buffer[0][8] = data[1172]; buffer[0][9] = data[1173]; buffer[0][10] = data[1174]; buffer[0][11] = data[1175];
+
+        }
+        if (partition == 98) {
+            buffer[0][0] = data[1176]; buffer[0][1] = data[1177]; buffer[0][2] = data[1178]; buffer[0][3] = data[1179]; buffer[0][4] = data[1180]; buffer[0][5] = data[1181]; buffer[0][6] = data[1182]; buffer[0][7] = data[1183]; buffer[0][8] = data[1184]; buffer[0][9] = data[1185]; buffer[0][10] = data[1186]; buffer[0][11] = data[1187];
+
+        }
+        if (partition == 99) {
+            buffer[0][0] = data[1188]; buffer[0][1] = data[1189]; buffer[0][2] = data[1190]; buffer[0][3] = data[1191]; buffer[0][4] = data[1192]; buffer[0][5] = data[1193]; buffer[0][6] = data[1194]; buffer[0][7] = data[1195]; buffer[0][8] = data[1196]; buffer[0][9] = data[1197]; buffer[0][10] = data[1198]; buffer[0][11] = data[1199];
+
+        }
+    }
+};
+template <class data_T, typename CONFIG_T>
+class fill_buffer_24 : public FillConv1DBuffer<data_T, CONFIG_T> {
+  public:
+    static void fill_buffer(
+        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
+        const unsigned partition
+    ) {
+        if (partition == 0) {
+            buffer[0][0] = data[0]; buffer[0][1] = data[1]; buffer[0][2] = data[2]; buffer[0][3] = data[3]; buffer[0][4] = data[4]; buffer[0][5] = data[5]; buffer[0][6] = data[6]; buffer[0][7] = data[7]; buffer[0][8] = data[8]; buffer[0][9] = data[9]; buffer[0][10] = data[10]; buffer[0][11] = data[11]; buffer[0][12] = data[12]; buffer[0][13] = data[13]; buffer[0][14] = data[14]; buffer[0][15] = data[15]; buffer[0][16] = data[16]; buffer[0][17] = data[17]; buffer[0][18] = data[18]; buffer[0][19] = data[19]; buffer[0][20] = data[20]; buffer[0][21] = data[21]; buffer[0][22] = data[22]; buffer[0][23] = data[23]; buffer[0][24] = data[24]; buffer[0][25] = data[25]; buffer[0][26] = data[26]; buffer[0][27] = data[27]; buffer[0][28] = data[28]; buffer[0][29] = data[29]; buffer[0][30] = data[30]; buffer[0][31] = data[31]; buffer[0][32] = data[32]; buffer[0][33] = data[33]; buffer[0][34] = data[34]; buffer[0][35] = data[35];
+
+        }
+        if (partition == 1) {
+            buffer[0][0] = data[36]; buffer[0][1] = data[37]; buffer[0][2] = data[38]; buffer[0][3] = data[39]; buffer[0][4] = data[40]; buffer[0][5] = data[41]; buffer[0][6] = data[42]; buffer[0][7] = data[43]; buffer[0][8] = data[44]; buffer[0][9] = data[45]; buffer[0][10] = data[46]; buffer[0][11] = data[47]; buffer[0][12] = data[48]; buffer[0][13] = data[49]; buffer[0][14] = data[50]; buffer[0][15] = data[51]; buffer[0][16] = data[52]; buffer[0][17] = data[53]; buffer[0][18] = data[54]; buffer[0][19] = data[55]; buffer[0][20] = data[56]; buffer[0][21] = data[57]; buffer[0][22] = data[58]; buffer[0][23] = data[59]; buffer[0][24] = data[60]; buffer[0][25] = data[61]; buffer[0][26] = data[62]; buffer[0][27] = data[63]; buffer[0][28] = data[64]; buffer[0][29] = data[65]; buffer[0][30] = data[66]; buffer[0][31] = data[67]; buffer[0][32] = data[68]; buffer[0][33] = data[69]; buffer[0][34] = data[70]; buffer[0][35] = data[71];
+
+        }
+        if (partition == 2) {
+            buffer[0][0] = data[72]; buffer[0][1] = data[73]; buffer[0][2] = data[74]; buffer[0][3] = data[75]; buffer[0][4] = data[76]; buffer[0][5] = data[77];
buffer[0][6] = data[78]; buffer[0][7] = data[79]; buffer[0][8] = data[80]; buffer[0][9] = data[81]; buffer[0][10] = data[82]; buffer[0][11] = data[83]; buffer[0][12] = data[84]; buffer[0][13] = data[85]; buffer[0][14] = data[86]; buffer[0][15] = data[87]; buffer[0][16] = data[88]; buffer[0][17] = data[89]; buffer[0][18] = data[90]; buffer[0][19] = data[91]; buffer[0][20] = data[92]; buffer[0][21] = data[93]; buffer[0][22] = data[94]; buffer[0][23] = data[95]; buffer[0][24] = data[96]; buffer[0][25] = data[97]; buffer[0][26] = data[98]; buffer[0][27] = data[99]; buffer[0][28] = data[100]; buffer[0][29] = data[101]; buffer[0][30] = data[102]; buffer[0][31] = data[103]; buffer[0][32] = data[104]; buffer[0][33] = data[105]; buffer[0][34] = data[106]; buffer[0][35] = data[107]; + + } + if (partition == 3) { + buffer[0][0] = data[108]; buffer[0][1] = data[109]; buffer[0][2] = data[110]; buffer[0][3] = data[111]; buffer[0][4] = data[112]; buffer[0][5] = data[113]; buffer[0][6] = data[114]; buffer[0][7] = data[115]; buffer[0][8] = data[116]; buffer[0][9] = data[117]; buffer[0][10] = data[118]; buffer[0][11] = data[119]; buffer[0][12] = data[120]; buffer[0][13] = data[121]; buffer[0][14] = data[122]; buffer[0][15] = data[123]; buffer[0][16] = data[124]; buffer[0][17] = data[125]; buffer[0][18] = data[126]; buffer[0][19] = data[127]; buffer[0][20] = data[128]; buffer[0][21] = data[129]; buffer[0][22] = data[130]; buffer[0][23] = data[131]; buffer[0][24] = data[132]; buffer[0][25] = data[133]; buffer[0][26] = data[134]; buffer[0][27] = data[135]; buffer[0][28] = data[136]; buffer[0][29] = data[137]; buffer[0][30] = data[138]; buffer[0][31] = data[139]; buffer[0][32] = data[140]; buffer[0][33] = data[141]; buffer[0][34] = data[142]; buffer[0][35] = data[143]; + + } + if (partition == 4) { + buffer[0][0] = data[144]; buffer[0][1] = data[145]; buffer[0][2] = data[146]; buffer[0][3] = data[147]; buffer[0][4] = data[148]; buffer[0][5] = data[149]; buffer[0][6] = data[150]; buffer[0][7] = data[151]; buffer[0][8] = data[152]; buffer[0][9] = data[153]; buffer[0][10] = data[154]; buffer[0][11] = data[155]; buffer[0][12] = data[156]; buffer[0][13] = data[157]; buffer[0][14] = data[158]; buffer[0][15] = data[159]; buffer[0][16] = data[160]; buffer[0][17] = data[161]; buffer[0][18] = data[162]; buffer[0][19] = data[163]; buffer[0][20] = data[164]; buffer[0][21] = data[165]; buffer[0][22] = data[166]; buffer[0][23] = data[167]; buffer[0][24] = data[168]; buffer[0][25] = data[169]; buffer[0][26] = data[170]; buffer[0][27] = data[171]; buffer[0][28] = data[172]; buffer[0][29] = data[173]; buffer[0][30] = data[174]; buffer[0][31] = data[175]; buffer[0][32] = data[176]; buffer[0][33] = data[177]; buffer[0][34] = data[178]; buffer[0][35] = data[179]; + + } + if (partition == 5) { + buffer[0][0] = data[180]; buffer[0][1] = data[181]; buffer[0][2] = data[182]; buffer[0][3] = data[183]; buffer[0][4] = data[184]; buffer[0][5] = data[185]; buffer[0][6] = data[186]; buffer[0][7] = data[187]; buffer[0][8] = data[188]; buffer[0][9] = data[189]; buffer[0][10] = data[190]; buffer[0][11] = data[191]; buffer[0][12] = data[192]; buffer[0][13] = data[193]; buffer[0][14] = data[194]; buffer[0][15] = data[195]; buffer[0][16] = data[196]; buffer[0][17] = data[197]; buffer[0][18] = data[198]; buffer[0][19] = data[199]; buffer[0][20] = data[200]; buffer[0][21] = data[201]; buffer[0][22] = data[202]; buffer[0][23] = data[203]; buffer[0][24] = data[204]; buffer[0][25] = data[205]; buffer[0][26] = data[206]; buffer[0][27] = data[207]; 
buffer[0][28] = data[208]; buffer[0][29] = data[209]; buffer[0][30] = data[210]; buffer[0][31] = data[211]; buffer[0][32] = data[212]; buffer[0][33] = data[213]; buffer[0][34] = data[214]; buffer[0][35] = data[215]; + + } + if (partition == 6) { + buffer[0][0] = data[216]; buffer[0][1] = data[217]; buffer[0][2] = data[218]; buffer[0][3] = data[219]; buffer[0][4] = data[220]; buffer[0][5] = data[221]; buffer[0][6] = data[222]; buffer[0][7] = data[223]; buffer[0][8] = data[224]; buffer[0][9] = data[225]; buffer[0][10] = data[226]; buffer[0][11] = data[227]; buffer[0][12] = data[228]; buffer[0][13] = data[229]; buffer[0][14] = data[230]; buffer[0][15] = data[231]; buffer[0][16] = data[232]; buffer[0][17] = data[233]; buffer[0][18] = data[234]; buffer[0][19] = data[235]; buffer[0][20] = data[236]; buffer[0][21] = data[237]; buffer[0][22] = data[238]; buffer[0][23] = data[239]; buffer[0][24] = data[240]; buffer[0][25] = data[241]; buffer[0][26] = data[242]; buffer[0][27] = data[243]; buffer[0][28] = data[244]; buffer[0][29] = data[245]; buffer[0][30] = data[246]; buffer[0][31] = data[247]; buffer[0][32] = data[248]; buffer[0][33] = data[249]; buffer[0][34] = data[250]; buffer[0][35] = data[251]; + + } + if (partition == 7) { + buffer[0][0] = data[252]; buffer[0][1] = data[253]; buffer[0][2] = data[254]; buffer[0][3] = data[255]; buffer[0][4] = data[256]; buffer[0][5] = data[257]; buffer[0][6] = data[258]; buffer[0][7] = data[259]; buffer[0][8] = data[260]; buffer[0][9] = data[261]; buffer[0][10] = data[262]; buffer[0][11] = data[263]; buffer[0][12] = data[264]; buffer[0][13] = data[265]; buffer[0][14] = data[266]; buffer[0][15] = data[267]; buffer[0][16] = data[268]; buffer[0][17] = data[269]; buffer[0][18] = data[270]; buffer[0][19] = data[271]; buffer[0][20] = data[272]; buffer[0][21] = data[273]; buffer[0][22] = data[274]; buffer[0][23] = data[275]; buffer[0][24] = data[276]; buffer[0][25] = data[277]; buffer[0][26] = data[278]; buffer[0][27] = data[279]; buffer[0][28] = data[280]; buffer[0][29] = data[281]; buffer[0][30] = data[282]; buffer[0][31] = data[283]; buffer[0][32] = data[284]; buffer[0][33] = data[285]; buffer[0][34] = data[286]; buffer[0][35] = data[287]; + + } + if (partition == 8) { + buffer[0][0] = data[288]; buffer[0][1] = data[289]; buffer[0][2] = data[290]; buffer[0][3] = data[291]; buffer[0][4] = data[292]; buffer[0][5] = data[293]; buffer[0][6] = data[294]; buffer[0][7] = data[295]; buffer[0][8] = data[296]; buffer[0][9] = data[297]; buffer[0][10] = data[298]; buffer[0][11] = data[299]; buffer[0][12] = data[300]; buffer[0][13] = data[301]; buffer[0][14] = data[302]; buffer[0][15] = data[303]; buffer[0][16] = data[304]; buffer[0][17] = data[305]; buffer[0][18] = data[306]; buffer[0][19] = data[307]; buffer[0][20] = data[308]; buffer[0][21] = data[309]; buffer[0][22] = data[310]; buffer[0][23] = data[311]; buffer[0][24] = data[312]; buffer[0][25] = data[313]; buffer[0][26] = data[314]; buffer[0][27] = data[315]; buffer[0][28] = data[316]; buffer[0][29] = data[317]; buffer[0][30] = data[318]; buffer[0][31] = data[319]; buffer[0][32] = data[320]; buffer[0][33] = data[321]; buffer[0][34] = data[322]; buffer[0][35] = data[323]; + + } + if (partition == 9) { + buffer[0][0] = data[324]; buffer[0][1] = data[325]; buffer[0][2] = data[326]; buffer[0][3] = data[327]; buffer[0][4] = data[328]; buffer[0][5] = data[329]; buffer[0][6] = data[330]; buffer[0][7] = data[331]; buffer[0][8] = data[332]; buffer[0][9] = data[333]; buffer[0][10] = data[334]; buffer[0][11] = data[335]; 
buffer[0][12] = data[336]; buffer[0][13] = data[337]; buffer[0][14] = data[338]; buffer[0][15] = data[339]; buffer[0][16] = data[340]; buffer[0][17] = data[341]; buffer[0][18] = data[342]; buffer[0][19] = data[343]; buffer[0][20] = data[344]; buffer[0][21] = data[345]; buffer[0][22] = data[346]; buffer[0][23] = data[347]; buffer[0][24] = data[348]; buffer[0][25] = data[349]; buffer[0][26] = data[350]; buffer[0][27] = data[351]; buffer[0][28] = data[352]; buffer[0][29] = data[353]; buffer[0][30] = data[354]; buffer[0][31] = data[355]; buffer[0][32] = data[356]; buffer[0][33] = data[357]; buffer[0][34] = data[358]; buffer[0][35] = data[359]; + + } + if (partition == 10) { + buffer[0][0] = data[360]; buffer[0][1] = data[361]; buffer[0][2] = data[362]; buffer[0][3] = data[363]; buffer[0][4] = data[364]; buffer[0][5] = data[365]; buffer[0][6] = data[366]; buffer[0][7] = data[367]; buffer[0][8] = data[368]; buffer[0][9] = data[369]; buffer[0][10] = data[370]; buffer[0][11] = data[371]; buffer[0][12] = data[372]; buffer[0][13] = data[373]; buffer[0][14] = data[374]; buffer[0][15] = data[375]; buffer[0][16] = data[376]; buffer[0][17] = data[377]; buffer[0][18] = data[378]; buffer[0][19] = data[379]; buffer[0][20] = data[380]; buffer[0][21] = data[381]; buffer[0][22] = data[382]; buffer[0][23] = data[383]; buffer[0][24] = data[384]; buffer[0][25] = data[385]; buffer[0][26] = data[386]; buffer[0][27] = data[387]; buffer[0][28] = data[388]; buffer[0][29] = data[389]; buffer[0][30] = data[390]; buffer[0][31] = data[391]; buffer[0][32] = data[392]; buffer[0][33] = data[393]; buffer[0][34] = data[394]; buffer[0][35] = data[395]; + + } + if (partition == 11) { + buffer[0][0] = data[396]; buffer[0][1] = data[397]; buffer[0][2] = data[398]; buffer[0][3] = data[399]; buffer[0][4] = data[400]; buffer[0][5] = data[401]; buffer[0][6] = data[402]; buffer[0][7] = data[403]; buffer[0][8] = data[404]; buffer[0][9] = data[405]; buffer[0][10] = data[406]; buffer[0][11] = data[407]; buffer[0][12] = data[408]; buffer[0][13] = data[409]; buffer[0][14] = data[410]; buffer[0][15] = data[411]; buffer[0][16] = data[412]; buffer[0][17] = data[413]; buffer[0][18] = data[414]; buffer[0][19] = data[415]; buffer[0][20] = data[416]; buffer[0][21] = data[417]; buffer[0][22] = data[418]; buffer[0][23] = data[419]; buffer[0][24] = data[420]; buffer[0][25] = data[421]; buffer[0][26] = data[422]; buffer[0][27] = data[423]; buffer[0][28] = data[424]; buffer[0][29] = data[425]; buffer[0][30] = data[426]; buffer[0][31] = data[427]; buffer[0][32] = data[428]; buffer[0][33] = data[429]; buffer[0][34] = data[430]; buffer[0][35] = data[431]; + + } + if (partition == 12) { + buffer[0][0] = data[432]; buffer[0][1] = data[433]; buffer[0][2] = data[434]; buffer[0][3] = data[435]; buffer[0][4] = data[436]; buffer[0][5] = data[437]; buffer[0][6] = data[438]; buffer[0][7] = data[439]; buffer[0][8] = data[440]; buffer[0][9] = data[441]; buffer[0][10] = data[442]; buffer[0][11] = data[443]; buffer[0][12] = data[444]; buffer[0][13] = data[445]; buffer[0][14] = data[446]; buffer[0][15] = data[447]; buffer[0][16] = data[448]; buffer[0][17] = data[449]; buffer[0][18] = data[450]; buffer[0][19] = data[451]; buffer[0][20] = data[452]; buffer[0][21] = data[453]; buffer[0][22] = data[454]; buffer[0][23] = data[455]; buffer[0][24] = data[456]; buffer[0][25] = data[457]; buffer[0][26] = data[458]; buffer[0][27] = data[459]; buffer[0][28] = data[460]; buffer[0][29] = data[461]; buffer[0][30] = data[462]; buffer[0][31] = data[463]; buffer[0][32] = data[464]; 
buffer[0][33] = data[465]; buffer[0][34] = data[466]; buffer[0][35] = data[467]; + + } + if (partition == 13) { + buffer[0][0] = data[468]; buffer[0][1] = data[469]; buffer[0][2] = data[470]; buffer[0][3] = data[471]; buffer[0][4] = data[472]; buffer[0][5] = data[473]; buffer[0][6] = data[474]; buffer[0][7] = data[475]; buffer[0][8] = data[476]; buffer[0][9] = data[477]; buffer[0][10] = data[478]; buffer[0][11] = data[479]; buffer[0][12] = data[480]; buffer[0][13] = data[481]; buffer[0][14] = data[482]; buffer[0][15] = data[483]; buffer[0][16] = data[484]; buffer[0][17] = data[485]; buffer[0][18] = data[486]; buffer[0][19] = data[487]; buffer[0][20] = data[488]; buffer[0][21] = data[489]; buffer[0][22] = data[490]; buffer[0][23] = data[491]; buffer[0][24] = data[492]; buffer[0][25] = data[493]; buffer[0][26] = data[494]; buffer[0][27] = data[495]; buffer[0][28] = data[496]; buffer[0][29] = data[497]; buffer[0][30] = data[498]; buffer[0][31] = data[499]; buffer[0][32] = data[500]; buffer[0][33] = data[501]; buffer[0][34] = data[502]; buffer[0][35] = data[503]; + + } + if (partition == 14) { + buffer[0][0] = data[504]; buffer[0][1] = data[505]; buffer[0][2] = data[506]; buffer[0][3] = data[507]; buffer[0][4] = data[508]; buffer[0][5] = data[509]; buffer[0][6] = data[510]; buffer[0][7] = data[511]; buffer[0][8] = data[512]; buffer[0][9] = data[513]; buffer[0][10] = data[514]; buffer[0][11] = data[515]; buffer[0][12] = data[516]; buffer[0][13] = data[517]; buffer[0][14] = data[518]; buffer[0][15] = data[519]; buffer[0][16] = data[520]; buffer[0][17] = data[521]; buffer[0][18] = data[522]; buffer[0][19] = data[523]; buffer[0][20] = data[524]; buffer[0][21] = data[525]; buffer[0][22] = data[526]; buffer[0][23] = data[527]; buffer[0][24] = data[528]; buffer[0][25] = data[529]; buffer[0][26] = data[530]; buffer[0][27] = data[531]; buffer[0][28] = data[532]; buffer[0][29] = data[533]; buffer[0][30] = data[534]; buffer[0][31] = data[535]; buffer[0][32] = data[536]; buffer[0][33] = data[537]; buffer[0][34] = data[538]; buffer[0][35] = data[539]; + + } + if (partition == 15) { + buffer[0][0] = data[540]; buffer[0][1] = data[541]; buffer[0][2] = data[542]; buffer[0][3] = data[543]; buffer[0][4] = data[544]; buffer[0][5] = data[545]; buffer[0][6] = data[546]; buffer[0][7] = data[547]; buffer[0][8] = data[548]; buffer[0][9] = data[549]; buffer[0][10] = data[550]; buffer[0][11] = data[551]; buffer[0][12] = data[552]; buffer[0][13] = data[553]; buffer[0][14] = data[554]; buffer[0][15] = data[555]; buffer[0][16] = data[556]; buffer[0][17] = data[557]; buffer[0][18] = data[558]; buffer[0][19] = data[559]; buffer[0][20] = data[560]; buffer[0][21] = data[561]; buffer[0][22] = data[562]; buffer[0][23] = data[563]; buffer[0][24] = data[564]; buffer[0][25] = data[565]; buffer[0][26] = data[566]; buffer[0][27] = data[567]; buffer[0][28] = data[568]; buffer[0][29] = data[569]; buffer[0][30] = data[570]; buffer[0][31] = data[571]; buffer[0][32] = data[572]; buffer[0][33] = data[573]; buffer[0][34] = data[574]; buffer[0][35] = data[575]; + + } + if (partition == 16) { + buffer[0][0] = data[576]; buffer[0][1] = data[577]; buffer[0][2] = data[578]; buffer[0][3] = data[579]; buffer[0][4] = data[580]; buffer[0][5] = data[581]; buffer[0][6] = data[582]; buffer[0][7] = data[583]; buffer[0][8] = data[584]; buffer[0][9] = data[585]; buffer[0][10] = data[586]; buffer[0][11] = data[587]; buffer[0][12] = data[588]; buffer[0][13] = data[589]; buffer[0][14] = data[590]; buffer[0][15] = data[591]; buffer[0][16] = data[592]; 
buffer[0][17] = data[593]; buffer[0][18] = data[594]; buffer[0][19] = data[595]; buffer[0][20] = data[596]; buffer[0][21] = data[597]; buffer[0][22] = data[598]; buffer[0][23] = data[599]; buffer[0][24] = data[600]; buffer[0][25] = data[601]; buffer[0][26] = data[602]; buffer[0][27] = data[603]; buffer[0][28] = data[604]; buffer[0][29] = data[605]; buffer[0][30] = data[606]; buffer[0][31] = data[607]; buffer[0][32] = data[608]; buffer[0][33] = data[609]; buffer[0][34] = data[610]; buffer[0][35] = data[611]; + + } + if (partition == 17) { + buffer[0][0] = data[612]; buffer[0][1] = data[613]; buffer[0][2] = data[614]; buffer[0][3] = data[615]; buffer[0][4] = data[616]; buffer[0][5] = data[617]; buffer[0][6] = data[618]; buffer[0][7] = data[619]; buffer[0][8] = data[620]; buffer[0][9] = data[621]; buffer[0][10] = data[622]; buffer[0][11] = data[623]; buffer[0][12] = data[624]; buffer[0][13] = data[625]; buffer[0][14] = data[626]; buffer[0][15] = data[627]; buffer[0][16] = data[628]; buffer[0][17] = data[629]; buffer[0][18] = data[630]; buffer[0][19] = data[631]; buffer[0][20] = data[632]; buffer[0][21] = data[633]; buffer[0][22] = data[634]; buffer[0][23] = data[635]; buffer[0][24] = data[636]; buffer[0][25] = data[637]; buffer[0][26] = data[638]; buffer[0][27] = data[639]; buffer[0][28] = data[640]; buffer[0][29] = data[641]; buffer[0][30] = data[642]; buffer[0][31] = data[643]; buffer[0][32] = data[644]; buffer[0][33] = data[645]; buffer[0][34] = data[646]; buffer[0][35] = data[647]; + + } + if (partition == 18) { + buffer[0][0] = data[648]; buffer[0][1] = data[649]; buffer[0][2] = data[650]; buffer[0][3] = data[651]; buffer[0][4] = data[652]; buffer[0][5] = data[653]; buffer[0][6] = data[654]; buffer[0][7] = data[655]; buffer[0][8] = data[656]; buffer[0][9] = data[657]; buffer[0][10] = data[658]; buffer[0][11] = data[659]; buffer[0][12] = data[660]; buffer[0][13] = data[661]; buffer[0][14] = data[662]; buffer[0][15] = data[663]; buffer[0][16] = data[664]; buffer[0][17] = data[665]; buffer[0][18] = data[666]; buffer[0][19] = data[667]; buffer[0][20] = data[668]; buffer[0][21] = data[669]; buffer[0][22] = data[670]; buffer[0][23] = data[671]; buffer[0][24] = data[672]; buffer[0][25] = data[673]; buffer[0][26] = data[674]; buffer[0][27] = data[675]; buffer[0][28] = data[676]; buffer[0][29] = data[677]; buffer[0][30] = data[678]; buffer[0][31] = data[679]; buffer[0][32] = data[680]; buffer[0][33] = data[681]; buffer[0][34] = data[682]; buffer[0][35] = data[683]; + + } + if (partition == 19) { + buffer[0][0] = data[684]; buffer[0][1] = data[685]; buffer[0][2] = data[686]; buffer[0][3] = data[687]; buffer[0][4] = data[688]; buffer[0][5] = data[689]; buffer[0][6] = data[690]; buffer[0][7] = data[691]; buffer[0][8] = data[692]; buffer[0][9] = data[693]; buffer[0][10] = data[694]; buffer[0][11] = data[695]; buffer[0][12] = data[696]; buffer[0][13] = data[697]; buffer[0][14] = data[698]; buffer[0][15] = data[699]; buffer[0][16] = data[700]; buffer[0][17] = data[701]; buffer[0][18] = data[702]; buffer[0][19] = data[703]; buffer[0][20] = data[704]; buffer[0][21] = data[705]; buffer[0][22] = data[706]; buffer[0][23] = data[707]; buffer[0][24] = data[708]; buffer[0][25] = data[709]; buffer[0][26] = data[710]; buffer[0][27] = data[711]; buffer[0][28] = data[712]; buffer[0][29] = data[713]; buffer[0][30] = data[714]; buffer[0][31] = data[715]; buffer[0][32] = data[716]; buffer[0][33] = data[717]; buffer[0][34] = data[718]; buffer[0][35] = data[719]; + + } + if (partition == 20) { + buffer[0][0] = 
data[720]; buffer[0][1] = data[721]; buffer[0][2] = data[722]; buffer[0][3] = data[723]; buffer[0][4] = data[724]; buffer[0][5] = data[725]; buffer[0][6] = data[726]; buffer[0][7] = data[727]; buffer[0][8] = data[728]; buffer[0][9] = data[729]; buffer[0][10] = data[730]; buffer[0][11] = data[731]; buffer[0][12] = data[732]; buffer[0][13] = data[733]; buffer[0][14] = data[734]; buffer[0][15] = data[735]; buffer[0][16] = data[736]; buffer[0][17] = data[737]; buffer[0][18] = data[738]; buffer[0][19] = data[739]; buffer[0][20] = data[740]; buffer[0][21] = data[741]; buffer[0][22] = data[742]; buffer[0][23] = data[743]; buffer[0][24] = data[744]; buffer[0][25] = data[745]; buffer[0][26] = data[746]; buffer[0][27] = data[747]; buffer[0][28] = data[748]; buffer[0][29] = data[749]; buffer[0][30] = data[750]; buffer[0][31] = data[751]; buffer[0][32] = data[752]; buffer[0][33] = data[753]; buffer[0][34] = data[754]; buffer[0][35] = data[755]; + + } + if (partition == 21) { + buffer[0][0] = data[756]; buffer[0][1] = data[757]; buffer[0][2] = data[758]; buffer[0][3] = data[759]; buffer[0][4] = data[760]; buffer[0][5] = data[761]; buffer[0][6] = data[762]; buffer[0][7] = data[763]; buffer[0][8] = data[764]; buffer[0][9] = data[765]; buffer[0][10] = data[766]; buffer[0][11] = data[767]; buffer[0][12] = data[768]; buffer[0][13] = data[769]; buffer[0][14] = data[770]; buffer[0][15] = data[771]; buffer[0][16] = data[772]; buffer[0][17] = data[773]; buffer[0][18] = data[774]; buffer[0][19] = data[775]; buffer[0][20] = data[776]; buffer[0][21] = data[777]; buffer[0][22] = data[778]; buffer[0][23] = data[779]; buffer[0][24] = data[780]; buffer[0][25] = data[781]; buffer[0][26] = data[782]; buffer[0][27] = data[783]; buffer[0][28] = data[784]; buffer[0][29] = data[785]; buffer[0][30] = data[786]; buffer[0][31] = data[787]; buffer[0][32] = data[788]; buffer[0][33] = data[789]; buffer[0][34] = data[790]; buffer[0][35] = data[791]; + + } + if (partition == 22) { + buffer[0][0] = data[792]; buffer[0][1] = data[793]; buffer[0][2] = data[794]; buffer[0][3] = data[795]; buffer[0][4] = data[796]; buffer[0][5] = data[797]; buffer[0][6] = data[798]; buffer[0][7] = data[799]; buffer[0][8] = data[800]; buffer[0][9] = data[801]; buffer[0][10] = data[802]; buffer[0][11] = data[803]; buffer[0][12] = data[804]; buffer[0][13] = data[805]; buffer[0][14] = data[806]; buffer[0][15] = data[807]; buffer[0][16] = data[808]; buffer[0][17] = data[809]; buffer[0][18] = data[810]; buffer[0][19] = data[811]; buffer[0][20] = data[812]; buffer[0][21] = data[813]; buffer[0][22] = data[814]; buffer[0][23] = data[815]; buffer[0][24] = data[816]; buffer[0][25] = data[817]; buffer[0][26] = data[818]; buffer[0][27] = data[819]; buffer[0][28] = data[820]; buffer[0][29] = data[821]; buffer[0][30] = data[822]; buffer[0][31] = data[823]; buffer[0][32] = data[824]; buffer[0][33] = data[825]; buffer[0][34] = data[826]; buffer[0][35] = data[827]; + + } + if (partition == 23) { + buffer[0][0] = data[828]; buffer[0][1] = data[829]; buffer[0][2] = data[830]; buffer[0][3] = data[831]; buffer[0][4] = data[832]; buffer[0][5] = data[833]; buffer[0][6] = data[834]; buffer[0][7] = data[835]; buffer[0][8] = data[836]; buffer[0][9] = data[837]; buffer[0][10] = data[838]; buffer[0][11] = data[839]; buffer[0][12] = data[840]; buffer[0][13] = data[841]; buffer[0][14] = data[842]; buffer[0][15] = data[843]; buffer[0][16] = data[844]; buffer[0][17] = data[845]; buffer[0][18] = data[846]; buffer[0][19] = data[847]; buffer[0][20] = data[848]; buffer[0][21] = data[849]; 
buffer[0][22] = data[850]; buffer[0][23] = data[851]; buffer[0][24] = data[852]; buffer[0][25] = data[853]; buffer[0][26] = data[854]; buffer[0][27] = data[855]; buffer[0][28] = data[856]; buffer[0][29] = data[857]; buffer[0][30] = data[858]; buffer[0][31] = data[859]; buffer[0][32] = data[860]; buffer[0][33] = data[861]; buffer[0][34] = data[862]; buffer[0][35] = data[863]; + + } + if (partition == 24) { + buffer[0][0] = data[864]; buffer[0][1] = data[865]; buffer[0][2] = data[866]; buffer[0][3] = data[867]; buffer[0][4] = data[868]; buffer[0][5] = data[869]; buffer[0][6] = data[870]; buffer[0][7] = data[871]; buffer[0][8] = data[872]; buffer[0][9] = data[873]; buffer[0][10] = data[874]; buffer[0][11] = data[875]; buffer[0][12] = data[876]; buffer[0][13] = data[877]; buffer[0][14] = data[878]; buffer[0][15] = data[879]; buffer[0][16] = data[880]; buffer[0][17] = data[881]; buffer[0][18] = data[882]; buffer[0][19] = data[883]; buffer[0][20] = data[884]; buffer[0][21] = data[885]; buffer[0][22] = data[886]; buffer[0][23] = data[887]; buffer[0][24] = data[888]; buffer[0][25] = data[889]; buffer[0][26] = data[890]; buffer[0][27] = data[891]; buffer[0][28] = data[892]; buffer[0][29] = data[893]; buffer[0][30] = data[894]; buffer[0][31] = data[895]; buffer[0][32] = data[896]; buffer[0][33] = data[897]; buffer[0][34] = data[898]; buffer[0][35] = data[899]; + + } + if (partition == 25) { + buffer[0][0] = data[900]; buffer[0][1] = data[901]; buffer[0][2] = data[902]; buffer[0][3] = data[903]; buffer[0][4] = data[904]; buffer[0][5] = data[905]; buffer[0][6] = data[906]; buffer[0][7] = data[907]; buffer[0][8] = data[908]; buffer[0][9] = data[909]; buffer[0][10] = data[910]; buffer[0][11] = data[911]; buffer[0][12] = data[912]; buffer[0][13] = data[913]; buffer[0][14] = data[914]; buffer[0][15] = data[915]; buffer[0][16] = data[916]; buffer[0][17] = data[917]; buffer[0][18] = data[918]; buffer[0][19] = data[919]; buffer[0][20] = data[920]; buffer[0][21] = data[921]; buffer[0][22] = data[922]; buffer[0][23] = data[923]; buffer[0][24] = data[924]; buffer[0][25] = data[925]; buffer[0][26] = data[926]; buffer[0][27] = data[927]; buffer[0][28] = data[928]; buffer[0][29] = data[929]; buffer[0][30] = data[930]; buffer[0][31] = data[931]; buffer[0][32] = data[932]; buffer[0][33] = data[933]; buffer[0][34] = data[934]; buffer[0][35] = data[935]; + + } + if (partition == 26) { + buffer[0][0] = data[936]; buffer[0][1] = data[937]; buffer[0][2] = data[938]; buffer[0][3] = data[939]; buffer[0][4] = data[940]; buffer[0][5] = data[941]; buffer[0][6] = data[942]; buffer[0][7] = data[943]; buffer[0][8] = data[944]; buffer[0][9] = data[945]; buffer[0][10] = data[946]; buffer[0][11] = data[947]; buffer[0][12] = data[948]; buffer[0][13] = data[949]; buffer[0][14] = data[950]; buffer[0][15] = data[951]; buffer[0][16] = data[952]; buffer[0][17] = data[953]; buffer[0][18] = data[954]; buffer[0][19] = data[955]; buffer[0][20] = data[956]; buffer[0][21] = data[957]; buffer[0][22] = data[958]; buffer[0][23] = data[959]; buffer[0][24] = data[960]; buffer[0][25] = data[961]; buffer[0][26] = data[962]; buffer[0][27] = data[963]; buffer[0][28] = data[964]; buffer[0][29] = data[965]; buffer[0][30] = data[966]; buffer[0][31] = data[967]; buffer[0][32] = data[968]; buffer[0][33] = data[969]; buffer[0][34] = data[970]; buffer[0][35] = data[971]; + + } + if (partition == 27) { + buffer[0][0] = data[972]; buffer[0][1] = data[973]; buffer[0][2] = data[974]; buffer[0][3] = data[975]; buffer[0][4] = data[976]; buffer[0][5] = data[977]; 
buffer[0][6] = data[978]; buffer[0][7] = data[979]; buffer[0][8] = data[980]; buffer[0][9] = data[981]; buffer[0][10] = data[982]; buffer[0][11] = data[983]; buffer[0][12] = data[984]; buffer[0][13] = data[985]; buffer[0][14] = data[986]; buffer[0][15] = data[987]; buffer[0][16] = data[988]; buffer[0][17] = data[989]; buffer[0][18] = data[990]; buffer[0][19] = data[991]; buffer[0][20] = data[992]; buffer[0][21] = data[993]; buffer[0][22] = data[994]; buffer[0][23] = data[995]; buffer[0][24] = data[996]; buffer[0][25] = data[997]; buffer[0][26] = data[998]; buffer[0][27] = data[999]; buffer[0][28] = data[1000]; buffer[0][29] = data[1001]; buffer[0][30] = data[1002]; buffer[0][31] = data[1003]; buffer[0][32] = data[1004]; buffer[0][33] = data[1005]; buffer[0][34] = data[1006]; buffer[0][35] = data[1007]; + + } + if (partition == 28) { + buffer[0][0] = data[1008]; buffer[0][1] = data[1009]; buffer[0][2] = data[1010]; buffer[0][3] = data[1011]; buffer[0][4] = data[1012]; buffer[0][5] = data[1013]; buffer[0][6] = data[1014]; buffer[0][7] = data[1015]; buffer[0][8] = data[1016]; buffer[0][9] = data[1017]; buffer[0][10] = data[1018]; buffer[0][11] = data[1019]; buffer[0][12] = data[1020]; buffer[0][13] = data[1021]; buffer[0][14] = data[1022]; buffer[0][15] = data[1023]; buffer[0][16] = data[1024]; buffer[0][17] = data[1025]; buffer[0][18] = data[1026]; buffer[0][19] = data[1027]; buffer[0][20] = data[1028]; buffer[0][21] = data[1029]; buffer[0][22] = data[1030]; buffer[0][23] = data[1031]; buffer[0][24] = data[1032]; buffer[0][25] = data[1033]; buffer[0][26] = data[1034]; buffer[0][27] = data[1035]; buffer[0][28] = data[1036]; buffer[0][29] = data[1037]; buffer[0][30] = data[1038]; buffer[0][31] = data[1039]; buffer[0][32] = data[1040]; buffer[0][33] = data[1041]; buffer[0][34] = data[1042]; buffer[0][35] = data[1043]; + + } + if (partition == 29) { + buffer[0][0] = data[1044]; buffer[0][1] = data[1045]; buffer[0][2] = data[1046]; buffer[0][3] = data[1047]; buffer[0][4] = data[1048]; buffer[0][5] = data[1049]; buffer[0][6] = data[1050]; buffer[0][7] = data[1051]; buffer[0][8] = data[1052]; buffer[0][9] = data[1053]; buffer[0][10] = data[1054]; buffer[0][11] = data[1055]; buffer[0][12] = data[1056]; buffer[0][13] = data[1057]; buffer[0][14] = data[1058]; buffer[0][15] = data[1059]; buffer[0][16] = data[1060]; buffer[0][17] = data[1061]; buffer[0][18] = data[1062]; buffer[0][19] = data[1063]; buffer[0][20] = data[1064]; buffer[0][21] = data[1065]; buffer[0][22] = data[1066]; buffer[0][23] = data[1067]; buffer[0][24] = data[1068]; buffer[0][25] = data[1069]; buffer[0][26] = data[1070]; buffer[0][27] = data[1071]; buffer[0][28] = data[1072]; buffer[0][29] = data[1073]; buffer[0][30] = data[1074]; buffer[0][31] = data[1075]; buffer[0][32] = data[1076]; buffer[0][33] = data[1077]; buffer[0][34] = data[1078]; buffer[0][35] = data[1079]; + + } + if (partition == 30) { + buffer[0][0] = data[1080]; buffer[0][1] = data[1081]; buffer[0][2] = data[1082]; buffer[0][3] = data[1083]; buffer[0][4] = data[1084]; buffer[0][5] = data[1085]; buffer[0][6] = data[1086]; buffer[0][7] = data[1087]; buffer[0][8] = data[1088]; buffer[0][9] = data[1089]; buffer[0][10] = data[1090]; buffer[0][11] = data[1091]; buffer[0][12] = data[1092]; buffer[0][13] = data[1093]; buffer[0][14] = data[1094]; buffer[0][15] = data[1095]; buffer[0][16] = data[1096]; buffer[0][17] = data[1097]; buffer[0][18] = data[1098]; buffer[0][19] = data[1099]; buffer[0][20] = data[1100]; buffer[0][21] = data[1101]; buffer[0][22] = data[1102]; 
buffer[0][23] = data[1103]; buffer[0][24] = data[1104]; buffer[0][25] = data[1105]; buffer[0][26] = data[1106]; buffer[0][27] = data[1107]; buffer[0][28] = data[1108]; buffer[0][29] = data[1109]; buffer[0][30] = data[1110]; buffer[0][31] = data[1111]; buffer[0][32] = data[1112]; buffer[0][33] = data[1113]; buffer[0][34] = data[1114]; buffer[0][35] = data[1115]; + + } + if (partition == 31) { + buffer[0][0] = data[1116]; buffer[0][1] = data[1117]; buffer[0][2] = data[1118]; buffer[0][3] = data[1119]; buffer[0][4] = data[1120]; buffer[0][5] = data[1121]; buffer[0][6] = data[1122]; buffer[0][7] = data[1123]; buffer[0][8] = data[1124]; buffer[0][9] = data[1125]; buffer[0][10] = data[1126]; buffer[0][11] = data[1127]; buffer[0][12] = data[1128]; buffer[0][13] = data[1129]; buffer[0][14] = data[1130]; buffer[0][15] = data[1131]; buffer[0][16] = data[1132]; buffer[0][17] = data[1133]; buffer[0][18] = data[1134]; buffer[0][19] = data[1135]; buffer[0][20] = data[1136]; buffer[0][21] = data[1137]; buffer[0][22] = data[1138]; buffer[0][23] = data[1139]; buffer[0][24] = data[1140]; buffer[0][25] = data[1141]; buffer[0][26] = data[1142]; buffer[0][27] = data[1143]; buffer[0][28] = data[1144]; buffer[0][29] = data[1145]; buffer[0][30] = data[1146]; buffer[0][31] = data[1147]; buffer[0][32] = data[1148]; buffer[0][33] = data[1149]; buffer[0][34] = data[1150]; buffer[0][35] = data[1151]; + + } + if (partition == 32) { + buffer[0][0] = data[1152]; buffer[0][1] = data[1153]; buffer[0][2] = data[1154]; buffer[0][3] = data[1155]; buffer[0][4] = data[1156]; buffer[0][5] = data[1157]; buffer[0][6] = data[1158]; buffer[0][7] = data[1159]; buffer[0][8] = data[1160]; buffer[0][9] = data[1161]; buffer[0][10] = data[1162]; buffer[0][11] = data[1163]; buffer[0][12] = data[1164]; buffer[0][13] = data[1165]; buffer[0][14] = data[1166]; buffer[0][15] = data[1167]; buffer[0][16] = data[1168]; buffer[0][17] = data[1169]; buffer[0][18] = data[1170]; buffer[0][19] = data[1171]; buffer[0][20] = data[1172]; buffer[0][21] = data[1173]; buffer[0][22] = data[1174]; buffer[0][23] = data[1175]; buffer[0][24] = data[1176]; buffer[0][25] = data[1177]; buffer[0][26] = data[1178]; buffer[0][27] = data[1179]; buffer[0][28] = data[1180]; buffer[0][29] = data[1181]; buffer[0][30] = data[1182]; buffer[0][31] = data[1183]; buffer[0][32] = data[1184]; buffer[0][33] = data[1185]; buffer[0][34] = data[1186]; buffer[0][35] = data[1187]; + + } + if (partition == 33) { + buffer[0][0] = data[1188]; buffer[0][1] = data[1189]; buffer[0][2] = data[1190]; buffer[0][3] = data[1191]; buffer[0][4] = data[1192]; buffer[0][5] = data[1193]; buffer[0][6] = data[1194]; buffer[0][7] = data[1195]; buffer[0][8] = data[1196]; buffer[0][9] = data[1197]; buffer[0][10] = data[1198]; buffer[0][11] = data[1199]; buffer[0][12] = data[1200]; buffer[0][13] = data[1201]; buffer[0][14] = data[1202]; buffer[0][15] = data[1203]; buffer[0][16] = data[1204]; buffer[0][17] = data[1205]; buffer[0][18] = data[1206]; buffer[0][19] = data[1207]; buffer[0][20] = data[1208]; buffer[0][21] = data[1209]; buffer[0][22] = data[1210]; buffer[0][23] = data[1211]; buffer[0][24] = data[1212]; buffer[0][25] = data[1213]; buffer[0][26] = data[1214]; buffer[0][27] = data[1215]; buffer[0][28] = data[1216]; buffer[0][29] = data[1217]; buffer[0][30] = data[1218]; buffer[0][31] = data[1219]; buffer[0][32] = data[1220]; buffer[0][33] = data[1221]; buffer[0][34] = data[1222]; buffer[0][35] = data[1223]; + + } + if (partition == 34) { + buffer[0][0] = data[1224]; buffer[0][1] = data[1225]; 
buffer[0][2] = data[1226]; buffer[0][3] = data[1227]; buffer[0][4] = data[1228]; buffer[0][5] = data[1229]; buffer[0][6] = data[1230]; buffer[0][7] = data[1231]; buffer[0][8] = data[1232]; buffer[0][9] = data[1233]; buffer[0][10] = data[1234]; buffer[0][11] = data[1235]; buffer[0][12] = data[1236]; buffer[0][13] = data[1237]; buffer[0][14] = data[1238]; buffer[0][15] = data[1239]; buffer[0][16] = data[1240]; buffer[0][17] = data[1241]; buffer[0][18] = data[1242]; buffer[0][19] = data[1243]; buffer[0][20] = data[1244]; buffer[0][21] = data[1245]; buffer[0][22] = data[1246]; buffer[0][23] = data[1247]; buffer[0][24] = data[1248]; buffer[0][25] = data[1249]; buffer[0][26] = data[1250]; buffer[0][27] = data[1251]; buffer[0][28] = data[1252]; buffer[0][29] = data[1253]; buffer[0][30] = data[1254]; buffer[0][31] = data[1255]; buffer[0][32] = data[1256]; buffer[0][33] = data[1257]; buffer[0][34] = data[1258]; buffer[0][35] = data[1259]; + + } + if (partition == 35) { + buffer[0][0] = data[1260]; buffer[0][1] = data[1261]; buffer[0][2] = data[1262]; buffer[0][3] = data[1263]; buffer[0][4] = data[1264]; buffer[0][5] = data[1265]; buffer[0][6] = data[1266]; buffer[0][7] = data[1267]; buffer[0][8] = data[1268]; buffer[0][9] = data[1269]; buffer[0][10] = data[1270]; buffer[0][11] = data[1271]; buffer[0][12] = data[1272]; buffer[0][13] = data[1273]; buffer[0][14] = data[1274]; buffer[0][15] = data[1275]; buffer[0][16] = data[1276]; buffer[0][17] = data[1277]; buffer[0][18] = data[1278]; buffer[0][19] = data[1279]; buffer[0][20] = data[1280]; buffer[0][21] = data[1281]; buffer[0][22] = data[1282]; buffer[0][23] = data[1283]; buffer[0][24] = data[1284]; buffer[0][25] = data[1285]; buffer[0][26] = data[1286]; buffer[0][27] = data[1287]; buffer[0][28] = data[1288]; buffer[0][29] = data[1289]; buffer[0][30] = data[1290]; buffer[0][31] = data[1291]; buffer[0][32] = data[1292]; buffer[0][33] = data[1293]; buffer[0][34] = data[1294]; buffer[0][35] = data[1295]; + + } + if (partition == 36) { + buffer[0][0] = data[1296]; buffer[0][1] = data[1297]; buffer[0][2] = data[1298]; buffer[0][3] = data[1299]; buffer[0][4] = data[1300]; buffer[0][5] = data[1301]; buffer[0][6] = data[1302]; buffer[0][7] = data[1303]; buffer[0][8] = data[1304]; buffer[0][9] = data[1305]; buffer[0][10] = data[1306]; buffer[0][11] = data[1307]; buffer[0][12] = data[1308]; buffer[0][13] = data[1309]; buffer[0][14] = data[1310]; buffer[0][15] = data[1311]; buffer[0][16] = data[1312]; buffer[0][17] = data[1313]; buffer[0][18] = data[1314]; buffer[0][19] = data[1315]; buffer[0][20] = data[1316]; buffer[0][21] = data[1317]; buffer[0][22] = data[1318]; buffer[0][23] = data[1319]; buffer[0][24] = data[1320]; buffer[0][25] = data[1321]; buffer[0][26] = data[1322]; buffer[0][27] = data[1323]; buffer[0][28] = data[1324]; buffer[0][29] = data[1325]; buffer[0][30] = data[1326]; buffer[0][31] = data[1327]; buffer[0][32] = data[1328]; buffer[0][33] = data[1329]; buffer[0][34] = data[1330]; buffer[0][35] = data[1331]; + + } + if (partition == 37) { + buffer[0][0] = data[1332]; buffer[0][1] = data[1333]; buffer[0][2] = data[1334]; buffer[0][3] = data[1335]; buffer[0][4] = data[1336]; buffer[0][5] = data[1337]; buffer[0][6] = data[1338]; buffer[0][7] = data[1339]; buffer[0][8] = data[1340]; buffer[0][9] = data[1341]; buffer[0][10] = data[1342]; buffer[0][11] = data[1343]; buffer[0][12] = data[1344]; buffer[0][13] = data[1345]; buffer[0][14] = data[1346]; buffer[0][15] = data[1347]; buffer[0][16] = data[1348]; buffer[0][17] = data[1349]; buffer[0][18] = 
data[1350]; buffer[0][19] = data[1351]; buffer[0][20] = data[1352]; buffer[0][21] = data[1353]; buffer[0][22] = data[1354]; buffer[0][23] = data[1355]; buffer[0][24] = data[1356]; buffer[0][25] = data[1357]; buffer[0][26] = data[1358]; buffer[0][27] = data[1359]; buffer[0][28] = data[1360]; buffer[0][29] = data[1361]; buffer[0][30] = data[1362]; buffer[0][31] = data[1363]; buffer[0][32] = data[1364]; buffer[0][33] = data[1365]; buffer[0][34] = data[1366]; buffer[0][35] = data[1367]; + + } + if (partition == 38) { + buffer[0][0] = data[1368]; buffer[0][1] = data[1369]; buffer[0][2] = data[1370]; buffer[0][3] = data[1371]; buffer[0][4] = data[1372]; buffer[0][5] = data[1373]; buffer[0][6] = data[1374]; buffer[0][7] = data[1375]; buffer[0][8] = data[1376]; buffer[0][9] = data[1377]; buffer[0][10] = data[1378]; buffer[0][11] = data[1379]; buffer[0][12] = data[1380]; buffer[0][13] = data[1381]; buffer[0][14] = data[1382]; buffer[0][15] = data[1383]; buffer[0][16] = data[1384]; buffer[0][17] = data[1385]; buffer[0][18] = data[1386]; buffer[0][19] = data[1387]; buffer[0][20] = data[1388]; buffer[0][21] = data[1389]; buffer[0][22] = data[1390]; buffer[0][23] = data[1391]; buffer[0][24] = data[1392]; buffer[0][25] = data[1393]; buffer[0][26] = data[1394]; buffer[0][27] = data[1395]; buffer[0][28] = data[1396]; buffer[0][29] = data[1397]; buffer[0][30] = data[1398]; buffer[0][31] = data[1399]; buffer[0][32] = data[1400]; buffer[0][33] = data[1401]; buffer[0][34] = data[1402]; buffer[0][35] = data[1403]; + + } + if (partition == 39) { + buffer[0][0] = data[1404]; buffer[0][1] = data[1405]; buffer[0][2] = data[1406]; buffer[0][3] = data[1407]; buffer[0][4] = data[1408]; buffer[0][5] = data[1409]; buffer[0][6] = data[1410]; buffer[0][7] = data[1411]; buffer[0][8] = data[1412]; buffer[0][9] = data[1413]; buffer[0][10] = data[1414]; buffer[0][11] = data[1415]; buffer[0][12] = data[1416]; buffer[0][13] = data[1417]; buffer[0][14] = data[1418]; buffer[0][15] = data[1419]; buffer[0][16] = data[1420]; buffer[0][17] = data[1421]; buffer[0][18] = data[1422]; buffer[0][19] = data[1423]; buffer[0][20] = data[1424]; buffer[0][21] = data[1425]; buffer[0][22] = data[1426]; buffer[0][23] = data[1427]; buffer[0][24] = data[1428]; buffer[0][25] = data[1429]; buffer[0][26] = data[1430]; buffer[0][27] = data[1431]; buffer[0][28] = data[1432]; buffer[0][29] = data[1433]; buffer[0][30] = data[1434]; buffer[0][31] = data[1435]; buffer[0][32] = data[1436]; buffer[0][33] = data[1437]; buffer[0][34] = data[1438]; buffer[0][35] = data[1439]; + + } + if (partition == 40) { + buffer[0][0] = data[1440]; buffer[0][1] = data[1441]; buffer[0][2] = data[1442]; buffer[0][3] = data[1443]; buffer[0][4] = data[1444]; buffer[0][5] = data[1445]; buffer[0][6] = data[1446]; buffer[0][7] = data[1447]; buffer[0][8] = data[1448]; buffer[0][9] = data[1449]; buffer[0][10] = data[1450]; buffer[0][11] = data[1451]; buffer[0][12] = data[1452]; buffer[0][13] = data[1453]; buffer[0][14] = data[1454]; buffer[0][15] = data[1455]; buffer[0][16] = data[1456]; buffer[0][17] = data[1457]; buffer[0][18] = data[1458]; buffer[0][19] = data[1459]; buffer[0][20] = data[1460]; buffer[0][21] = data[1461]; buffer[0][22] = data[1462]; buffer[0][23] = data[1463]; buffer[0][24] = data[1464]; buffer[0][25] = data[1465]; buffer[0][26] = data[1466]; buffer[0][27] = data[1467]; buffer[0][28] = data[1468]; buffer[0][29] = data[1469]; buffer[0][30] = data[1470]; buffer[0][31] = data[1471]; buffer[0][32] = data[1472]; buffer[0][33] = data[1473]; buffer[0][34] = data[1474]; 
buffer[0][35] = data[1475]; + + } + if (partition == 41) { + buffer[0][0] = data[1476]; buffer[0][1] = data[1477]; buffer[0][2] = data[1478]; buffer[0][3] = data[1479]; buffer[0][4] = data[1480]; buffer[0][5] = data[1481]; buffer[0][6] = data[1482]; buffer[0][7] = data[1483]; buffer[0][8] = data[1484]; buffer[0][9] = data[1485]; buffer[0][10] = data[1486]; buffer[0][11] = data[1487]; buffer[0][12] = data[1488]; buffer[0][13] = data[1489]; buffer[0][14] = data[1490]; buffer[0][15] = data[1491]; buffer[0][16] = data[1492]; buffer[0][17] = data[1493]; buffer[0][18] = data[1494]; buffer[0][19] = data[1495]; buffer[0][20] = data[1496]; buffer[0][21] = data[1497]; buffer[0][22] = data[1498]; buffer[0][23] = data[1499]; buffer[0][24] = data[1500]; buffer[0][25] = data[1501]; buffer[0][26] = data[1502]; buffer[0][27] = data[1503]; buffer[0][28] = data[1504]; buffer[0][29] = data[1505]; buffer[0][30] = data[1506]; buffer[0][31] = data[1507]; buffer[0][32] = data[1508]; buffer[0][33] = data[1509]; buffer[0][34] = data[1510]; buffer[0][35] = data[1511]; + + } + if (partition == 42) { + buffer[0][0] = data[1512]; buffer[0][1] = data[1513]; buffer[0][2] = data[1514]; buffer[0][3] = data[1515]; buffer[0][4] = data[1516]; buffer[0][5] = data[1517]; buffer[0][6] = data[1518]; buffer[0][7] = data[1519]; buffer[0][8] = data[1520]; buffer[0][9] = data[1521]; buffer[0][10] = data[1522]; buffer[0][11] = data[1523]; buffer[0][12] = data[1524]; buffer[0][13] = data[1525]; buffer[0][14] = data[1526]; buffer[0][15] = data[1527]; buffer[0][16] = data[1528]; buffer[0][17] = data[1529]; buffer[0][18] = data[1530]; buffer[0][19] = data[1531]; buffer[0][20] = data[1532]; buffer[0][21] = data[1533]; buffer[0][22] = data[1534]; buffer[0][23] = data[1535]; buffer[0][24] = data[1536]; buffer[0][25] = data[1537]; buffer[0][26] = data[1538]; buffer[0][27] = data[1539]; buffer[0][28] = data[1540]; buffer[0][29] = data[1541]; buffer[0][30] = data[1542]; buffer[0][31] = data[1543]; buffer[0][32] = data[1544]; buffer[0][33] = data[1545]; buffer[0][34] = data[1546]; buffer[0][35] = data[1547]; + + } + if (partition == 43) { + buffer[0][0] = data[1548]; buffer[0][1] = data[1549]; buffer[0][2] = data[1550]; buffer[0][3] = data[1551]; buffer[0][4] = data[1552]; buffer[0][5] = data[1553]; buffer[0][6] = data[1554]; buffer[0][7] = data[1555]; buffer[0][8] = data[1556]; buffer[0][9] = data[1557]; buffer[0][10] = data[1558]; buffer[0][11] = data[1559]; buffer[0][12] = data[1560]; buffer[0][13] = data[1561]; buffer[0][14] = data[1562]; buffer[0][15] = data[1563]; buffer[0][16] = data[1564]; buffer[0][17] = data[1565]; buffer[0][18] = data[1566]; buffer[0][19] = data[1567]; buffer[0][20] = data[1568]; buffer[0][21] = data[1569]; buffer[0][22] = data[1570]; buffer[0][23] = data[1571]; buffer[0][24] = data[1572]; buffer[0][25] = data[1573]; buffer[0][26] = data[1574]; buffer[0][27] = data[1575]; buffer[0][28] = data[1576]; buffer[0][29] = data[1577]; buffer[0][30] = data[1578]; buffer[0][31] = data[1579]; buffer[0][32] = data[1580]; buffer[0][33] = data[1581]; buffer[0][34] = data[1582]; buffer[0][35] = data[1583]; + + } + if (partition == 44) { + buffer[0][0] = data[1584]; buffer[0][1] = data[1585]; buffer[0][2] = data[1586]; buffer[0][3] = data[1587]; buffer[0][4] = data[1588]; buffer[0][5] = data[1589]; buffer[0][6] = data[1590]; buffer[0][7] = data[1591]; buffer[0][8] = data[1592]; buffer[0][9] = data[1593]; buffer[0][10] = data[1594]; buffer[0][11] = data[1595]; buffer[0][12] = data[1596]; buffer[0][13] = data[1597]; buffer[0][14] = 
data[1598]; buffer[0][15] = data[1599]; buffer[0][16] = data[1600]; buffer[0][17] = data[1601]; buffer[0][18] = data[1602]; buffer[0][19] = data[1603]; buffer[0][20] = data[1604]; buffer[0][21] = data[1605]; buffer[0][22] = data[1606]; buffer[0][23] = data[1607]; buffer[0][24] = data[1608]; buffer[0][25] = data[1609]; buffer[0][26] = data[1610]; buffer[0][27] = data[1611]; buffer[0][28] = data[1612]; buffer[0][29] = data[1613]; buffer[0][30] = data[1614]; buffer[0][31] = data[1615]; buffer[0][32] = data[1616]; buffer[0][33] = data[1617]; buffer[0][34] = data[1618]; buffer[0][35] = data[1619]; + + } + if (partition == 45) { + buffer[0][0] = data[1620]; buffer[0][1] = data[1621]; buffer[0][2] = data[1622]; buffer[0][3] = data[1623]; buffer[0][4] = data[1624]; buffer[0][5] = data[1625]; buffer[0][6] = data[1626]; buffer[0][7] = data[1627]; buffer[0][8] = data[1628]; buffer[0][9] = data[1629]; buffer[0][10] = data[1630]; buffer[0][11] = data[1631]; buffer[0][12] = data[1632]; buffer[0][13] = data[1633]; buffer[0][14] = data[1634]; buffer[0][15] = data[1635]; buffer[0][16] = data[1636]; buffer[0][17] = data[1637]; buffer[0][18] = data[1638]; buffer[0][19] = data[1639]; buffer[0][20] = data[1640]; buffer[0][21] = data[1641]; buffer[0][22] = data[1642]; buffer[0][23] = data[1643]; buffer[0][24] = data[1644]; buffer[0][25] = data[1645]; buffer[0][26] = data[1646]; buffer[0][27] = data[1647]; buffer[0][28] = data[1648]; buffer[0][29] = data[1649]; buffer[0][30] = data[1650]; buffer[0][31] = data[1651]; buffer[0][32] = data[1652]; buffer[0][33] = data[1653]; buffer[0][34] = data[1654]; buffer[0][35] = data[1655]; + + } + if (partition == 46) { + buffer[0][0] = data[1656]; buffer[0][1] = data[1657]; buffer[0][2] = data[1658]; buffer[0][3] = data[1659]; buffer[0][4] = data[1660]; buffer[0][5] = data[1661]; buffer[0][6] = data[1662]; buffer[0][7] = data[1663]; buffer[0][8] = data[1664]; buffer[0][9] = data[1665]; buffer[0][10] = data[1666]; buffer[0][11] = data[1667]; buffer[0][12] = data[1668]; buffer[0][13] = data[1669]; buffer[0][14] = data[1670]; buffer[0][15] = data[1671]; buffer[0][16] = data[1672]; buffer[0][17] = data[1673]; buffer[0][18] = data[1674]; buffer[0][19] = data[1675]; buffer[0][20] = data[1676]; buffer[0][21] = data[1677]; buffer[0][22] = data[1678]; buffer[0][23] = data[1679]; buffer[0][24] = data[1680]; buffer[0][25] = data[1681]; buffer[0][26] = data[1682]; buffer[0][27] = data[1683]; buffer[0][28] = data[1684]; buffer[0][29] = data[1685]; buffer[0][30] = data[1686]; buffer[0][31] = data[1687]; buffer[0][32] = data[1688]; buffer[0][33] = data[1689]; buffer[0][34] = data[1690]; buffer[0][35] = data[1691]; + + } + if (partition == 47) { + buffer[0][0] = data[1692]; buffer[0][1] = data[1693]; buffer[0][2] = data[1694]; buffer[0][3] = data[1695]; buffer[0][4] = data[1696]; buffer[0][5] = data[1697]; buffer[0][6] = data[1698]; buffer[0][7] = data[1699]; buffer[0][8] = data[1700]; buffer[0][9] = data[1701]; buffer[0][10] = data[1702]; buffer[0][11] = data[1703]; buffer[0][12] = data[1704]; buffer[0][13] = data[1705]; buffer[0][14] = data[1706]; buffer[0][15] = data[1707]; buffer[0][16] = data[1708]; buffer[0][17] = data[1709]; buffer[0][18] = data[1710]; buffer[0][19] = data[1711]; buffer[0][20] = data[1712]; buffer[0][21] = data[1713]; buffer[0][22] = data[1714]; buffer[0][23] = data[1715]; buffer[0][24] = data[1716]; buffer[0][25] = data[1717]; buffer[0][26] = data[1718]; buffer[0][27] = data[1719]; buffer[0][28] = data[1720]; buffer[0][29] = data[1721]; buffer[0][30] = data[1722]; 
buffer[0][31] = data[1723]; buffer[0][32] = data[1724]; buffer[0][33] = data[1725]; buffer[0][34] = data[1726]; buffer[0][35] = data[1727]; + + } + if (partition == 48) { + buffer[0][0] = data[1728]; buffer[0][1] = data[1729]; buffer[0][2] = data[1730]; buffer[0][3] = data[1731]; buffer[0][4] = data[1732]; buffer[0][5] = data[1733]; buffer[0][6] = data[1734]; buffer[0][7] = data[1735]; buffer[0][8] = data[1736]; buffer[0][9] = data[1737]; buffer[0][10] = data[1738]; buffer[0][11] = data[1739]; buffer[0][12] = data[1740]; buffer[0][13] = data[1741]; buffer[0][14] = data[1742]; buffer[0][15] = data[1743]; buffer[0][16] = data[1744]; buffer[0][17] = data[1745]; buffer[0][18] = data[1746]; buffer[0][19] = data[1747]; buffer[0][20] = data[1748]; buffer[0][21] = data[1749]; buffer[0][22] = data[1750]; buffer[0][23] = data[1751]; buffer[0][24] = data[1752]; buffer[0][25] = data[1753]; buffer[0][26] = data[1754]; buffer[0][27] = data[1755]; buffer[0][28] = data[1756]; buffer[0][29] = data[1757]; buffer[0][30] = data[1758]; buffer[0][31] = data[1759]; buffer[0][32] = data[1760]; buffer[0][33] = data[1761]; buffer[0][34] = data[1762]; buffer[0][35] = data[1763]; + + } + if (partition == 49) { + buffer[0][0] = data[1764]; buffer[0][1] = data[1765]; buffer[0][2] = data[1766]; buffer[0][3] = data[1767]; buffer[0][4] = data[1768]; buffer[0][5] = data[1769]; buffer[0][6] = data[1770]; buffer[0][7] = data[1771]; buffer[0][8] = data[1772]; buffer[0][9] = data[1773]; buffer[0][10] = data[1774]; buffer[0][11] = data[1775]; buffer[0][12] = data[1776]; buffer[0][13] = data[1777]; buffer[0][14] = data[1778]; buffer[0][15] = data[1779]; buffer[0][16] = data[1780]; buffer[0][17] = data[1781]; buffer[0][18] = data[1782]; buffer[0][19] = data[1783]; buffer[0][20] = data[1784]; buffer[0][21] = data[1785]; buffer[0][22] = data[1786]; buffer[0][23] = data[1787]; buffer[0][24] = data[1788]; buffer[0][25] = data[1789]; buffer[0][26] = data[1790]; buffer[0][27] = data[1791]; buffer[0][28] = data[1792]; buffer[0][29] = data[1793]; buffer[0][30] = data[1794]; buffer[0][31] = data[1795]; buffer[0][32] = data[1796]; buffer[0][33] = data[1797]; buffer[0][34] = data[1798]; buffer[0][35] = data[1799]; + + } + if (partition == 50) { + buffer[0][0] = data[1800]; buffer[0][1] = data[1801]; buffer[0][2] = data[1802]; buffer[0][3] = data[1803]; buffer[0][4] = data[1804]; buffer[0][5] = data[1805]; buffer[0][6] = data[1806]; buffer[0][7] = data[1807]; buffer[0][8] = data[1808]; buffer[0][9] = data[1809]; buffer[0][10] = data[1810]; buffer[0][11] = data[1811]; buffer[0][12] = data[1812]; buffer[0][13] = data[1813]; buffer[0][14] = data[1814]; buffer[0][15] = data[1815]; buffer[0][16] = data[1816]; buffer[0][17] = data[1817]; buffer[0][18] = data[1818]; buffer[0][19] = data[1819]; buffer[0][20] = data[1820]; buffer[0][21] = data[1821]; buffer[0][22] = data[1822]; buffer[0][23] = data[1823]; buffer[0][24] = data[1824]; buffer[0][25] = data[1825]; buffer[0][26] = data[1826]; buffer[0][27] = data[1827]; buffer[0][28] = data[1828]; buffer[0][29] = data[1829]; buffer[0][30] = data[1830]; buffer[0][31] = data[1831]; buffer[0][32] = data[1832]; buffer[0][33] = data[1833]; buffer[0][34] = data[1834]; buffer[0][35] = data[1835]; + + } + if (partition == 51) { + buffer[0][0] = data[1836]; buffer[0][1] = data[1837]; buffer[0][2] = data[1838]; buffer[0][3] = data[1839]; buffer[0][4] = data[1840]; buffer[0][5] = data[1841]; buffer[0][6] = data[1842]; buffer[0][7] = data[1843]; buffer[0][8] = data[1844]; buffer[0][9] = data[1845]; buffer[0][10] = 
data[1846]; buffer[0][11] = data[1847]; buffer[0][12] = data[1848]; buffer[0][13] = data[1849]; buffer[0][14] = data[1850]; buffer[0][15] = data[1851]; buffer[0][16] = data[1852]; buffer[0][17] = data[1853]; buffer[0][18] = data[1854]; buffer[0][19] = data[1855]; buffer[0][20] = data[1856]; buffer[0][21] = data[1857]; buffer[0][22] = data[1858]; buffer[0][23] = data[1859]; buffer[0][24] = data[1860]; buffer[0][25] = data[1861]; buffer[0][26] = data[1862]; buffer[0][27] = data[1863]; buffer[0][28] = data[1864]; buffer[0][29] = data[1865]; buffer[0][30] = data[1866]; buffer[0][31] = data[1867]; buffer[0][32] = data[1868]; buffer[0][33] = data[1869]; buffer[0][34] = data[1870]; buffer[0][35] = data[1871]; + + } + if (partition == 52) { + buffer[0][0] = data[1872]; buffer[0][1] = data[1873]; buffer[0][2] = data[1874]; buffer[0][3] = data[1875]; buffer[0][4] = data[1876]; buffer[0][5] = data[1877]; buffer[0][6] = data[1878]; buffer[0][7] = data[1879]; buffer[0][8] = data[1880]; buffer[0][9] = data[1881]; buffer[0][10] = data[1882]; buffer[0][11] = data[1883]; buffer[0][12] = data[1884]; buffer[0][13] = data[1885]; buffer[0][14] = data[1886]; buffer[0][15] = data[1887]; buffer[0][16] = data[1888]; buffer[0][17] = data[1889]; buffer[0][18] = data[1890]; buffer[0][19] = data[1891]; buffer[0][20] = data[1892]; buffer[0][21] = data[1893]; buffer[0][22] = data[1894]; buffer[0][23] = data[1895]; buffer[0][24] = data[1896]; buffer[0][25] = data[1897]; buffer[0][26] = data[1898]; buffer[0][27] = data[1899]; buffer[0][28] = data[1900]; buffer[0][29] = data[1901]; buffer[0][30] = data[1902]; buffer[0][31] = data[1903]; buffer[0][32] = data[1904]; buffer[0][33] = data[1905]; buffer[0][34] = data[1906]; buffer[0][35] = data[1907]; + + } + if (partition == 53) { + buffer[0][0] = data[1908]; buffer[0][1] = data[1909]; buffer[0][2] = data[1910]; buffer[0][3] = data[1911]; buffer[0][4] = data[1912]; buffer[0][5] = data[1913]; buffer[0][6] = data[1914]; buffer[0][7] = data[1915]; buffer[0][8] = data[1916]; buffer[0][9] = data[1917]; buffer[0][10] = data[1918]; buffer[0][11] = data[1919]; buffer[0][12] = data[1920]; buffer[0][13] = data[1921]; buffer[0][14] = data[1922]; buffer[0][15] = data[1923]; buffer[0][16] = data[1924]; buffer[0][17] = data[1925]; buffer[0][18] = data[1926]; buffer[0][19] = data[1927]; buffer[0][20] = data[1928]; buffer[0][21] = data[1929]; buffer[0][22] = data[1930]; buffer[0][23] = data[1931]; buffer[0][24] = data[1932]; buffer[0][25] = data[1933]; buffer[0][26] = data[1934]; buffer[0][27] = data[1935]; buffer[0][28] = data[1936]; buffer[0][29] = data[1937]; buffer[0][30] = data[1938]; buffer[0][31] = data[1939]; buffer[0][32] = data[1940]; buffer[0][33] = data[1941]; buffer[0][34] = data[1942]; buffer[0][35] = data[1943]; + + } + if (partition == 54) { + buffer[0][0] = data[1944]; buffer[0][1] = data[1945]; buffer[0][2] = data[1946]; buffer[0][3] = data[1947]; buffer[0][4] = data[1948]; buffer[0][5] = data[1949]; buffer[0][6] = data[1950]; buffer[0][7] = data[1951]; buffer[0][8] = data[1952]; buffer[0][9] = data[1953]; buffer[0][10] = data[1954]; buffer[0][11] = data[1955]; buffer[0][12] = data[1956]; buffer[0][13] = data[1957]; buffer[0][14] = data[1958]; buffer[0][15] = data[1959]; buffer[0][16] = data[1960]; buffer[0][17] = data[1961]; buffer[0][18] = data[1962]; buffer[0][19] = data[1963]; buffer[0][20] = data[1964]; buffer[0][21] = data[1965]; buffer[0][22] = data[1966]; buffer[0][23] = data[1967]; buffer[0][24] = data[1968]; buffer[0][25] = data[1969]; buffer[0][26] = data[1970]; 
buffer[0][27] = data[1971]; buffer[0][28] = data[1972]; buffer[0][29] = data[1973]; buffer[0][30] = data[1974]; buffer[0][31] = data[1975]; buffer[0][32] = data[1976]; buffer[0][33] = data[1977]; buffer[0][34] = data[1978]; buffer[0][35] = data[1979]; + + } + if (partition == 55) { + buffer[0][0] = data[1980]; buffer[0][1] = data[1981]; buffer[0][2] = data[1982]; buffer[0][3] = data[1983]; buffer[0][4] = data[1984]; buffer[0][5] = data[1985]; buffer[0][6] = data[1986]; buffer[0][7] = data[1987]; buffer[0][8] = data[1988]; buffer[0][9] = data[1989]; buffer[0][10] = data[1990]; buffer[0][11] = data[1991]; buffer[0][12] = data[1992]; buffer[0][13] = data[1993]; buffer[0][14] = data[1994]; buffer[0][15] = data[1995]; buffer[0][16] = data[1996]; buffer[0][17] = data[1997]; buffer[0][18] = data[1998]; buffer[0][19] = data[1999]; buffer[0][20] = data[2000]; buffer[0][21] = data[2001]; buffer[0][22] = data[2002]; buffer[0][23] = data[2003]; buffer[0][24] = data[2004]; buffer[0][25] = data[2005]; buffer[0][26] = data[2006]; buffer[0][27] = data[2007]; buffer[0][28] = data[2008]; buffer[0][29] = data[2009]; buffer[0][30] = data[2010]; buffer[0][31] = data[2011]; buffer[0][32] = data[2012]; buffer[0][33] = data[2013]; buffer[0][34] = data[2014]; buffer[0][35] = data[2015]; + + } + if (partition == 56) { + buffer[0][0] = data[2016]; buffer[0][1] = data[2017]; buffer[0][2] = data[2018]; buffer[0][3] = data[2019]; buffer[0][4] = data[2020]; buffer[0][5] = data[2021]; buffer[0][6] = data[2022]; buffer[0][7] = data[2023]; buffer[0][8] = data[2024]; buffer[0][9] = data[2025]; buffer[0][10] = data[2026]; buffer[0][11] = data[2027]; buffer[0][12] = data[2028]; buffer[0][13] = data[2029]; buffer[0][14] = data[2030]; buffer[0][15] = data[2031]; buffer[0][16] = data[2032]; buffer[0][17] = data[2033]; buffer[0][18] = data[2034]; buffer[0][19] = data[2035]; buffer[0][20] = data[2036]; buffer[0][21] = data[2037]; buffer[0][22] = data[2038]; buffer[0][23] = data[2039]; buffer[0][24] = data[2040]; buffer[0][25] = data[2041]; buffer[0][26] = data[2042]; buffer[0][27] = data[2043]; buffer[0][28] = data[2044]; buffer[0][29] = data[2045]; buffer[0][30] = data[2046]; buffer[0][31] = data[2047]; buffer[0][32] = data[2048]; buffer[0][33] = data[2049]; buffer[0][34] = data[2050]; buffer[0][35] = data[2051]; + + } + if (partition == 57) { + buffer[0][0] = data[2052]; buffer[0][1] = data[2053]; buffer[0][2] = data[2054]; buffer[0][3] = data[2055]; buffer[0][4] = data[2056]; buffer[0][5] = data[2057]; buffer[0][6] = data[2058]; buffer[0][7] = data[2059]; buffer[0][8] = data[2060]; buffer[0][9] = data[2061]; buffer[0][10] = data[2062]; buffer[0][11] = data[2063]; buffer[0][12] = data[2064]; buffer[0][13] = data[2065]; buffer[0][14] = data[2066]; buffer[0][15] = data[2067]; buffer[0][16] = data[2068]; buffer[0][17] = data[2069]; buffer[0][18] = data[2070]; buffer[0][19] = data[2071]; buffer[0][20] = data[2072]; buffer[0][21] = data[2073]; buffer[0][22] = data[2074]; buffer[0][23] = data[2075]; buffer[0][24] = data[2076]; buffer[0][25] = data[2077]; buffer[0][26] = data[2078]; buffer[0][27] = data[2079]; buffer[0][28] = data[2080]; buffer[0][29] = data[2081]; buffer[0][30] = data[2082]; buffer[0][31] = data[2083]; buffer[0][32] = data[2084]; buffer[0][33] = data[2085]; buffer[0][34] = data[2086]; buffer[0][35] = data[2087]; + + } + if (partition == 58) { + buffer[0][0] = data[2088]; buffer[0][1] = data[2089]; buffer[0][2] = data[2090]; buffer[0][3] = data[2091]; buffer[0][4] = data[2092]; buffer[0][5] = data[2093]; buffer[0][6] = 
data[2094]; buffer[0][7] = data[2095]; buffer[0][8] = data[2096]; buffer[0][9] = data[2097]; buffer[0][10] = data[2098]; buffer[0][11] = data[2099]; buffer[0][12] = data[2100]; buffer[0][13] = data[2101]; buffer[0][14] = data[2102]; buffer[0][15] = data[2103]; buffer[0][16] = data[2104]; buffer[0][17] = data[2105]; buffer[0][18] = data[2106]; buffer[0][19] = data[2107]; buffer[0][20] = data[2108]; buffer[0][21] = data[2109]; buffer[0][22] = data[2110]; buffer[0][23] = data[2111]; buffer[0][24] = data[2112]; buffer[0][25] = data[2113]; buffer[0][26] = data[2114]; buffer[0][27] = data[2115]; buffer[0][28] = data[2116]; buffer[0][29] = data[2117]; buffer[0][30] = data[2118]; buffer[0][31] = data[2119]; buffer[0][32] = data[2120]; buffer[0][33] = data[2121]; buffer[0][34] = data[2122]; buffer[0][35] = data[2123]; + + } + if (partition == 59) { + buffer[0][0] = data[2124]; buffer[0][1] = data[2125]; buffer[0][2] = data[2126]; buffer[0][3] = data[2127]; buffer[0][4] = data[2128]; buffer[0][5] = data[2129]; buffer[0][6] = data[2130]; buffer[0][7] = data[2131]; buffer[0][8] = data[2132]; buffer[0][9] = data[2133]; buffer[0][10] = data[2134]; buffer[0][11] = data[2135]; buffer[0][12] = data[2136]; buffer[0][13] = data[2137]; buffer[0][14] = data[2138]; buffer[0][15] = data[2139]; buffer[0][16] = data[2140]; buffer[0][17] = data[2141]; buffer[0][18] = data[2142]; buffer[0][19] = data[2143]; buffer[0][20] = data[2144]; buffer[0][21] = data[2145]; buffer[0][22] = data[2146]; buffer[0][23] = data[2147]; buffer[0][24] = data[2148]; buffer[0][25] = data[2149]; buffer[0][26] = data[2150]; buffer[0][27] = data[2151]; buffer[0][28] = data[2152]; buffer[0][29] = data[2153]; buffer[0][30] = data[2154]; buffer[0][31] = data[2155]; buffer[0][32] = data[2156]; buffer[0][33] = data[2157]; buffer[0][34] = data[2158]; buffer[0][35] = data[2159]; + + } + if (partition == 60) { + buffer[0][0] = data[2160]; buffer[0][1] = data[2161]; buffer[0][2] = data[2162]; buffer[0][3] = data[2163]; buffer[0][4] = data[2164]; buffer[0][5] = data[2165]; buffer[0][6] = data[2166]; buffer[0][7] = data[2167]; buffer[0][8] = data[2168]; buffer[0][9] = data[2169]; buffer[0][10] = data[2170]; buffer[0][11] = data[2171]; buffer[0][12] = data[2172]; buffer[0][13] = data[2173]; buffer[0][14] = data[2174]; buffer[0][15] = data[2175]; buffer[0][16] = data[2176]; buffer[0][17] = data[2177]; buffer[0][18] = data[2178]; buffer[0][19] = data[2179]; buffer[0][20] = data[2180]; buffer[0][21] = data[2181]; buffer[0][22] = data[2182]; buffer[0][23] = data[2183]; buffer[0][24] = data[2184]; buffer[0][25] = data[2185]; buffer[0][26] = data[2186]; buffer[0][27] = data[2187]; buffer[0][28] = data[2188]; buffer[0][29] = data[2189]; buffer[0][30] = data[2190]; buffer[0][31] = data[2191]; buffer[0][32] = data[2192]; buffer[0][33] = data[2193]; buffer[0][34] = data[2194]; buffer[0][35] = data[2195]; + + } + if (partition == 61) { + buffer[0][0] = data[2196]; buffer[0][1] = data[2197]; buffer[0][2] = data[2198]; buffer[0][3] = data[2199]; buffer[0][4] = data[2200]; buffer[0][5] = data[2201]; buffer[0][6] = data[2202]; buffer[0][7] = data[2203]; buffer[0][8] = data[2204]; buffer[0][9] = data[2205]; buffer[0][10] = data[2206]; buffer[0][11] = data[2207]; buffer[0][12] = data[2208]; buffer[0][13] = data[2209]; buffer[0][14] = data[2210]; buffer[0][15] = data[2211]; buffer[0][16] = data[2212]; buffer[0][17] = data[2213]; buffer[0][18] = data[2214]; buffer[0][19] = data[2215]; buffer[0][20] = data[2216]; buffer[0][21] = data[2217]; buffer[0][22] = data[2218]; 
buffer[0][23] = data[2219]; buffer[0][24] = data[2220]; buffer[0][25] = data[2221]; buffer[0][26] = data[2222]; buffer[0][27] = data[2223]; buffer[0][28] = data[2224]; buffer[0][29] = data[2225]; buffer[0][30] = data[2226]; buffer[0][31] = data[2227]; buffer[0][32] = data[2228]; buffer[0][33] = data[2229]; buffer[0][34] = data[2230]; buffer[0][35] = data[2231]; + + } + if (partition == 62) { + buffer[0][0] = data[2232]; buffer[0][1] = data[2233]; buffer[0][2] = data[2234]; buffer[0][3] = data[2235]; buffer[0][4] = data[2236]; buffer[0][5] = data[2237]; buffer[0][6] = data[2238]; buffer[0][7] = data[2239]; buffer[0][8] = data[2240]; buffer[0][9] = data[2241]; buffer[0][10] = data[2242]; buffer[0][11] = data[2243]; buffer[0][12] = data[2244]; buffer[0][13] = data[2245]; buffer[0][14] = data[2246]; buffer[0][15] = data[2247]; buffer[0][16] = data[2248]; buffer[0][17] = data[2249]; buffer[0][18] = data[2250]; buffer[0][19] = data[2251]; buffer[0][20] = data[2252]; buffer[0][21] = data[2253]; buffer[0][22] = data[2254]; buffer[0][23] = data[2255]; buffer[0][24] = data[2256]; buffer[0][25] = data[2257]; buffer[0][26] = data[2258]; buffer[0][27] = data[2259]; buffer[0][28] = data[2260]; buffer[0][29] = data[2261]; buffer[0][30] = data[2262]; buffer[0][31] = data[2263]; buffer[0][32] = data[2264]; buffer[0][33] = data[2265]; buffer[0][34] = data[2266]; buffer[0][35] = data[2267]; + + } + if (partition == 63) { + buffer[0][0] = data[2268]; buffer[0][1] = data[2269]; buffer[0][2] = data[2270]; buffer[0][3] = data[2271]; buffer[0][4] = data[2272]; buffer[0][5] = data[2273]; buffer[0][6] = data[2274]; buffer[0][7] = data[2275]; buffer[0][8] = data[2276]; buffer[0][9] = data[2277]; buffer[0][10] = data[2278]; buffer[0][11] = data[2279]; buffer[0][12] = data[2280]; buffer[0][13] = data[2281]; buffer[0][14] = data[2282]; buffer[0][15] = data[2283]; buffer[0][16] = data[2284]; buffer[0][17] = data[2285]; buffer[0][18] = data[2286]; buffer[0][19] = data[2287]; buffer[0][20] = data[2288]; buffer[0][21] = data[2289]; buffer[0][22] = data[2290]; buffer[0][23] = data[2291]; buffer[0][24] = data[2292]; buffer[0][25] = data[2293]; buffer[0][26] = data[2294]; buffer[0][27] = data[2295]; buffer[0][28] = data[2296]; buffer[0][29] = data[2297]; buffer[0][30] = data[2298]; buffer[0][31] = data[2299]; buffer[0][32] = data[2300]; buffer[0][33] = data[2301]; buffer[0][34] = data[2302]; buffer[0][35] = data[2303]; + + } + if (partition == 64) { + buffer[0][0] = data[2304]; buffer[0][1] = data[2305]; buffer[0][2] = data[2306]; buffer[0][3] = data[2307]; buffer[0][4] = data[2308]; buffer[0][5] = data[2309]; buffer[0][6] = data[2310]; buffer[0][7] = data[2311]; buffer[0][8] = data[2312]; buffer[0][9] = data[2313]; buffer[0][10] = data[2314]; buffer[0][11] = data[2315]; buffer[0][12] = data[2316]; buffer[0][13] = data[2317]; buffer[0][14] = data[2318]; buffer[0][15] = data[2319]; buffer[0][16] = data[2320]; buffer[0][17] = data[2321]; buffer[0][18] = data[2322]; buffer[0][19] = data[2323]; buffer[0][20] = data[2324]; buffer[0][21] = data[2325]; buffer[0][22] = data[2326]; buffer[0][23] = data[2327]; buffer[0][24] = data[2328]; buffer[0][25] = data[2329]; buffer[0][26] = data[2330]; buffer[0][27] = data[2331]; buffer[0][28] = data[2332]; buffer[0][29] = data[2333]; buffer[0][30] = data[2334]; buffer[0][31] = data[2335]; buffer[0][32] = data[2336]; buffer[0][33] = data[2337]; buffer[0][34] = data[2338]; buffer[0][35] = data[2339]; + + } + if (partition == 65) { + buffer[0][0] = data[2340]; buffer[0][1] = data[2341]; 
buffer[0][2] = data[2342]; buffer[0][3] = data[2343]; buffer[0][4] = data[2344]; buffer[0][5] = data[2345]; buffer[0][6] = data[2346]; buffer[0][7] = data[2347]; buffer[0][8] = data[2348]; buffer[0][9] = data[2349]; buffer[0][10] = data[2350]; buffer[0][11] = data[2351]; buffer[0][12] = data[2352]; buffer[0][13] = data[2353]; buffer[0][14] = data[2354]; buffer[0][15] = data[2355]; buffer[0][16] = data[2356]; buffer[0][17] = data[2357]; buffer[0][18] = data[2358]; buffer[0][19] = data[2359]; buffer[0][20] = data[2360]; buffer[0][21] = data[2361]; buffer[0][22] = data[2362]; buffer[0][23] = data[2363]; buffer[0][24] = data[2364]; buffer[0][25] = data[2365]; buffer[0][26] = data[2366]; buffer[0][27] = data[2367]; buffer[0][28] = data[2368]; buffer[0][29] = data[2369]; buffer[0][30] = data[2370]; buffer[0][31] = data[2371]; buffer[0][32] = data[2372]; buffer[0][33] = data[2373]; buffer[0][34] = data[2374]; buffer[0][35] = data[2375]; + + } + if (partition == 66) { + buffer[0][0] = data[2376]; buffer[0][1] = data[2377]; buffer[0][2] = data[2378]; buffer[0][3] = data[2379]; buffer[0][4] = data[2380]; buffer[0][5] = data[2381]; buffer[0][6] = data[2382]; buffer[0][7] = data[2383]; buffer[0][8] = data[2384]; buffer[0][9] = data[2385]; buffer[0][10] = data[2386]; buffer[0][11] = data[2387]; buffer[0][12] = data[2388]; buffer[0][13] = data[2389]; buffer[0][14] = data[2390]; buffer[0][15] = data[2391]; buffer[0][16] = data[2392]; buffer[0][17] = data[2393]; buffer[0][18] = data[2394]; buffer[0][19] = data[2395]; buffer[0][20] = data[2396]; buffer[0][21] = data[2397]; buffer[0][22] = data[2398]; buffer[0][23] = data[2399]; buffer[0][24] = data[2400]; buffer[0][25] = data[2401]; buffer[0][26] = data[2402]; buffer[0][27] = data[2403]; buffer[0][28] = data[2404]; buffer[0][29] = data[2405]; buffer[0][30] = data[2406]; buffer[0][31] = data[2407]; buffer[0][32] = data[2408]; buffer[0][33] = data[2409]; buffer[0][34] = data[2410]; buffer[0][35] = data[2411]; + + } + if (partition == 67) { + buffer[0][0] = data[2412]; buffer[0][1] = data[2413]; buffer[0][2] = data[2414]; buffer[0][3] = data[2415]; buffer[0][4] = data[2416]; buffer[0][5] = data[2417]; buffer[0][6] = data[2418]; buffer[0][7] = data[2419]; buffer[0][8] = data[2420]; buffer[0][9] = data[2421]; buffer[0][10] = data[2422]; buffer[0][11] = data[2423]; buffer[0][12] = data[2424]; buffer[0][13] = data[2425]; buffer[0][14] = data[2426]; buffer[0][15] = data[2427]; buffer[0][16] = data[2428]; buffer[0][17] = data[2429]; buffer[0][18] = data[2430]; buffer[0][19] = data[2431]; buffer[0][20] = data[2432]; buffer[0][21] = data[2433]; buffer[0][22] = data[2434]; buffer[0][23] = data[2435]; buffer[0][24] = data[2436]; buffer[0][25] = data[2437]; buffer[0][26] = data[2438]; buffer[0][27] = data[2439]; buffer[0][28] = data[2440]; buffer[0][29] = data[2441]; buffer[0][30] = data[2442]; buffer[0][31] = data[2443]; buffer[0][32] = data[2444]; buffer[0][33] = data[2445]; buffer[0][34] = data[2446]; buffer[0][35] = data[2447]; + + } + if (partition == 68) { + buffer[0][0] = data[2448]; buffer[0][1] = data[2449]; buffer[0][2] = data[2450]; buffer[0][3] = data[2451]; buffer[0][4] = data[2452]; buffer[0][5] = data[2453]; buffer[0][6] = data[2454]; buffer[0][7] = data[2455]; buffer[0][8] = data[2456]; buffer[0][9] = data[2457]; buffer[0][10] = data[2458]; buffer[0][11] = data[2459]; buffer[0][12] = data[2460]; buffer[0][13] = data[2461]; buffer[0][14] = data[2462]; buffer[0][15] = data[2463]; buffer[0][16] = data[2464]; buffer[0][17] = data[2465]; buffer[0][18] = 
data[2466]; buffer[0][19] = data[2467]; buffer[0][20] = data[2468]; buffer[0][21] = data[2469]; buffer[0][22] = data[2470]; buffer[0][23] = data[2471]; buffer[0][24] = data[2472]; buffer[0][25] = data[2473]; buffer[0][26] = data[2474]; buffer[0][27] = data[2475]; buffer[0][28] = data[2476]; buffer[0][29] = data[2477]; buffer[0][30] = data[2478]; buffer[0][31] = data[2479]; buffer[0][32] = data[2480]; buffer[0][33] = data[2481]; buffer[0][34] = data[2482]; buffer[0][35] = data[2483]; + + } + if (partition == 69) { + buffer[0][0] = data[2484]; buffer[0][1] = data[2485]; buffer[0][2] = data[2486]; buffer[0][3] = data[2487]; buffer[0][4] = data[2488]; buffer[0][5] = data[2489]; buffer[0][6] = data[2490]; buffer[0][7] = data[2491]; buffer[0][8] = data[2492]; buffer[0][9] = data[2493]; buffer[0][10] = data[2494]; buffer[0][11] = data[2495]; buffer[0][12] = data[2496]; buffer[0][13] = data[2497]; buffer[0][14] = data[2498]; buffer[0][15] = data[2499]; buffer[0][16] = data[2500]; buffer[0][17] = data[2501]; buffer[0][18] = data[2502]; buffer[0][19] = data[2503]; buffer[0][20] = data[2504]; buffer[0][21] = data[2505]; buffer[0][22] = data[2506]; buffer[0][23] = data[2507]; buffer[0][24] = data[2508]; buffer[0][25] = data[2509]; buffer[0][26] = data[2510]; buffer[0][27] = data[2511]; buffer[0][28] = data[2512]; buffer[0][29] = data[2513]; buffer[0][30] = data[2514]; buffer[0][31] = data[2515]; buffer[0][32] = data[2516]; buffer[0][33] = data[2517]; buffer[0][34] = data[2518]; buffer[0][35] = data[2519]; + + } + if (partition == 70) { + buffer[0][0] = data[2520]; buffer[0][1] = data[2521]; buffer[0][2] = data[2522]; buffer[0][3] = data[2523]; buffer[0][4] = data[2524]; buffer[0][5] = data[2525]; buffer[0][6] = data[2526]; buffer[0][7] = data[2527]; buffer[0][8] = data[2528]; buffer[0][9] = data[2529]; buffer[0][10] = data[2530]; buffer[0][11] = data[2531]; buffer[0][12] = data[2532]; buffer[0][13] = data[2533]; buffer[0][14] = data[2534]; buffer[0][15] = data[2535]; buffer[0][16] = data[2536]; buffer[0][17] = data[2537]; buffer[0][18] = data[2538]; buffer[0][19] = data[2539]; buffer[0][20] = data[2540]; buffer[0][21] = data[2541]; buffer[0][22] = data[2542]; buffer[0][23] = data[2543]; buffer[0][24] = data[2544]; buffer[0][25] = data[2545]; buffer[0][26] = data[2546]; buffer[0][27] = data[2547]; buffer[0][28] = data[2548]; buffer[0][29] = data[2549]; buffer[0][30] = data[2550]; buffer[0][31] = data[2551]; buffer[0][32] = data[2552]; buffer[0][33] = data[2553]; buffer[0][34] = data[2554]; buffer[0][35] = data[2555]; + + } + if (partition == 71) { + buffer[0][0] = data[2556]; buffer[0][1] = data[2557]; buffer[0][2] = data[2558]; buffer[0][3] = data[2559]; buffer[0][4] = data[2560]; buffer[0][5] = data[2561]; buffer[0][6] = data[2562]; buffer[0][7] = data[2563]; buffer[0][8] = data[2564]; buffer[0][9] = data[2565]; buffer[0][10] = data[2566]; buffer[0][11] = data[2567]; buffer[0][12] = data[2568]; buffer[0][13] = data[2569]; buffer[0][14] = data[2570]; buffer[0][15] = data[2571]; buffer[0][16] = data[2572]; buffer[0][17] = data[2573]; buffer[0][18] = data[2574]; buffer[0][19] = data[2575]; buffer[0][20] = data[2576]; buffer[0][21] = data[2577]; buffer[0][22] = data[2578]; buffer[0][23] = data[2579]; buffer[0][24] = data[2580]; buffer[0][25] = data[2581]; buffer[0][26] = data[2582]; buffer[0][27] = data[2583]; buffer[0][28] = data[2584]; buffer[0][29] = data[2585]; buffer[0][30] = data[2586]; buffer[0][31] = data[2587]; buffer[0][32] = data[2588]; buffer[0][33] = data[2589]; buffer[0][34] = data[2590]; 
buffer[0][35] = data[2591]; + + } + if (partition == 72) { + buffer[0][0] = data[2592]; buffer[0][1] = data[2593]; buffer[0][2] = data[2594]; buffer[0][3] = data[2595]; buffer[0][4] = data[2596]; buffer[0][5] = data[2597]; buffer[0][6] = data[2598]; buffer[0][7] = data[2599]; buffer[0][8] = data[2600]; buffer[0][9] = data[2601]; buffer[0][10] = data[2602]; buffer[0][11] = data[2603]; buffer[0][12] = data[2604]; buffer[0][13] = data[2605]; buffer[0][14] = data[2606]; buffer[0][15] = data[2607]; buffer[0][16] = data[2608]; buffer[0][17] = data[2609]; buffer[0][18] = data[2610]; buffer[0][19] = data[2611]; buffer[0][20] = data[2612]; buffer[0][21] = data[2613]; buffer[0][22] = data[2614]; buffer[0][23] = data[2615]; buffer[0][24] = data[2616]; buffer[0][25] = data[2617]; buffer[0][26] = data[2618]; buffer[0][27] = data[2619]; buffer[0][28] = data[2620]; buffer[0][29] = data[2621]; buffer[0][30] = data[2622]; buffer[0][31] = data[2623]; buffer[0][32] = data[2624]; buffer[0][33] = data[2625]; buffer[0][34] = data[2626]; buffer[0][35] = data[2627]; + + } + if (partition == 73) { + buffer[0][0] = data[2628]; buffer[0][1] = data[2629]; buffer[0][2] = data[2630]; buffer[0][3] = data[2631]; buffer[0][4] = data[2632]; buffer[0][5] = data[2633]; buffer[0][6] = data[2634]; buffer[0][7] = data[2635]; buffer[0][8] = data[2636]; buffer[0][9] = data[2637]; buffer[0][10] = data[2638]; buffer[0][11] = data[2639]; buffer[0][12] = data[2640]; buffer[0][13] = data[2641]; buffer[0][14] = data[2642]; buffer[0][15] = data[2643]; buffer[0][16] = data[2644]; buffer[0][17] = data[2645]; buffer[0][18] = data[2646]; buffer[0][19] = data[2647]; buffer[0][20] = data[2648]; buffer[0][21] = data[2649]; buffer[0][22] = data[2650]; buffer[0][23] = data[2651]; buffer[0][24] = data[2652]; buffer[0][25] = data[2653]; buffer[0][26] = data[2654]; buffer[0][27] = data[2655]; buffer[0][28] = data[2656]; buffer[0][29] = data[2657]; buffer[0][30] = data[2658]; buffer[0][31] = data[2659]; buffer[0][32] = data[2660]; buffer[0][33] = data[2661]; buffer[0][34] = data[2662]; buffer[0][35] = data[2663]; + + } + if (partition == 74) { + buffer[0][0] = data[2664]; buffer[0][1] = data[2665]; buffer[0][2] = data[2666]; buffer[0][3] = data[2667]; buffer[0][4] = data[2668]; buffer[0][5] = data[2669]; buffer[0][6] = data[2670]; buffer[0][7] = data[2671]; buffer[0][8] = data[2672]; buffer[0][9] = data[2673]; buffer[0][10] = data[2674]; buffer[0][11] = data[2675]; buffer[0][12] = data[2676]; buffer[0][13] = data[2677]; buffer[0][14] = data[2678]; buffer[0][15] = data[2679]; buffer[0][16] = data[2680]; buffer[0][17] = data[2681]; buffer[0][18] = data[2682]; buffer[0][19] = data[2683]; buffer[0][20] = data[2684]; buffer[0][21] = data[2685]; buffer[0][22] = data[2686]; buffer[0][23] = data[2687]; buffer[0][24] = data[2688]; buffer[0][25] = data[2689]; buffer[0][26] = data[2690]; buffer[0][27] = data[2691]; buffer[0][28] = data[2692]; buffer[0][29] = data[2693]; buffer[0][30] = data[2694]; buffer[0][31] = data[2695]; buffer[0][32] = data[2696]; buffer[0][33] = data[2697]; buffer[0][34] = data[2698]; buffer[0][35] = data[2699]; + + } + if (partition == 75) { + buffer[0][0] = data[2700]; buffer[0][1] = data[2701]; buffer[0][2] = data[2702]; buffer[0][3] = data[2703]; buffer[0][4] = data[2704]; buffer[0][5] = data[2705]; buffer[0][6] = data[2706]; buffer[0][7] = data[2707]; buffer[0][8] = data[2708]; buffer[0][9] = data[2709]; buffer[0][10] = data[2710]; buffer[0][11] = data[2711]; buffer[0][12] = data[2712]; buffer[0][13] = data[2713]; buffer[0][14] = 
data[2714]; buffer[0][15] = data[2715]; buffer[0][16] = data[2716]; buffer[0][17] = data[2717]; buffer[0][18] = data[2718]; buffer[0][19] = data[2719]; buffer[0][20] = data[2720]; buffer[0][21] = data[2721]; buffer[0][22] = data[2722]; buffer[0][23] = data[2723]; buffer[0][24] = data[2724]; buffer[0][25] = data[2725]; buffer[0][26] = data[2726]; buffer[0][27] = data[2727]; buffer[0][28] = data[2728]; buffer[0][29] = data[2729]; buffer[0][30] = data[2730]; buffer[0][31] = data[2731]; buffer[0][32] = data[2732]; buffer[0][33] = data[2733]; buffer[0][34] = data[2734]; buffer[0][35] = data[2735]; + + } + if (partition == 76) { + buffer[0][0] = data[2736]; buffer[0][1] = data[2737]; buffer[0][2] = data[2738]; buffer[0][3] = data[2739]; buffer[0][4] = data[2740]; buffer[0][5] = data[2741]; buffer[0][6] = data[2742]; buffer[0][7] = data[2743]; buffer[0][8] = data[2744]; buffer[0][9] = data[2745]; buffer[0][10] = data[2746]; buffer[0][11] = data[2747]; buffer[0][12] = data[2748]; buffer[0][13] = data[2749]; buffer[0][14] = data[2750]; buffer[0][15] = data[2751]; buffer[0][16] = data[2752]; buffer[0][17] = data[2753]; buffer[0][18] = data[2754]; buffer[0][19] = data[2755]; buffer[0][20] = data[2756]; buffer[0][21] = data[2757]; buffer[0][22] = data[2758]; buffer[0][23] = data[2759]; buffer[0][24] = data[2760]; buffer[0][25] = data[2761]; buffer[0][26] = data[2762]; buffer[0][27] = data[2763]; buffer[0][28] = data[2764]; buffer[0][29] = data[2765]; buffer[0][30] = data[2766]; buffer[0][31] = data[2767]; buffer[0][32] = data[2768]; buffer[0][33] = data[2769]; buffer[0][34] = data[2770]; buffer[0][35] = data[2771]; + + } + if (partition == 77) { + buffer[0][0] = data[2772]; buffer[0][1] = data[2773]; buffer[0][2] = data[2774]; buffer[0][3] = data[2775]; buffer[0][4] = data[2776]; buffer[0][5] = data[2777]; buffer[0][6] = data[2778]; buffer[0][7] = data[2779]; buffer[0][8] = data[2780]; buffer[0][9] = data[2781]; buffer[0][10] = data[2782]; buffer[0][11] = data[2783]; buffer[0][12] = data[2784]; buffer[0][13] = data[2785]; buffer[0][14] = data[2786]; buffer[0][15] = data[2787]; buffer[0][16] = data[2788]; buffer[0][17] = data[2789]; buffer[0][18] = data[2790]; buffer[0][19] = data[2791]; buffer[0][20] = data[2792]; buffer[0][21] = data[2793]; buffer[0][22] = data[2794]; buffer[0][23] = data[2795]; buffer[0][24] = data[2796]; buffer[0][25] = data[2797]; buffer[0][26] = data[2798]; buffer[0][27] = data[2799]; buffer[0][28] = data[2800]; buffer[0][29] = data[2801]; buffer[0][30] = data[2802]; buffer[0][31] = data[2803]; buffer[0][32] = data[2804]; buffer[0][33] = data[2805]; buffer[0][34] = data[2806]; buffer[0][35] = data[2807]; + + } + if (partition == 78) { + buffer[0][0] = data[2808]; buffer[0][1] = data[2809]; buffer[0][2] = data[2810]; buffer[0][3] = data[2811]; buffer[0][4] = data[2812]; buffer[0][5] = data[2813]; buffer[0][6] = data[2814]; buffer[0][7] = data[2815]; buffer[0][8] = data[2816]; buffer[0][9] = data[2817]; buffer[0][10] = data[2818]; buffer[0][11] = data[2819]; buffer[0][12] = data[2820]; buffer[0][13] = data[2821]; buffer[0][14] = data[2822]; buffer[0][15] = data[2823]; buffer[0][16] = data[2824]; buffer[0][17] = data[2825]; buffer[0][18] = data[2826]; buffer[0][19] = data[2827]; buffer[0][20] = data[2828]; buffer[0][21] = data[2829]; buffer[0][22] = data[2830]; buffer[0][23] = data[2831]; buffer[0][24] = data[2832]; buffer[0][25] = data[2833]; buffer[0][26] = data[2834]; buffer[0][27] = data[2835]; buffer[0][28] = data[2836]; buffer[0][29] = data[2837]; buffer[0][30] = data[2838]; 
buffer[0][31] = data[2839]; buffer[0][32] = data[2840]; buffer[0][33] = data[2841]; buffer[0][34] = data[2842]; buffer[0][35] = data[2843]; + + } + if (partition == 79) { + buffer[0][0] = data[2844]; buffer[0][1] = data[2845]; buffer[0][2] = data[2846]; buffer[0][3] = data[2847]; buffer[0][4] = data[2848]; buffer[0][5] = data[2849]; buffer[0][6] = data[2850]; buffer[0][7] = data[2851]; buffer[0][8] = data[2852]; buffer[0][9] = data[2853]; buffer[0][10] = data[2854]; buffer[0][11] = data[2855]; buffer[0][12] = data[2856]; buffer[0][13] = data[2857]; buffer[0][14] = data[2858]; buffer[0][15] = data[2859]; buffer[0][16] = data[2860]; buffer[0][17] = data[2861]; buffer[0][18] = data[2862]; buffer[0][19] = data[2863]; buffer[0][20] = data[2864]; buffer[0][21] = data[2865]; buffer[0][22] = data[2866]; buffer[0][23] = data[2867]; buffer[0][24] = data[2868]; buffer[0][25] = data[2869]; buffer[0][26] = data[2870]; buffer[0][27] = data[2871]; buffer[0][28] = data[2872]; buffer[0][29] = data[2873]; buffer[0][30] = data[2874]; buffer[0][31] = data[2875]; buffer[0][32] = data[2876]; buffer[0][33] = data[2877]; buffer[0][34] = data[2878]; buffer[0][35] = data[2879]; + + } + if (partition == 80) { + buffer[0][0] = data[2880]; buffer[0][1] = data[2881]; buffer[0][2] = data[2882]; buffer[0][3] = data[2883]; buffer[0][4] = data[2884]; buffer[0][5] = data[2885]; buffer[0][6] = data[2886]; buffer[0][7] = data[2887]; buffer[0][8] = data[2888]; buffer[0][9] = data[2889]; buffer[0][10] = data[2890]; buffer[0][11] = data[2891]; buffer[0][12] = data[2892]; buffer[0][13] = data[2893]; buffer[0][14] = data[2894]; buffer[0][15] = data[2895]; buffer[0][16] = data[2896]; buffer[0][17] = data[2897]; buffer[0][18] = data[2898]; buffer[0][19] = data[2899]; buffer[0][20] = data[2900]; buffer[0][21] = data[2901]; buffer[0][22] = data[2902]; buffer[0][23] = data[2903]; buffer[0][24] = data[2904]; buffer[0][25] = data[2905]; buffer[0][26] = data[2906]; buffer[0][27] = data[2907]; buffer[0][28] = data[2908]; buffer[0][29] = data[2909]; buffer[0][30] = data[2910]; buffer[0][31] = data[2911]; buffer[0][32] = data[2912]; buffer[0][33] = data[2913]; buffer[0][34] = data[2914]; buffer[0][35] = data[2915]; + + } + if (partition == 81) { + buffer[0][0] = data[2916]; buffer[0][1] = data[2917]; buffer[0][2] = data[2918]; buffer[0][3] = data[2919]; buffer[0][4] = data[2920]; buffer[0][5] = data[2921]; buffer[0][6] = data[2922]; buffer[0][7] = data[2923]; buffer[0][8] = data[2924]; buffer[0][9] = data[2925]; buffer[0][10] = data[2926]; buffer[0][11] = data[2927]; buffer[0][12] = data[2928]; buffer[0][13] = data[2929]; buffer[0][14] = data[2930]; buffer[0][15] = data[2931]; buffer[0][16] = data[2932]; buffer[0][17] = data[2933]; buffer[0][18] = data[2934]; buffer[0][19] = data[2935]; buffer[0][20] = data[2936]; buffer[0][21] = data[2937]; buffer[0][22] = data[2938]; buffer[0][23] = data[2939]; buffer[0][24] = data[2940]; buffer[0][25] = data[2941]; buffer[0][26] = data[2942]; buffer[0][27] = data[2943]; buffer[0][28] = data[2944]; buffer[0][29] = data[2945]; buffer[0][30] = data[2946]; buffer[0][31] = data[2947]; buffer[0][32] = data[2948]; buffer[0][33] = data[2949]; buffer[0][34] = data[2950]; buffer[0][35] = data[2951]; + + } + if (partition == 82) { + buffer[0][0] = data[2952]; buffer[0][1] = data[2953]; buffer[0][2] = data[2954]; buffer[0][3] = data[2955]; buffer[0][4] = data[2956]; buffer[0][5] = data[2957]; buffer[0][6] = data[2958]; buffer[0][7] = data[2959]; buffer[0][8] = data[2960]; buffer[0][9] = data[2961]; buffer[0][10] = 
data[2962]; buffer[0][11] = data[2963]; buffer[0][12] = data[2964]; buffer[0][13] = data[2965]; buffer[0][14] = data[2966]; buffer[0][15] = data[2967]; buffer[0][16] = data[2968]; buffer[0][17] = data[2969]; buffer[0][18] = data[2970]; buffer[0][19] = data[2971]; buffer[0][20] = data[2972]; buffer[0][21] = data[2973]; buffer[0][22] = data[2974]; buffer[0][23] = data[2975]; buffer[0][24] = data[2976]; buffer[0][25] = data[2977]; buffer[0][26] = data[2978]; buffer[0][27] = data[2979]; buffer[0][28] = data[2980]; buffer[0][29] = data[2981]; buffer[0][30] = data[2982]; buffer[0][31] = data[2983]; buffer[0][32] = data[2984]; buffer[0][33] = data[2985]; buffer[0][34] = data[2986]; buffer[0][35] = data[2987]; + + } + if (partition == 83) { + buffer[0][0] = data[2988]; buffer[0][1] = data[2989]; buffer[0][2] = data[2990]; buffer[0][3] = data[2991]; buffer[0][4] = data[2992]; buffer[0][5] = data[2993]; buffer[0][6] = data[2994]; buffer[0][7] = data[2995]; buffer[0][8] = data[2996]; buffer[0][9] = data[2997]; buffer[0][10] = data[2998]; buffer[0][11] = data[2999]; buffer[0][12] = data[3000]; buffer[0][13] = data[3001]; buffer[0][14] = data[3002]; buffer[0][15] = data[3003]; buffer[0][16] = data[3004]; buffer[0][17] = data[3005]; buffer[0][18] = data[3006]; buffer[0][19] = data[3007]; buffer[0][20] = data[3008]; buffer[0][21] = data[3009]; buffer[0][22] = data[3010]; buffer[0][23] = data[3011]; buffer[0][24] = data[3012]; buffer[0][25] = data[3013]; buffer[0][26] = data[3014]; buffer[0][27] = data[3015]; buffer[0][28] = data[3016]; buffer[0][29] = data[3017]; buffer[0][30] = data[3018]; buffer[0][31] = data[3019]; buffer[0][32] = data[3020]; buffer[0][33] = data[3021]; buffer[0][34] = data[3022]; buffer[0][35] = data[3023]; + + } + if (partition == 84) { + buffer[0][0] = data[3024]; buffer[0][1] = data[3025]; buffer[0][2] = data[3026]; buffer[0][3] = data[3027]; buffer[0][4] = data[3028]; buffer[0][5] = data[3029]; buffer[0][6] = data[3030]; buffer[0][7] = data[3031]; buffer[0][8] = data[3032]; buffer[0][9] = data[3033]; buffer[0][10] = data[3034]; buffer[0][11] = data[3035]; buffer[0][12] = data[3036]; buffer[0][13] = data[3037]; buffer[0][14] = data[3038]; buffer[0][15] = data[3039]; buffer[0][16] = data[3040]; buffer[0][17] = data[3041]; buffer[0][18] = data[3042]; buffer[0][19] = data[3043]; buffer[0][20] = data[3044]; buffer[0][21] = data[3045]; buffer[0][22] = data[3046]; buffer[0][23] = data[3047]; buffer[0][24] = data[3048]; buffer[0][25] = data[3049]; buffer[0][26] = data[3050]; buffer[0][27] = data[3051]; buffer[0][28] = data[3052]; buffer[0][29] = data[3053]; buffer[0][30] = data[3054]; buffer[0][31] = data[3055]; buffer[0][32] = data[3056]; buffer[0][33] = data[3057]; buffer[0][34] = data[3058]; buffer[0][35] = data[3059]; + + } + if (partition == 85) { + buffer[0][0] = data[3060]; buffer[0][1] = data[3061]; buffer[0][2] = data[3062]; buffer[0][3] = data[3063]; buffer[0][4] = data[3064]; buffer[0][5] = data[3065]; buffer[0][6] = data[3066]; buffer[0][7] = data[3067]; buffer[0][8] = data[3068]; buffer[0][9] = data[3069]; buffer[0][10] = data[3070]; buffer[0][11] = data[3071]; buffer[0][12] = data[3072]; buffer[0][13] = data[3073]; buffer[0][14] = data[3074]; buffer[0][15] = data[3075]; buffer[0][16] = data[3076]; buffer[0][17] = data[3077]; buffer[0][18] = data[3078]; buffer[0][19] = data[3079]; buffer[0][20] = data[3080]; buffer[0][21] = data[3081]; buffer[0][22] = data[3082]; buffer[0][23] = data[3083]; buffer[0][24] = data[3084]; buffer[0][25] = data[3085]; buffer[0][26] = data[3086]; 
buffer[0][27] = data[3087]; buffer[0][28] = data[3088]; buffer[0][29] = data[3089]; buffer[0][30] = data[3090]; buffer[0][31] = data[3091]; buffer[0][32] = data[3092]; buffer[0][33] = data[3093]; buffer[0][34] = data[3094]; buffer[0][35] = data[3095]; + + } + if (partition == 86) { + buffer[0][0] = data[3096]; buffer[0][1] = data[3097]; buffer[0][2] = data[3098]; buffer[0][3] = data[3099]; buffer[0][4] = data[3100]; buffer[0][5] = data[3101]; buffer[0][6] = data[3102]; buffer[0][7] = data[3103]; buffer[0][8] = data[3104]; buffer[0][9] = data[3105]; buffer[0][10] = data[3106]; buffer[0][11] = data[3107]; buffer[0][12] = data[3108]; buffer[0][13] = data[3109]; buffer[0][14] = data[3110]; buffer[0][15] = data[3111]; buffer[0][16] = data[3112]; buffer[0][17] = data[3113]; buffer[0][18] = data[3114]; buffer[0][19] = data[3115]; buffer[0][20] = data[3116]; buffer[0][21] = data[3117]; buffer[0][22] = data[3118]; buffer[0][23] = data[3119]; buffer[0][24] = data[3120]; buffer[0][25] = data[3121]; buffer[0][26] = data[3122]; buffer[0][27] = data[3123]; buffer[0][28] = data[3124]; buffer[0][29] = data[3125]; buffer[0][30] = data[3126]; buffer[0][31] = data[3127]; buffer[0][32] = data[3128]; buffer[0][33] = data[3129]; buffer[0][34] = data[3130]; buffer[0][35] = data[3131]; + + } + if (partition == 87) { + buffer[0][0] = data[3132]; buffer[0][1] = data[3133]; buffer[0][2] = data[3134]; buffer[0][3] = data[3135]; buffer[0][4] = data[3136]; buffer[0][5] = data[3137]; buffer[0][6] = data[3138]; buffer[0][7] = data[3139]; buffer[0][8] = data[3140]; buffer[0][9] = data[3141]; buffer[0][10] = data[3142]; buffer[0][11] = data[3143]; buffer[0][12] = data[3144]; buffer[0][13] = data[3145]; buffer[0][14] = data[3146]; buffer[0][15] = data[3147]; buffer[0][16] = data[3148]; buffer[0][17] = data[3149]; buffer[0][18] = data[3150]; buffer[0][19] = data[3151]; buffer[0][20] = data[3152]; buffer[0][21] = data[3153]; buffer[0][22] = data[3154]; buffer[0][23] = data[3155]; buffer[0][24] = data[3156]; buffer[0][25] = data[3157]; buffer[0][26] = data[3158]; buffer[0][27] = data[3159]; buffer[0][28] = data[3160]; buffer[0][29] = data[3161]; buffer[0][30] = data[3162]; buffer[0][31] = data[3163]; buffer[0][32] = data[3164]; buffer[0][33] = data[3165]; buffer[0][34] = data[3166]; buffer[0][35] = data[3167]; + + } + if (partition == 88) { + buffer[0][0] = data[3168]; buffer[0][1] = data[3169]; buffer[0][2] = data[3170]; buffer[0][3] = data[3171]; buffer[0][4] = data[3172]; buffer[0][5] = data[3173]; buffer[0][6] = data[3174]; buffer[0][7] = data[3175]; buffer[0][8] = data[3176]; buffer[0][9] = data[3177]; buffer[0][10] = data[3178]; buffer[0][11] = data[3179]; buffer[0][12] = data[3180]; buffer[0][13] = data[3181]; buffer[0][14] = data[3182]; buffer[0][15] = data[3183]; buffer[0][16] = data[3184]; buffer[0][17] = data[3185]; buffer[0][18] = data[3186]; buffer[0][19] = data[3187]; buffer[0][20] = data[3188]; buffer[0][21] = data[3189]; buffer[0][22] = data[3190]; buffer[0][23] = data[3191]; buffer[0][24] = data[3192]; buffer[0][25] = data[3193]; buffer[0][26] = data[3194]; buffer[0][27] = data[3195]; buffer[0][28] = data[3196]; buffer[0][29] = data[3197]; buffer[0][30] = data[3198]; buffer[0][31] = data[3199]; buffer[0][32] = data[3200]; buffer[0][33] = data[3201]; buffer[0][34] = data[3202]; buffer[0][35] = data[3203]; + + } + if (partition == 89) { + buffer[0][0] = data[3204]; buffer[0][1] = data[3205]; buffer[0][2] = data[3206]; buffer[0][3] = data[3207]; buffer[0][4] = data[3208]; buffer[0][5] = data[3209]; buffer[0][6] = 
data[3210]; buffer[0][7] = data[3211]; buffer[0][8] = data[3212]; buffer[0][9] = data[3213]; buffer[0][10] = data[3214]; buffer[0][11] = data[3215]; buffer[0][12] = data[3216]; buffer[0][13] = data[3217]; buffer[0][14] = data[3218]; buffer[0][15] = data[3219]; buffer[0][16] = data[3220]; buffer[0][17] = data[3221]; buffer[0][18] = data[3222]; buffer[0][19] = data[3223]; buffer[0][20] = data[3224]; buffer[0][21] = data[3225]; buffer[0][22] = data[3226]; buffer[0][23] = data[3227]; buffer[0][24] = data[3228]; buffer[0][25] = data[3229]; buffer[0][26] = data[3230]; buffer[0][27] = data[3231]; buffer[0][28] = data[3232]; buffer[0][29] = data[3233]; buffer[0][30] = data[3234]; buffer[0][31] = data[3235]; buffer[0][32] = data[3236]; buffer[0][33] = data[3237]; buffer[0][34] = data[3238]; buffer[0][35] = data[3239]; + + } + if (partition == 90) { + buffer[0][0] = data[3240]; buffer[0][1] = data[3241]; buffer[0][2] = data[3242]; buffer[0][3] = data[3243]; buffer[0][4] = data[3244]; buffer[0][5] = data[3245]; buffer[0][6] = data[3246]; buffer[0][7] = data[3247]; buffer[0][8] = data[3248]; buffer[0][9] = data[3249]; buffer[0][10] = data[3250]; buffer[0][11] = data[3251]; buffer[0][12] = data[3252]; buffer[0][13] = data[3253]; buffer[0][14] = data[3254]; buffer[0][15] = data[3255]; buffer[0][16] = data[3256]; buffer[0][17] = data[3257]; buffer[0][18] = data[3258]; buffer[0][19] = data[3259]; buffer[0][20] = data[3260]; buffer[0][21] = data[3261]; buffer[0][22] = data[3262]; buffer[0][23] = data[3263]; buffer[0][24] = data[3264]; buffer[0][25] = data[3265]; buffer[0][26] = data[3266]; buffer[0][27] = data[3267]; buffer[0][28] = data[3268]; buffer[0][29] = data[3269]; buffer[0][30] = data[3270]; buffer[0][31] = data[3271]; buffer[0][32] = data[3272]; buffer[0][33] = data[3273]; buffer[0][34] = data[3274]; buffer[0][35] = data[3275]; + + } + if (partition == 91) { + buffer[0][0] = data[3276]; buffer[0][1] = data[3277]; buffer[0][2] = data[3278]; buffer[0][3] = data[3279]; buffer[0][4] = data[3280]; buffer[0][5] = data[3281]; buffer[0][6] = data[3282]; buffer[0][7] = data[3283]; buffer[0][8] = data[3284]; buffer[0][9] = data[3285]; buffer[0][10] = data[3286]; buffer[0][11] = data[3287]; buffer[0][12] = data[3288]; buffer[0][13] = data[3289]; buffer[0][14] = data[3290]; buffer[0][15] = data[3291]; buffer[0][16] = data[3292]; buffer[0][17] = data[3293]; buffer[0][18] = data[3294]; buffer[0][19] = data[3295]; buffer[0][20] = data[3296]; buffer[0][21] = data[3297]; buffer[0][22] = data[3298]; buffer[0][23] = data[3299]; buffer[0][24] = data[3300]; buffer[0][25] = data[3301]; buffer[0][26] = data[3302]; buffer[0][27] = data[3303]; buffer[0][28] = data[3304]; buffer[0][29] = data[3305]; buffer[0][30] = data[3306]; buffer[0][31] = data[3307]; buffer[0][32] = data[3308]; buffer[0][33] = data[3309]; buffer[0][34] = data[3310]; buffer[0][35] = data[3311]; + + } + if (partition == 92) { + buffer[0][0] = data[3312]; buffer[0][1] = data[3313]; buffer[0][2] = data[3314]; buffer[0][3] = data[3315]; buffer[0][4] = data[3316]; buffer[0][5] = data[3317]; buffer[0][6] = data[3318]; buffer[0][7] = data[3319]; buffer[0][8] = data[3320]; buffer[0][9] = data[3321]; buffer[0][10] = data[3322]; buffer[0][11] = data[3323]; buffer[0][12] = data[3324]; buffer[0][13] = data[3325]; buffer[0][14] = data[3326]; buffer[0][15] = data[3327]; buffer[0][16] = data[3328]; buffer[0][17] = data[3329]; buffer[0][18] = data[3330]; buffer[0][19] = data[3331]; buffer[0][20] = data[3332]; buffer[0][21] = data[3333]; buffer[0][22] = data[3334]; 
buffer[0][23] = data[3335]; buffer[0][24] = data[3336]; buffer[0][25] = data[3337]; buffer[0][26] = data[3338]; buffer[0][27] = data[3339]; buffer[0][28] = data[3340]; buffer[0][29] = data[3341]; buffer[0][30] = data[3342]; buffer[0][31] = data[3343]; buffer[0][32] = data[3344]; buffer[0][33] = data[3345]; buffer[0][34] = data[3346]; buffer[0][35] = data[3347]; + + } + if (partition == 93) { + buffer[0][0] = data[3348]; buffer[0][1] = data[3349]; buffer[0][2] = data[3350]; buffer[0][3] = data[3351]; buffer[0][4] = data[3352]; buffer[0][5] = data[3353]; buffer[0][6] = data[3354]; buffer[0][7] = data[3355]; buffer[0][8] = data[3356]; buffer[0][9] = data[3357]; buffer[0][10] = data[3358]; buffer[0][11] = data[3359]; buffer[0][12] = data[3360]; buffer[0][13] = data[3361]; buffer[0][14] = data[3362]; buffer[0][15] = data[3363]; buffer[0][16] = data[3364]; buffer[0][17] = data[3365]; buffer[0][18] = data[3366]; buffer[0][19] = data[3367]; buffer[0][20] = data[3368]; buffer[0][21] = data[3369]; buffer[0][22] = data[3370]; buffer[0][23] = data[3371]; buffer[0][24] = data[3372]; buffer[0][25] = data[3373]; buffer[0][26] = data[3374]; buffer[0][27] = data[3375]; buffer[0][28] = data[3376]; buffer[0][29] = data[3377]; buffer[0][30] = data[3378]; buffer[0][31] = data[3379]; buffer[0][32] = data[3380]; buffer[0][33] = data[3381]; buffer[0][34] = data[3382]; buffer[0][35] = data[3383]; + + } + if (partition == 94) { + buffer[0][0] = data[3384]; buffer[0][1] = data[3385]; buffer[0][2] = data[3386]; buffer[0][3] = data[3387]; buffer[0][4] = data[3388]; buffer[0][5] = data[3389]; buffer[0][6] = data[3390]; buffer[0][7] = data[3391]; buffer[0][8] = data[3392]; buffer[0][9] = data[3393]; buffer[0][10] = data[3394]; buffer[0][11] = data[3395]; buffer[0][12] = data[3396]; buffer[0][13] = data[3397]; buffer[0][14] = data[3398]; buffer[0][15] = data[3399]; buffer[0][16] = data[3400]; buffer[0][17] = data[3401]; buffer[0][18] = data[3402]; buffer[0][19] = data[3403]; buffer[0][20] = data[3404]; buffer[0][21] = data[3405]; buffer[0][22] = data[3406]; buffer[0][23] = data[3407]; buffer[0][24] = data[3408]; buffer[0][25] = data[3409]; buffer[0][26] = data[3410]; buffer[0][27] = data[3411]; buffer[0][28] = data[3412]; buffer[0][29] = data[3413]; buffer[0][30] = data[3414]; buffer[0][31] = data[3415]; buffer[0][32] = data[3416]; buffer[0][33] = data[3417]; buffer[0][34] = data[3418]; buffer[0][35] = data[3419]; + + } + if (partition == 95) { + buffer[0][0] = data[3420]; buffer[0][1] = data[3421]; buffer[0][2] = data[3422]; buffer[0][3] = data[3423]; buffer[0][4] = data[3424]; buffer[0][5] = data[3425]; buffer[0][6] = data[3426]; buffer[0][7] = data[3427]; buffer[0][8] = data[3428]; buffer[0][9] = data[3429]; buffer[0][10] = data[3430]; buffer[0][11] = data[3431]; buffer[0][12] = data[3432]; buffer[0][13] = data[3433]; buffer[0][14] = data[3434]; buffer[0][15] = data[3435]; buffer[0][16] = data[3436]; buffer[0][17] = data[3437]; buffer[0][18] = data[3438]; buffer[0][19] = data[3439]; buffer[0][20] = data[3440]; buffer[0][21] = data[3441]; buffer[0][22] = data[3442]; buffer[0][23] = data[3443]; buffer[0][24] = data[3444]; buffer[0][25] = data[3445]; buffer[0][26] = data[3446]; buffer[0][27] = data[3447]; buffer[0][28] = data[3448]; buffer[0][29] = data[3449]; buffer[0][30] = data[3450]; buffer[0][31] = data[3451]; buffer[0][32] = data[3452]; buffer[0][33] = data[3453]; buffer[0][34] = data[3454]; buffer[0][35] = data[3455]; + + } + if (partition == 96) { + buffer[0][0] = data[3456]; buffer[0][1] = data[3457]; 
buffer[0][2] = data[3458]; buffer[0][3] = data[3459]; buffer[0][4] = data[3460]; buffer[0][5] = data[3461]; buffer[0][6] = data[3462]; buffer[0][7] = data[3463]; buffer[0][8] = data[3464]; buffer[0][9] = data[3465]; buffer[0][10] = data[3466]; buffer[0][11] = data[3467]; buffer[0][12] = data[3468]; buffer[0][13] = data[3469]; buffer[0][14] = data[3470]; buffer[0][15] = data[3471]; buffer[0][16] = data[3472]; buffer[0][17] = data[3473]; buffer[0][18] = data[3474]; buffer[0][19] = data[3475]; buffer[0][20] = data[3476]; buffer[0][21] = data[3477]; buffer[0][22] = data[3478]; buffer[0][23] = data[3479]; buffer[0][24] = data[3480]; buffer[0][25] = data[3481]; buffer[0][26] = data[3482]; buffer[0][27] = data[3483]; buffer[0][28] = data[3484]; buffer[0][29] = data[3485]; buffer[0][30] = data[3486]; buffer[0][31] = data[3487]; buffer[0][32] = data[3488]; buffer[0][33] = data[3489]; buffer[0][34] = data[3490]; buffer[0][35] = data[3491]; + + } + if (partition == 97) { + buffer[0][0] = data[3492]; buffer[0][1] = data[3493]; buffer[0][2] = data[3494]; buffer[0][3] = data[3495]; buffer[0][4] = data[3496]; buffer[0][5] = data[3497]; buffer[0][6] = data[3498]; buffer[0][7] = data[3499]; buffer[0][8] = data[3500]; buffer[0][9] = data[3501]; buffer[0][10] = data[3502]; buffer[0][11] = data[3503]; buffer[0][12] = data[3504]; buffer[0][13] = data[3505]; buffer[0][14] = data[3506]; buffer[0][15] = data[3507]; buffer[0][16] = data[3508]; buffer[0][17] = data[3509]; buffer[0][18] = data[3510]; buffer[0][19] = data[3511]; buffer[0][20] = data[3512]; buffer[0][21] = data[3513]; buffer[0][22] = data[3514]; buffer[0][23] = data[3515]; buffer[0][24] = data[3516]; buffer[0][25] = data[3517]; buffer[0][26] = data[3518]; buffer[0][27] = data[3519]; buffer[0][28] = data[3520]; buffer[0][29] = data[3521]; buffer[0][30] = data[3522]; buffer[0][31] = data[3523]; buffer[0][32] = data[3524]; buffer[0][33] = data[3525]; buffer[0][34] = data[3526]; buffer[0][35] = data[3527]; + + } + if (partition == 98) { + buffer[0][0] = data[3528]; buffer[0][1] = data[3529]; buffer[0][2] = data[3530]; buffer[0][3] = data[3531]; buffer[0][4] = data[3532]; buffer[0][5] = data[3533]; buffer[0][6] = data[3534]; buffer[0][7] = data[3535]; buffer[0][8] = data[3536]; buffer[0][9] = data[3537]; buffer[0][10] = data[3538]; buffer[0][11] = data[3539]; buffer[0][12] = data[3540]; buffer[0][13] = data[3541]; buffer[0][14] = data[3542]; buffer[0][15] = data[3543]; buffer[0][16] = data[3544]; buffer[0][17] = data[3545]; buffer[0][18] = data[3546]; buffer[0][19] = data[3547]; buffer[0][20] = data[3548]; buffer[0][21] = data[3549]; buffer[0][22] = data[3550]; buffer[0][23] = data[3551]; buffer[0][24] = data[3552]; buffer[0][25] = data[3553]; buffer[0][26] = data[3554]; buffer[0][27] = data[3555]; buffer[0][28] = data[3556]; buffer[0][29] = data[3557]; buffer[0][30] = data[3558]; buffer[0][31] = data[3559]; buffer[0][32] = data[3560]; buffer[0][33] = data[3561]; buffer[0][34] = data[3562]; buffer[0][35] = data[3563]; + + } + if (partition == 99) { + buffer[0][0] = data[3564]; buffer[0][1] = data[3565]; buffer[0][2] = data[3566]; buffer[0][3] = data[3567]; buffer[0][4] = data[3568]; buffer[0][5] = data[3569]; buffer[0][6] = data[3570]; buffer[0][7] = data[3571]; buffer[0][8] = data[3572]; buffer[0][9] = data[3573]; buffer[0][10] = data[3574]; buffer[0][11] = data[3575]; buffer[0][12] = data[3576]; buffer[0][13] = data[3577]; buffer[0][14] = data[3578]; buffer[0][15] = data[3579]; buffer[0][16] = data[3580]; buffer[0][17] = data[3581]; buffer[0][18] = 
data[3582]; buffer[0][19] = data[3583]; buffer[0][20] = data[3584]; buffer[0][21] = data[3585]; buffer[0][22] = data[3586]; buffer[0][23] = data[3587]; buffer[0][24] = data[3588]; buffer[0][25] = data[3589]; buffer[0][26] = data[3590]; buffer[0][27] = data[3591]; buffer[0][28] = data[3592]; buffer[0][29] = data[3593]; buffer[0][30] = data[3594]; buffer[0][31] = data[3595]; buffer[0][32] = data[3596]; buffer[0][33] = data[3597]; buffer[0][34] = data[3598]; buffer[0][35] = data[3599];
+
+        }
+    }
+};
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_common.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_common.h
new file mode 100644
index 00000000..e942a1dc
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_common.h
@@ -0,0 +1,76 @@
+#ifndef NNET_COMMON_H_
+#define NNET_COMMON_H_
+
+#include "ap_fixed.h"
+
+// This is a substitute for "ceil(n/(float)d)".
+#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
+#define MIN(n, d) (n > d ? d : n)
+#define MAX(n, d) (n > d ? n : d)
+
+#define STRINGIFY(x) #x
+#define EXPAND_STRING(x) STRINGIFY(x)
+
+#ifndef __VITIS_HLS__
+#define DATA_PACK_TXT HLS DATA_PACK variable =
+#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable
+#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable)))
+#else
+#define PRAGMA_DATA_PACK(variable)
+#endif
+
+namespace nnet {
+
+// Common type definitions
+enum io_type { io_parallel = 0, io_stream };
+enum strategy { latency, resource };
+enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 };
+
+/* ---
+ * Balanced tree reduce implementation.
+ * For use in scenarios where Vivado cannot expression balance.
+ * Reduces an array of inputs to a single value using the template binary operator 'Op',
+ * for example summing all elements with Op_add, or finding the maximum with Op_max.
+ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
+ * before applying and accumulate the result over the rolled dimension.
+ * --- */
+template <class T, int N, class Op> T reduce(const T *x, Op op) {
+    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
+    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
+    if (N == 1) {
+        return x[0];
+    }
+    if (N == 2) {
+        return op(x[0], x[1]);
+    }
+    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
+}
+
+template <class T> class Op_add {
+  public:
+    T operator()(T a, T b) { return a + b; }
+};
+
+template <class T> class Op_and {
+  public:
+    T operator()(T a, T b) { return a && b; }
+};
+
+template <class T> class Op_or {
+  public:
+    T operator()(T a, T b) { return a || b; }
+};
+
+template <class T> class Op_max {
+  public:
+    T operator()(T a, T b) { return a >= b ? a : b; }
+};
+
+template <class T> class Op_min {
+  public:
+    T operator()(T a, T b) { return a <= b ? a : b; }
+};
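+// [Editor's illustrative sketch, not part of the generated header.] reduce() builds a
+// balanced binary tree, so summing a fully partitioned 8-element array costs 3 adder
+// levels instead of a 7-deep accumulation chain. Types and sizes here are hypothetical:
+//
+//     ap_fixed<32, 16> partial[8];
+//     #pragma HLS ARRAY_PARTITION variable=partial complete
+//     Op_add<ap_fixed<32, 16>> op_add;
+//     ap_fixed<32, 16> total = reduce<ap_fixed<32, 16>, 8, Op_add<ap_fixed<32, 16>>>(partial, op_add);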
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d.h
new file mode 100644
index 00000000..0f2e89ac
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d.h
@@ -0,0 +1,76 @@
+#ifndef NNET_CONV1D_H_
+#define NNET_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_latency.h"
+#include "nnet_conv1d_resource.h"
+#include <cstdlib>
+
+namespace nnet {
+
+struct conv1d_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+
+    // Convolutional parameters
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const unsigned in_width = 10;
+    static const unsigned n_chan = 0;
+    static const unsigned filt_width = 1;
+    static const unsigned kernel_size = filt_width;
+    static const unsigned n_filt = 1;
+    static const unsigned stride_width = 1;
+    static const unsigned dilation = 1;
+    static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1
+
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0; // not used yet
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    #pragma HLS INLINE region
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+
+    #pragma HLS INLINE region
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        if (CONFIG_T::implementation == conv_implementation::pointwise) {
+            // Use pointwise unrolled implementation
+            if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) {
+                pointwise_conv_1d_latency_cl_split_by_rf<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+            } else {
+                assert(CONFIG_T::reuse_factor == 1);
+                pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+            }
+        } else {
+            // Use standard unrolled implementation
+            conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        }
+    } else {
+        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+}
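+// [Editor's note, illustrative only.] The dispatch above is resolved at compile time by
+// the generated layer config; hypothetical values sketching the split-by-rf path:
+//
+//     struct config_pw : nnet::conv1d_config {
+//         static const unsigned reuse_factor = 4; // 1 < rf <= 120 selects the _split_by_rf variant
+//         static const nnet::conv_implementation implementation = nnet::conv_implementation::pointwise;
+//     };
+//
+// With reuse_factor == 1 the plain unrolled pointwise_conv_1d_latency_cl is called instead.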
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_latency.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_latency.h
new file mode 100644
index 00000000..aabc8698
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_latency.h
@@ -0,0 +1,439 @@
+#ifndef NNET_CONV1D_LATENCY_H_
+#define NNET_CONV1D_LATENCY_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                        typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
+    constexpr unsigned mult_n_out = CONFIG_T::n_filt;
+
+    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0
+
+    typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=mult complete
+
+    typename CONFIG_T::accum_t acc[mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    #pragma HLS ARRAY_PARTITION variable=weights complete
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    // Limit multipliers to control parallelization
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit
+
+PartitionLoop:
+    for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind
+
+        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);
+
+    PixelLoop:
+        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+            #pragma HLS UNROLL
+
+            data_T cache;
+
+            // Do the matrix-multiply
+        Product1:
+            for (int i_in = 0; i_in < mult_n_in; i_in++) {
+                #pragma HLS UNROLL
+                cache = data_buf[i_pxl][i_in];
+            Product2:
+                for (int i_out = 0; i_out < mult_n_out; i_out++) {
+                    #pragma HLS UNROLL
+                    mult[i_in * mult_n_out + i_out] =
+                        CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
+                            cache, weights[i_in * mult_n_out + i_out]);
+                }
+            }
+
+            // Initialize accumulator with input biases
+        ResetAccum:
+            for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
+                #pragma HLS UNROLL
+                acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
+            }
+
+            // Accumulate multiplication result
+        Accum1:
+            for (int i_in = 0; i_in < mult_n_in; i_in++) {
+                #pragma HLS UNROLL
+            Accum2:
+                for (int i_out = 0; i_out < mult_n_out; i_out++) {
+                    #pragma HLS UNROLL
+                    acc[i_out] += mult[i_in * mult_n_out + i_out];
+                }
+            }
+
+            // Cast to "res_t" type
+        Result:
+            for (int i_res = 0; i_res < mult_n_out; i_res++) {
+                #pragma HLS UNROLL
+                *(res++) = cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
+                                  typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                                  typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+
+    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
+    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
+
+    #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
+    #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=weights,biases
+
+    // Parallel mode
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    #pragma HLS ARRAY_PARTITION variable=weights complete dim=0
+    #pragma HLS ARRAY_PARTITION variable=biases complete dim=0
+
+    // Limit multipliers to control parallelization
+    int multiplier_limit =
+        ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) /
+             float(CONFIG_T::reuse_factor));
+#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit
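+    // [Editor's worked example with hypothetical shapes.] For out_width = 100, n_filt = 2,
+    // n_chan = 8 and reuse_factor = 2, each call handles out_width / rf = 50 outputs, i.e.
+    // 50 * 2 * 8 = 800 products, and the bound is ceil(800 / 2) = 400 multiplier instances:
+    // with PIPELINE II = reuse_factor, each multiplier is time-shared over 2 cycles.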
+
+// Convolve, saving all multiplication results to accumulate later
+ConvOut:
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    ConvFilt:
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+        ConvChan:
+            for (int cc = 0; cc < CONFIG_T::n_chan; cc++) {
+                #pragma HLS UNROLL
+                int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc;
+                int index_weight = cc * CONFIG_T::n_filt + ff;
+                int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc;
+
+                if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left ||
+                    (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
+                    mult[index_mult] = 0;
+                } else {
+                    mult[index_mult] = data[index_data] * weights[index_weight];
+                }
+            } // end channel loop
+        } // end filter loop
+    } // end output loop
+
+    // Initialize accumulator with input biases
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+            #pragma HLS UNROLL
+            acc[ii][ff] = biases[ff];
+        }
+    }
+
+// Accumulate multiplication result
+AccumOut:
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    AccumFilt:
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+            // Do "dot product" sum within filter and sum over channels
+        AccumChan:
+            for (int cc = 0; cc < CONFIG_T::n_chan; cc++) {
+                int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc;
+                acc[ii][ff] += mult[index_mult];
+            } // end channel loop
+        } // end filter loop
+    } // end output loop
+
+    // Cast to "res_t" type
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+            #pragma HLS UNROLL
+            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                                              res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                                              typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                                              typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+    data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
+    #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+    res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];
+    #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+RFInputLoop:
+    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
+        #pragma HLS UNROLL
+    InnerInputLoop:
+        for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {
+            #pragma HLS UNROLL
+            data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];
+        }
+    }
+
+    pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[0], res_tmp[0], weights, biases);
+    pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[1], res_tmp[1], weights, biases);
+    if (CONFIG_T::reuse_factor > 2)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[2], res_tmp[2], weights, biases);
+    if (CONFIG_T::reuse_factor > 3)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[3], res_tmp[3], weights, biases);
+    if (CONFIG_T::reuse_factor > 4)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[4], res_tmp[4], weights, biases);
+    if (CONFIG_T::reuse_factor > 5)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[5], res_tmp[5], weights, biases);
+    if (CONFIG_T::reuse_factor > 6)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[6], res_tmp[6], weights, biases);
+    if (CONFIG_T::reuse_factor > 7)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[7], res_tmp[7], weights, biases);
+    if (CONFIG_T::reuse_factor > 8)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[8], res_tmp[8], weights, biases);
+    if (CONFIG_T::reuse_factor > 9)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[9], res_tmp[9], weights, biases);
+    if (CONFIG_T::reuse_factor > 10)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[10], res_tmp[10], weights, biases);
+    if (CONFIG_T::reuse_factor > 11)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[11], res_tmp[11], weights, biases);
+    if (CONFIG_T::reuse_factor > 12)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[12], res_tmp[12], weights, biases);
+    if (CONFIG_T::reuse_factor > 13)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[13], res_tmp[13], weights, biases);
+    if (CONFIG_T::reuse_factor > 14)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[14], res_tmp[14], weights, biases);
+    if (CONFIG_T::reuse_factor > 15)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[15], res_tmp[15], weights, biases);
+    if (CONFIG_T::reuse_factor > 16)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[16], res_tmp[16], weights, biases);
+    if (CONFIG_T::reuse_factor > 17)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[17], res_tmp[17], weights, biases);
+    if (CONFIG_T::reuse_factor > 18)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[18], res_tmp[18], weights, biases);
+    if (CONFIG_T::reuse_factor > 19)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[19], res_tmp[19], weights, biases);
+    if (CONFIG_T::reuse_factor > 20)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[20], res_tmp[20], weights, biases);
+    if (CONFIG_T::reuse_factor > 21)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[21], res_tmp[21], weights, biases);
+    if (CONFIG_T::reuse_factor > 22)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[22], res_tmp[22], weights, biases);
+    if (CONFIG_T::reuse_factor > 23)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[23], res_tmp[23], weights, biases);
+    if (CONFIG_T::reuse_factor > 24)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[24], res_tmp[24], weights, biases);
+    if (CONFIG_T::reuse_factor > 25)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[25], res_tmp[25], weights, biases);
+    if (CONFIG_T::reuse_factor > 26)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[26], res_tmp[26], weights, biases);
+    if (CONFIG_T::reuse_factor > 27)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[27], res_tmp[27], weights, biases);
+    if (CONFIG_T::reuse_factor > 28)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[28], res_tmp[28], weights, biases);
+    if (CONFIG_T::reuse_factor > 29)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[29], res_tmp[29], weights, biases);
+    if (CONFIG_T::reuse_factor > 30)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[30], res_tmp[30], weights, biases);
+    if (CONFIG_T::reuse_factor > 31)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[31], res_tmp[31], weights, biases);
+    if (CONFIG_T::reuse_factor > 32)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[32], res_tmp[32], weights, biases);
+    if (CONFIG_T::reuse_factor > 33)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[33], res_tmp[33], weights, biases);
+    if (CONFIG_T::reuse_factor > 34)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[34], res_tmp[34], weights, biases);
+    if (CONFIG_T::reuse_factor > 35)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[35], res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[68], res_tmp[68], weights, biases);
+    if (CONFIG_T::reuse_factor > 69)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[69], res_tmp[69], weights, biases);
+    if (CONFIG_T::reuse_factor > 70)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[70], res_tmp[70], weights, biases);
+    if (CONFIG_T::reuse_factor > 71)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[71], res_tmp[71], weights, biases);
+    if (CONFIG_T::reuse_factor > 72)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[72], res_tmp[72], weights, biases);
+    if (CONFIG_T::reuse_factor > 73)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[73], res_tmp[73], weights, biases);
+    if (CONFIG_T::reuse_factor > 74)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[74], res_tmp[74], weights, biases);
+    if (CONFIG_T::reuse_factor > 75)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[75], res_tmp[75], weights, biases);
+    if (CONFIG_T::reuse_factor > 76)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[76], res_tmp[76], weights, biases);
+    if (CONFIG_T::reuse_factor > 77)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[77], res_tmp[77], weights, biases);
+    if (CONFIG_T::reuse_factor > 78)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[78], res_tmp[78], weights, biases);
+    if (CONFIG_T::reuse_factor > 79)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[79], res_tmp[79], weights, biases);
+    if (CONFIG_T::reuse_factor > 80)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[80], res_tmp[80], weights, biases);
+    if (CONFIG_T::reuse_factor > 81)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[81], res_tmp[81], weights, biases);
+    if (CONFIG_T::reuse_factor > 82)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[82], res_tmp[82], weights, biases);
+    if (CONFIG_T::reuse_factor > 83)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[83], res_tmp[83], weights, biases);
+    if (CONFIG_T::reuse_factor > 84)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[84], res_tmp[84], weights, biases);
+    if (CONFIG_T::reuse_factor > 85)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[85], res_tmp[85], weights, biases);
+    if (CONFIG_T::reuse_factor > 86)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[86], res_tmp[86], weights, biases);
+    if (CONFIG_T::reuse_factor > 87)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[87], res_tmp[87], weights, biases);
+    if (CONFIG_T::reuse_factor > 88)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[88], res_tmp[88], weights, biases);
+    if (CONFIG_T::reuse_factor > 89)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[89], res_tmp[89], weights, biases);
+    if (CONFIG_T::reuse_factor > 90)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[90], res_tmp[90], weights, biases);
+    if (CONFIG_T::reuse_factor > 91)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[91], res_tmp[91], weights, biases);
+    if (CONFIG_T::reuse_factor > 92)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[92], res_tmp[92], weights, biases);
+    if (CONFIG_T::reuse_factor > 93)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[93], res_tmp[93], weights, biases);
+    if (CONFIG_T::reuse_factor > 94)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[94], res_tmp[94], weights, biases);
+    if (CONFIG_T::reuse_factor > 95)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[95], res_tmp[95], weights, biases);
+    if (CONFIG_T::reuse_factor > 96)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[96], res_tmp[96], weights, biases);
+    if (CONFIG_T::reuse_factor > 97)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[97], res_tmp[97], weights, biases);
+    if (CONFIG_T::reuse_factor > 98)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[98], res_tmp[98], weights, biases);
+    if (CONFIG_T::reuse_factor > 99)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[99], res_tmp[99], weights, biases);
+    if (CONFIG_T::reuse_factor > 100)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[100], res_tmp[100], weights, biases);
+    if (CONFIG_T::reuse_factor > 101)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[101], res_tmp[101], weights, biases);
+    if (CONFIG_T::reuse_factor > 102)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[102], res_tmp[102], weights, biases);
biases); + if (CONFIG_T::reuse_factor > 103) + pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) + pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) + pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) + pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) + pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) + pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) + pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) + pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) + pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) + pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) + pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) + pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); + if (CONFIG_T::reuse_factor > 115) + pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) + pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) + pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) + pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) + pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + +RFOutputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerOutputLoop: + for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_resource.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 00000000..6e70158a --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,103 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && 
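+ // The guard admits reuse factors for which block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, RF) splits evenly across the n_filt outputs, or RF >= mult_n_in. Illustrative sizes (not this model's): mult_n_in = 10, mult_n_out = 16 -> RF = 2 gives block_factor = 80, 80 % 16 == 0 (allowed); RF = 3 gives block_factor = 54, 54 % 16 != 0 and 3 < 10 (rejected):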
+ "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && + "This function is correct only for RF <= FILT_WIDTH * N_CHAN"); + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + +PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_w = i_rf; + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product( + data_buf[i_pxl][i_in], weights[i_w])); + } + + // Increment i_w + i_w += CONFIG_T::reuse_factor; + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 00000000..b23c330c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,89 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS UNROLL + unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = 
CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 00000000..71a88f44 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,75 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 
0; // not used yet +}; + +template +void conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + #pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_latency.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_latency.h new file mode 100644 index 00000000..5114af78 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_latency.h @@ -0,0 +1,89 @@ +#ifndef NNET_CONV2D_LATENCY_H_ +#define NNET_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + Product2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template 
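+ // the product functor (nnet_mult.h) selects the multiply implementation for the data/weight type pair, e.g. a true multiplier for fixed-point weights or shift/negate variants for binary, ternary, and power-of-two weights: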
product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_resource.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 00000000..eb7e18e4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,105 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan) && + "This function is correct only for RF <= FILT_HEIGHT * FILT_WIDTH * N_CHAN"); + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + +PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_w = i_rf; + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + 
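+ // Index walk per reuse iteration: i_w and i_in both advance by reuse_factor; i_in wraps back to i_rf once it passes mult_n_in, and i_out steps to the next output every multscale blocks, so each accumulator collects its complete dot product across the reuse loop.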
#pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product( + data_buf[i_pxl][i_in], weights[i_w])); + } + + // Increment i_w + i_w += CONFIG_T::reuse_factor; + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 00000000..8a4fb6be --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,112 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ap_shift_reg.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_2d(const unsigned h_idx, const unsigned w_idx, + ap_uint *pixel_idx) { + const unsigned sh_idx = CONFIG_T::template scale_index_height::scale_index(h_idx); + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS UNROLL + + unsigned sw_idx = CONFIG_T::template scale_index_width::scale_index(wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sh_idx * CONFIG_T::min_width + sw_idx]; + } +} + +template +void conv_2d_encoded_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + hls::stream data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } + } +} + +// Line Buffer +template +void conv_2d_buffer_cl( + hls::stream &data, hls::stream &res, + typename 
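+ // Line-buffer strategy: pixels stream in once and the previous filt_height - 1 rows are held in ap_shift_reg line buffers, so a full filt_height x filt_width window is available at every output position.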
CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv_stream.h new file mode 100644 index 00000000..b763938c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv_stream.h @@ -0,0 +1,394 @@ +#ifndef NNET_CONV_STREAM_H_ +#define NNET_CONV_STREAM_H_ + +#include "ap_shift_reg.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +// ************************************************* +// Encoded Implementation (Vlad's) +// ************************************************* +template unsigned scale_index_K_gte_S(const unsigned idx) { + #pragma HLS INLINE + + if (idx < K - S) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + constexpr unsigned sW = (DIV_ROUNDUP(K, S) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= K - S) { + return sW - r; + } + + return K - S + (idx - (K - S)) % S; +} + +template unsigned scale_index_K_lt_S(const unsigned idx) { + #pragma HLS INLINE + + if (idx < S - K) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + constexpr unsigned sW = (DIV_ROUNDUP(S, K) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= S - K) { + return sW - r; + } + + return S - K + (idx - (S - K)) % S; +} + +template class scale_index_regular { + public: + static unsigned scale_index(const unsigned idx) { + #pragma HLS INLINE + + if (K >= S) { + return scale_index_K_gte_S(idx); + } else { + return scale_index_K_lt_S(idx); + } + } +}; + +template class scale_index_unscaled { + public: + static unsigned scale_index(const unsigned idx) { + #pragma HLS INLINE + return 
idx; + } +}; + +template +void mult_buffer(hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, hls::stream &res_stream, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = data complete + typename res_T::value_type res[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = res complete + +InitData: + for (int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) { + #pragma HLS UNROLL + data[id] = data_window[id].read(); + } + + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + data, res, weights, biases); + } else { + dense_resource( + data, res, weights, biases); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { + #pragma HLS UNROLL + if (res_T::size / CONFIG_T::n_filt == 1) { + res_pack[jj] = res[jj]; + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + jj] = res[jj]; + } + } + + if (res_T::size / CONFIG_T::n_filt == 1) { + res_stream.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res_stream.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } +} + +template +void compute_output_encoded(const data_T &in_elem, + hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + hls::stream &res, res_T &res_pack, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt], ap_uint *pixel_idx) { + #pragma HLS INLINE + +MultLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS PIPELINE II = CONFIG_T::reuse_factor + CopyDataFilt: + for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) { + #pragma HLS UNROLL + CopyDataChan: + for (unsigned c = 0; c < CONFIG_T::n_chan; c++) { + #pragma HLS UNROLL + if (pixel_idx[p][f]) + data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]); + } + } + if (pixel_idx[p][CONFIG_T::kernel_size - 1]) { + mult_buffer(data_window, res_pack, res, outputs_ready, weights, biases); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void kernel_shift_1d(const data_T &in_elem, + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { + #pragma HLS inline + + // Shift kernel_window by one step to the left (manual shift operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + #pragma HLS PIPELINE II = 1 + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + #pragma HLS UNROLL + // Shift every element in kernel_window to the left + kernel_window[i_iw * CONFIG_T::n_chan + i_ic] = kernel_window[(i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushChannel: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + #pragma HLS UNROLL + kernel_window[lastheight + i_ic] = in_elem[i_ic]; + } +} + +template +void kernel_shift_2d( + typename 
data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { + #pragma HLS inline + + // Shift kernel_window by one step to the left (manual shift operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + #pragma HLS PIPELINE II = 1 + KernelShiftHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift every element in kernel_window to the left + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_iw * CONFIG_T::n_chan + i_ic] = + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + (i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushHeight: + for (int i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + #pragma HLS UNROLL + KernelPushChannel: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + kernel_window[lastheight + i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_ic] = shift_buffer[i_ih][i_ic]; + } + } +} + +template +void shift_line_buffer( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]) { + + #pragma HLS PIPELINE + + // Temporary buffer for popped (shifted) elements + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = shift_buffer complete dim = 0 + +UpdateBuffer: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + #pragma HLS UNROLL + + // Insert pixel(s) at end of shift buffer + shift_buffer[CONFIG_T::filt_height - 1][i_ic] = in_elem[i_ic]; + } + +LineBufferDataIn: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift the shift buffer into the line buffer + LineBufferShift: + for (unsigned i_ih = 1; i_ih < CONFIG_T::filt_height; i_ih++) { + #pragma HLS UNROLL + typename data_T::value_type pop_elem = line_buffer[i_ih - 1][i_ic].shift( + shift_buffer[CONFIG_T::filt_height - i_ih][i_ic]); // Shift the line buffer, return the popped pixel + shift_buffer[CONFIG_T::filt_height - i_ih - 1][i_ic] = + pop_elem; // Popped element placed back into shift_buffer, one row up. 
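+ // Net effect: shift_buffer holds the current column's vertical slice (oldest row at index 0, the newly read pixel at the bottom), which kernel_shift_2d then pushes into the right-most kernel column.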
+ } + } + kernel_shift_2d(shift_buffer, kernel_window); +} + +template +void compute_output_buffer_2d( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE OFF + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // Counters + static int pX = 0; // Pixel X + static int pY = 0; // Pixel Y + + static int sX = 0; // Stride X + static int sY = 0; // Stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + + // Dense multiply + // #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
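+ // when the stride counter reaches filt_width - 1 a full window has been seen, so it rewinds by stride_width - 1; otherwise it increments (with stride 1 it stays at the threshold, emitting a window every pixel):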
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +// Conv 1D compute output +template +void compute_output_buffer_1d( + const data_T &in_elem, hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + + // Counters + static int pX = 0; // pixel counter + static int sX = 0; // stride counter + + static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + + // Dense multiply + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 00000000..c5155d84 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,49 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
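+ // A concrete layer would specialize this struct; a minimal sketch with hypothetical names and sizes (fc1_config, input_t, result_t, in, out, w1, b1 are illustrative, not from this project): + //     struct fc1_config : nnet::dense_config { + //         static const unsigned n_in = 16; + //         static const unsigned n_out = 32; + //         static const unsigned reuse_factor = 4; + //         typedef ap_fixed<32, 16> accum_t; + //         typedef ap_fixed<32, 16> weight_t; + //         typedef ap_fixed<32, 16> bias_t; + //     }; + //     nnet::dense<input_t, result_t, fc1_config>(in, out, w1, b1);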
+ // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS inline + if (CONFIG_T::strategy == nnet::latency) { + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_compressed.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 00000000..029b7480 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,90 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + #pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else + #pragma HLS data_pack variable=weights struct_level +#endif + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_latency.h 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_latency.h new file mode 100644 index 00000000..02802c45 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_latency.h @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_PARTITION variable=mult complete + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +// Accumulate multiplication result +Accum1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_resource.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_resource.h new file mode 100644 index 00000000..88de9472 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,263 @@ +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = 
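+ // number of weights consumed per reuse-loop iteration; for RF <= n_in this equals multiplier_limit, i.e. the multiplier count: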
DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + 
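+ // With RF % N_IN == 0, weight w = ir + rufactor * im feeds output outidx[ir] + im * outscale, where outidx[ir] = ir / nin was precomputed above to avoid the costly in-loop outstep++.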
#pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = 
cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_stream.h new file mode 100644 index 00000000..ad3a972e --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,68 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_types.h" +#include +#include + +namespace nnet { + +template +void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +template +void dense(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + typename data_T::value_type data[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=res complete + +DataPrepare: + for (int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_in / data_T::size > 1) { + #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + + dense_wrapper(data, res, weights, biases); + +ResWrite: + for (unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + if (CONFIG_T::n_out / res_T::size > 1) { + #pragma HLS PIPELINE + } + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed.h new file mode 100644 index 00000000..dfc77afa --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed.h @@ -0,0 +1,45 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // Layer Sizes + static const unsigned 
n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // This can save a few cycles, but it will create a large multiplexer due to + // non-constant access pattern, so let's leave it out + //#pragma HLS ARRAY_PARTITION variable=embeddings complete + +InputSequence: + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + res[j * CONFIG_T::n_out + i] = embeddings[data[j] * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed_stream.h new file mode 100644 index 00000000..79ae9bc1 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,33 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +template +void embedding(hls::stream &data, hls::stream &res, + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + data_T in_data = data.read(); + +InputSequence: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + res.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_garnet.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_garnet.h new file mode 100644 index 00000000..1fcd5545 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_garnet.h @@ -0,0 +1,816 @@ +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "hls_math.h" +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ap_ufixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. 
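+ // the float specialization spans distances [-32, 32) in steps of 64 / table_size and stores exp(-d^2) per bin; get_edge_weight below clamps and quantizes a distance to the matching bin: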
/ table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { + edge_weights_table[iw] = std::exp(-v * v); + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) / step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + #pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // #pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + #pragma HLS INLINE + #pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + #pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + #pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + #pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + 
weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + #pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + #pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + #pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + #pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const nvtx) { + #pragma HLS ARRAY_PARTITION variable=output_biases complete + #pragma HLS UNROLL region + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + #pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + #pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter { + typedef res_T res_t; + + res_T *resref; + + 
OutputResSetter(res_T *r) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + #pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + #pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + #pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::distance_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + #pragma HLS INLINE + #pragma HLS UNROLL region + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + #pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename res_setter_T::res_t acc = 
CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T data[prev_layer_t::n_out_features]; + #pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); + + SingleVertexDataGetter 
data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * 
CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; 
++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_helpers.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_helpers.h new file mode 100644 index 00000000..b8c2a48d --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_helpers.h @@ -0,0 +1,382 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "hls_stream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + std::istringstream(token) >> w[i]; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + 
std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::stream &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::stream &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround 
with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::stream &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + 
result.write(res_pack); + } + out << std::endl; +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +template int read_file_1D(const char *filename, dataType data[nrows]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii] = newval; + } else { + return -2; + } + } + fclose(fp); + return 0; +} + +template +int read_file_2D(const char *filename, dataType data[nrows][ncols]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + for (int jj = 0; jj < ncols; jj++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii][jj] = newval; + } else { + return -2; + } + } + } + fclose(fp); + return 0; +} + +template void change_type(hls::stream &in, hls::stream &out) { + in_T datareg; + hls::stream input_trunc; + for (int ii = 0; ii < N_IN; ii++) { + out << (out_T)in.read(); + } +} + +template void hls_stream_debug(hls::stream &data, hls::stream &res) { + data_T datareg; + for (int ii = 0; ii < N_IN; ii++) { + datareg = data.read(); + std::cout << "[" << ii << "]: " << datareg << std::endl; + res << datareg; + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image.h new file mode 100644 index 00000000..eeb45481 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image.h @@ -0,0 +1,41 @@ +#ifndef NNET_IMAGE_H_ +#define NNET_IMAGE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct resize_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned n_chan = 10; + static const unsigned new_height = 10; + static const unsigned new_width = 10; +}; + +template +void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { + int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; + int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; + int x2, y2; + + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::new_height; i++) { + for (int j = 0; j < CONFIG_T::new_width; j++) { + x2 = ((j * x_ratio) >> 16); + y2 = ((i * y_ratio) >> 16); + for (int k = 0; k < CONFIG_T::n_chan; k++) { + resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] = + image[(y2 * CONFIG_T::width * CONFIG_T::n_chan) + x2 * CONFIG_T::n_chan + k]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image_stream.h 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image_stream.h new file mode 100644 index 00000000..a23a93db --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image_stream.h @@ -0,0 +1,66 @@ +#ifndef NNET_IMAGE_STREAM_H_ +#define NNET_IMAGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { + +template void resize_nearest(hls::stream &image, hls::stream &resized) { + assert(CONFIG_T::new_height % CONFIG_T::height == 0); + assert(CONFIG_T::new_width % CONFIG_T::width == 0); + constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height; + constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width; + +ImageHeight: + for (unsigned h = 0; h < CONFIG_T::height; h++) { + #pragma HLS PIPELINE + + data_T data_in_row[CONFIG_T::width]; + + ImageWidth: + for (unsigned i = 0; i < CONFIG_T::width; i++) { + #pragma HLS UNROLL + + data_T in_data = image.read(); + + ImageChan: + for (unsigned j = 0; j < CONFIG_T::n_chan; j++) { + #pragma HLS UNROLL + + data_in_row[i][j] = in_data[j]; + } + } + + ResizeHeight: + for (unsigned i = 0; i < ratio_height; i++) { + #pragma HLS UNROLL + + ImageWidth2: + for (unsigned l = 0; l < CONFIG_T::width; l++) { + #pragma HLS UNROLL + + ResizeWidth: + for (unsigned j = 0; j < ratio_width; j++) { + #pragma HLS UNROLL + + data_T out_data; + PRAGMA_DATA_PACK(out_data) + + ResizeChan: + for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { + #pragma HLS UNROLL + + out_data[k] = data_in_row[l][k]; + } + + resized.write(out_data); + } + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_math.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_math.h new file mode 100644 index 00000000..c021d8eb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_math.h @@ -0,0 +1,178 @@ +#ifndef NNET_MATH_H_ +#define NNET_MATH_H_ + +#include "hls_math.h" + +namespace nnet { + +// This header defines the functions that return type different from the input +// For example, hls::sin(x) returns ap_fixed +// By ensuring we return the same type we can avoid casting issues in expressions + +template T sin(T x) { return (T)hls::sin(x); }; + +template T cos(T x) { return (T)hls::cos(x); }; + +template T asin(T x) { return (T)hls::asin(x); }; + +template T acos(T x) { return (T)hls::acos(x); }; + +template T atan(T x) { return (T)hls::atan(x); }; + +template T atan2(T x, T y) { return (T)hls::atan2(x, y); }; + +template void init_sincos_table(T table[1 << (W - I - 3)][2]) { + unsigned int NTE = 1 << (W - I - 3); // No of table entries + double step = M_PI / (4 * NTE); // Interval between angles + double y = 0; + // double scaled_angle = 0; + + for (unsigned int i = 0; i < NTE; i++) { + table[i][0] = std::cos(y); + table[i][1] = std::sin(y); + y += step; + // scaled_angle = y/(2*M_PI); + // printf("cos(%f) = %23.22f, sin(%f) = %23.22f index = %d, scaled angle = %13.12f \n", y, cos(y), y, sin(y), i, + // scaled_angle); + } +} + +template void sincos_lut(const T &input, T output[2]) { + + #pragma HLS INLINE + + // This implementation is based on ac_sincos_lut.h from AC math library + + static bool flag = true; + if (flag && T::width - T::iwidth > 12) { +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", 
LINE : " << __LINE__ << std::endl; + std::cout << "Warning: The output of sincos_lut will not be accurate" << std::endl; +#endif + flag = false; + } + // Datatype for lookup table entries + typedef ap_ufixed luttype; + // Datatype for posinput which is used to handle negative inputs + typedef ap_ufixed posinputtype; + + typedef ap_uint<9> lutindextype; // 9 bits required for indexing into 512 entry table + typedef ap_uint<3> octanttype; // 3 bits required for octant value range of 0 thru 7 + T outputtemp[2]; + lutindextype luTdex = 0; + posinputtype posinput = input; + + // Initialize the lookup table +#ifdef __SYNTHESIS__ + bool initialized = false; + luttype sincos[512][2]; +#else + static bool initialized = false; + static luttype sincos[512][2]; +#endif + if (!initialized) { + init_sincos_table(sincos); + initialized = true; + } + + // Leaving this commented out makes the table to to BRAM + //#pragma HLS ARRAY_PARTITION variable=sincos complete dim=0 + + typedef ap_uint lutindextype1; + // Extracting (MSB-3:LSB) bits of scaled input to determine the lookup table index + lutindextype1 luTdex1 = posinput.range(AP_MAX(T::width - T::iwidth - 3, 1), 0); // Extracting the lookup table index + + if (T::width - T::iwidth >= 4 && T::width - T::iwidth <= 12) { + luTdex(8, 12 - (T::width - T::iwidth)) = luTdex1; // stride + } + // Approximation for the scaled inputs whose number of bits are greater than 12 + else if (T::width - T::iwidth > 12) { + // Lookup table index for the scaled inputs whose number of bits are greater than 12 + luTdex = luTdex1 / (1 << (AP_MAX(T::width - T::iwidth - 12, 0))); + if ((luTdex1 % (1 << (AP_MAX(T::width - T::iwidth - 12, 0)))) > (1 << (AP_MAX(T::width - T::iwidth - 13, 0)))) { + luTdex = luTdex + 1; + } + typedef ap_ufixed + datatype; + datatype x = (datatype)luTdex1; + x = x >> AP_MAX(T::width - T::iwidth - 12, 0); + if (x > 511.5) { + luTdex = 511; + } + if (luTdex1 <= 1 << (AP_MAX(T::width - T::iwidth - 13, 0)) && luTdex1 != 0) { + luTdex = 1; + } + } + + if (T::width - T::iwidth >= 3) { + // Getting the octant 0-7 by extracting the first 3 bits from MSB side of scaled input where + // octant 0 corresponds to [0-PI/4), + // octant 1 corresponds to [PI/4-2PI/4), + // octant 2 corresponds to [2PI/4-3PI/4) and so on + // octanttype octant = posinput.template slc<3>(T::width-T::iwidth-3); + octanttype octant = posinput(T::width - T::iwidth - 1, T::width - T::iwidth - 3); + luTdex = (octant[0] == 1) ? (lutindextype)(512 - luTdex) : (lutindextype)(luTdex); + // imaginary part is sine + outputtemp[1] = ((octant == 0) | (octant == 3)) ? (T)sincos[luTdex][1] + : ((octant == 2) | (octant == 1)) ? (T)sincos[luTdex][0] + : ((octant == 7) | (octant == 4)) ? (T)-sincos[luTdex][1] + : (T)-sincos[luTdex][0]; + // real part is cosine + outputtemp[0] = ((octant == 6) | (octant == 1)) ? (T)sincos[luTdex][1] + : ((octant == 3) | (octant == 4)) ? (T)-sincos[luTdex][0] + : ((octant == 2) | (octant == 5)) ? (T)-sincos[luTdex][1] + : (T)sincos[luTdex][0]; + // Below two are the cases when the output corresponds to + or - (0 or 1) for which there is no entry in the lookup + // table + output[1] = ((posinput == 0.125) | (posinput == 0.375)) ? T(0.7071067811865475244008) + : ((posinput == 0.625) | (posinput == 0.875)) ? T(-0.7071067811865475244008) + : outputtemp[1]; + output[0] = ((posinput == 0.125) | (posinput == 0.875)) ? T(0.7071067811865475244008) + : ((posinput == 0.375) | (posinput == 0.625)) ? 
T(-0.7071067811865475244008) + : outputtemp[0]; + } + + if (T::width - T::iwidth <= 2) { + output[1] = (posinput == 0) ? (T)0 + : (posinput == 0.25) ? (T)1 + : (posinput == 0.5) ? (T)0 + : (posinput == 0.75) ? (T)-1 + : outputtemp[1]; + output[0] = (posinput == 0) ? (T)1 + : (posinput == 0.25) ? (T)0 + : (posinput == 0.5) ? (T)-1 + : (posinput == 0.75) ? (T)0 + : outputtemp[0]; + } + +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl; + std::cout << "============AP_FIXED SINCOS======================" << std::endl; + std::cout << "positive input is = " << posinput << std::endl; + std::cout << "lut index is = " << luTdex << std::endl; + std::cout << "sin value is = " << output[1] << std::endl; + std::cout << "cos value is = " << output[0] << std::endl; + std::cout << "=================================================" << std::endl; +#endif +} + +template T sin_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ap_ufixed<16, 0>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[1]; +} + +template T cos_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ap_ufixed<16, 0>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[0]; +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge.h new file mode 100644 index 00000000..083c3185 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge.h @@ -0,0 +1,257 @@ +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + typedef float accum_t; + // Product function to use + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] + data2[ii]; + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] - data2[ii]; + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem*2], res_T res[CONFIG_T::n_elem*2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii*2] = data1[ii] * data2[ii*2]; + res[ii*2+1] = data1[ii] * data2[ii*2+1]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + 
res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + #pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + #pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * 
CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git 
a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge_stream.h new file mode 100644 index 00000000..a57ec78e --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,370 @@ +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AddPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template +void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] > in_data2[j]) ? 
in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 
0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + #pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < input1_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input1_T::size)] = in_data1[j]; + } + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + #pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < input2_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; + } + } + res.write(out_data); +} +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_mult.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 00000000..00d1c6d1 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,116 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. 
+ * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + #pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + #pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + #pragma HLS INLINE + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + #pragma HLS INLINE + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 00000000..e48a2fb4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,145 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int j = 0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < 
CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 00000000..9df5d540 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,85 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +#include + +namespace nnet { + +template void fill_zero(hls::stream &res) { + #pragma HLS INLINE + res_T res_part; + for (int c = 0; c < CONFIG_T::n_chan; c++) { + #pragma HLS UNROLL + res_part[c] = 0; + } + res.write(res_part); +} + +template void fill_data(hls::stream &data, hls::stream &res) { + #pragma HLS INLINE + data_T data_part = data.read(); + res_T res_part; + for (int c = 0; c < CONFIG_T::n_chan; c++) { + #pragma HLS UNROLL + res_part[c] = data_part[c]; + } + 
+
+template <class data_T, class res_T, typename CONFIG_T>
+void zeropad1d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res) {
+PadLeft:
+    for (int i = 0; i < CONFIG_T::pad_left; i++) {
+        fill_zero<res_T, CONFIG_T>(res);
+    }
+
+CopyMain:
+    for (int i = 0; i < CONFIG_T::in_width; i++) {
+        fill_data<data_T, res_T, CONFIG_T>(data, res);
+    }
+
+PadRight:
+    for (int i = 0; i < CONFIG_T::pad_right; i++) {
+        fill_zero<res_T, CONFIG_T>(res);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void zeropad2d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res) {
+
+PadTop:
+    for (int i = 0; i < CONFIG_T::pad_top; i++) {
+    PadTopWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    }
+
+PadMain:
+    for (int i = 0; i < CONFIG_T::in_height; i++) {
+    PadLeft:
+        for (int j = 0; j < CONFIG_T::pad_left; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    CopyMain:
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            fill_data<data_T, res_T, CONFIG_T>(data, res);
+        }
+    PadRight:
+        for (int j = 0; j < CONFIG_T::pad_right; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    }
+
+PadBottom:
+    for (int i = 0; i < CONFIG_T::pad_bottom; i++) {
+    PadBottomWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling.h
new file mode 100644
index 00000000..12ac8fe3
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling.h
@@ -0,0 +1,373 @@
+#ifndef NNET_POOLING_H_
+#define NNET_POOLING_H_
+
+#include "nnet_helpers.h"
+#include <iostream>
+
+namespace nnet {
+
+// Return the maximum value from an array
+template <class T, int N> T max(T x[N]) {
+    T y = x[0];
+    for (int i = 1; i < N; i++) {
+        y = x[i] > y ? x[i] : y;
+    }
+    return y;
+}
+
+template <int W, int N> ap_int<W> avg(ap_int<W> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_int<W + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_int<W> y = tmp;
+    return y;
+}
+
+template <int W, int N> ap_uint<W> avg(ap_uint<W> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_uint<W + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_uint<W> y = tmp;
+    return y;
+}
+
+template <int W, int I, int N> ap_fixed<W, I> avg(ap_fixed<W, I> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_fixed<W + ceillog2(N), I + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_fixed<W, I> y = tmp;
+    return y;
+}
+
+template <int W, int I, int N> ap_ufixed<W, I> avg(ap_ufixed<W, I> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_ufixed<W + ceillog2(N), I + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_ufixed<W, I> y = tmp;
+    return y;
+}
+
+// Return the mean value of an array
+template <class T, int N> T avg(T (&x)[N]) {
+    T y = 0;
+    for (int i = 0; i < N; i++) {
+        y += x[i];
+    }
+    y /= N;
+    return y;
+}
+
+// Enumeration for pooling operation (max, avg, l2norm pooling)
+enum Pool_Op { Max, Average }; // L2Norm };
+template <class T, int N, Pool_Op op> T pool_op(T (&x)[N]) {
+    switch (op) {
+    case Max:
+        return max<T, N>(x);
+    case Average:
+        return avg(x);
+        // case L2Norm: return l2norm<T, N>(x);
+    }
+}
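[Editor's aside — not part of the patch] A minimal usage sketch of the primitives above. avg() deliberately accumulates in a type widened by ceillog2(N) bits, so the running sum of N inputs cannot overflow before the divide; the window values here are invented for illustration.

    ap_fixed<16, 6> window[4] = {1.0, 2.0, 3.0, 6.0};
    ap_fixed<16, 6> largest = nnet::pool_op<ap_fixed<16, 6>, 4, nnet::Max>(window);  // 6.0
    ap_fixed<16, 6> mean = nnet::pool_op<ap_fixed<16, 6>, 4, nnet::Average>(window); // (1+2+3+6)/4 = 3.0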
+template <class T, Pool_Op op> T pad_val() {
+    /*---
+     *- In Tensorflow, pooling ignores the value in the padded cells
+     *- For Avg pooling, return 0 (the divisor is modified to the
+     *- area overlapping the unpadded image).
+     *- For max pooling, return the most negative value for the type.
+     *- TODO this is not really generic, it assumes fixed point or integer T
+    ---*/
+    switch (op) {
+    case Max: {
+        T x = 0;
+        x[x.width - 1] = 1;
+        return x;
+        break;
+    }
+    case Average:
+        return 0;
+    }
+}
+
+struct pooling1d_config {
+    // IO size
+    static const unsigned n_in = 10;
+    static const unsigned pool_width = 2;
+    static const unsigned stride_width = 2;
+    static const unsigned n_out = (n_in - pool_width) / stride_width + 1;
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const bool count_pad = false;
+    // Pooling function
+    static const Pool_Op pool_op = Max;
+};
+
+template <typename CONFIG_T> constexpr int pool_op_limit_1d() {
+    return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor;
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) {
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    // TODO partition the arrays according to the reuse factor
+    const int limit = pool_op_limit_1d<CONFIG_T>();
+    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+    // Add any necessary padding
+    unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right;
+    if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) {
+        padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width);
+    }
+
+    for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+        // Loop over input image x in steps of stride
+        for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) {
+            data_T pool[CONFIG_T::pool_width];
+            #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+            // Keep track of number of pixels in image vs padding region
+            unsigned img_overlap = 0;
+            // Loop over pool window x
+            for (int jj = 0; jj < CONFIG_T::stride_width; jj++) {
+                if (ii + jj < CONFIG_T::pad_left || ii + jj >= (padded_width - CONFIG_T::pad_right)) {
+                    // Add padding
+                    pool[jj] = pad_val<data_T, CONFIG_T::pool_op>();
+                    if (CONFIG_T::count_pad)
+                        img_overlap++;
+                } else {
+                    pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff];
+                    img_overlap++;
+                }
+            }
+            // do the pooling
+            // TODO in the case of average pooling, need to reduce width to area of pool window
+            // not overlapping padding region
+            res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] =
+                pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
+            // If the pool op is Average, the zero-padding needs to be removed from the results
+            if (CONFIG_T::pool_op == Average) {
+                data_T rescale = static_cast<data_T>(CONFIG_T::pool_width) / img_overlap;
+                res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale;
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) {
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+
+    // TODO partition the arrays according to the reuse factor
+    const int limit = pool_op_limit_1d<CONFIG_T>();
+    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+
+    for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+        data_T pool[CONFIG_T::n_in];
+        #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
+            pool[jj] = data[jj * CONFIG_T::n_filt + ff];
+        }
+        // do the pooling
+        res[ff] = pool_op<data_T, CONFIG_T::n_in, CONFIG_T::pool_op>(pool);
+    }
+}
+
+struct pooling2d_config {
+    // IO size
+    static const unsigned in_height = 10;
+    static const unsigned in_width = 10;
+    static const unsigned n_filt = 4;
+    static const unsigned stride_height
= 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * 
CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION instances=pool_op limit=limit function + +FiltLoop: + for (int filt = 0; filt < CONFIG_T::n_filt; 
filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 00000000..13d5979a --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,609 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "ap_shift_reg.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "nnet_pooling.h" +#include "utils/x_hls_utils.h" + +namespace nnet { + +// ************************************************* +// Max/average pooling +// ************************************************* + +template T reduce_pool(T x[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template void init_pool_table(unsigned table[TABLE_SIZE]) { + for (unsigned ii = 0; ii < TABLE_SIZE; ii++) { + table[ii] = ii % POOL_SIZE; + } +} + +template +void compute_pool_encoded_2d( + const unsigned h_idx, const unsigned w_idx, const data_T &in_elem, + hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt], + hls::stream &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest H without unused pixels on the right + constexpr unsigned nH = + ((CONFIG_T::in_height - CONFIG_T::pool_height) / CONFIG_T::stride_height) * CONFIG_T::stride_height + + CONFIG_T::pool_height; + // Scaled H that behaves like original H + constexpr unsigned sH = + (DIV_ROUNDUP(CONFIG_T::pool_height, CONFIG_T::stride_height) - 1) * CONFIG_T::stride_height + CONFIG_T::pool_height; + // Nearest W without unused pixels on the right + constexpr unsigned nW = ((CONFIG_T::in_width - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_height[CONFIG_T::in_height]; + unsigned pool_table_width[CONFIG_T::in_width]; +#else + static bool initialized = false; + static unsigned pool_table_height[CONFIG_T::in_height]; + static unsigned pool_table_width[CONFIG_T::in_width]; +#endif + if (!initialized) { + init_pool_table(pool_table_height); + init_pool_table(pool_table_width); + initialized = true; + } + + #pragma HLS INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + #pragma HLS ARRAY_PARTITION variable=pool_table_height complete + #pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned sh_idx = pool_table_height[h_idx] * CONFIG_T::pool_width; + const unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_filt); + +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS 
PIPELINE + + ap_uint filt_mask = 0; + if ((h_idx < nH) && (wp_idx + p < nW)) { + filt_mask = sh_idx + pool_table_width[wp_idx + p] + 1; + } + + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write( + in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_height * CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_height * CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = + reduce_pool( + pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool( + pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling2d_encoded_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + constexpr int win_depth = CONFIG_T::pool_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + constexpr int pack_factor = data_T::size / CONFIG_T::n_filt; + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (pack_factor); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + #pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_2d(i_ih, i_iw, data.read(), data_window, res, res_pack, + outputs_ready); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void compute_pool_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt], + hls::stream &res) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + nnet::shift_line_buffer(in_elem, 
line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = + reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_buffer_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1, 1)] + [CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +template +void pooling2d_cl(hls::stream &data, hls::stream &res) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling2d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling2d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* + +template +void compute_pool_encoded_1d(const unsigned w_idx, const data_T &in_elem, + hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt], + hls::stream &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest W without unused pixels on the right + constexpr unsigned nW = + ((CONFIG_T::n_in - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_width[CONFIG_T::n_in]; +#else + static bool initialized = false; + static unsigned pool_table_width[CONFIG_T::n_in]; +#endif + if (!initialized) { + init_pool_table(pool_table_width); + initialized = true; + } + + #pragma HLS INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + #pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned wp_idx = w_idx * (data_T::size / 
CONFIG_T::n_filt); + +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS PIPELINE + + ap_uint filt_mask = 0; + if (wp_idx + p < nW) { + filt_mask = pool_table_width[wp_idx + p] + 1; + } + + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write(in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = reduce_pool(pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool(pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling1d_encoded_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; + constexpr int win_depth = CONFIG_T::n_out; + for (unsigned i_out = 0; i_out < CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + constexpr int pack_factor = data_T::size / CONFIG_T::n_filt; + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (pack_factor); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + #pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_1d(i_iw, data.read(), data_window, res, res_pack, outputs_ready); + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) 1D +// ************************************************* +template +void compute_pool_buffer_1d(const data_T &in_elem, hls::stream &res) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_buffer_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + +template +void pooling1d_cl(hls::stream &data, hls::stream &res) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling1d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling1d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template T reduce_global_pool(T x, T y[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? 
x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]) { +PoolFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + #pragma HLS UNROLL + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS UNROLL + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool( + data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } +} + +template +void global_pooling1d_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + +ReadInput: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } 
else {
+    AvgPoolRes:
+        for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) {
+            #pragma HLS PIPELINE
+
+            res_T res_pack;
+            PRAGMA_DATA_PACK(res_pack)
+        AvgPoolPack:
+            for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) {
+                #pragma HLS UNROLL
+                res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in;
+            }
+            res.write(res_pack);
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recr_activations.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recr_activations.h
new file mode 100644
index 00000000..f68d8066
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recr_activations.h
@@ -0,0 +1,56 @@
+#ifndef NNET_RECR_ACTIVATION_H_
+#define NNET_RECR_ACTIVATION_H_
+
+#include "hls_stream.h"
+#include "nnet_activation.h"
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include <math.h>
+
+namespace nnet {
+
+namespace activation {
+
+template <class data_T, class res_T, typename CONFIG_T> class Activation {
+  public:
+    // *************************************************
+    //       Blank Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class relu : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       Relu Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::relu<data_T, res_T, CONFIG_T>(data, res);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class sigmoid : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       Sigmoid Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::sigmoid<data_T, res_T, CONFIG_T>(data, res);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class tanh : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       TanH Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::tanh<data_T, res_T, CONFIG_T>(data, res);
+    }
+};
+
+} // namespace activation
+
+} // namespace nnet
+
+#endif
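[Editor's aside — not part of the patch] The recurrent configs in the next file select these wrappers through template aliases. A hypothetical config using the standard Keras pairing (sigmoid for the gates, tanh for the candidate/output) instead of the relu defaults would look like the sketch below; the name my_lstm_config is invented, and real configs are emitted by hls4ml alongside the model.

    struct my_lstm_config : nnet::lstm_config {
        // Gate activations (i, f, o for LSTM; z, r for GRU)
        template <class x_T, class y_T, class config_T>
        using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;
        // Candidate-state / output activation
        template <class x_T, class y_T, class config_T>
        using activation = nnet::activation::tanh<x_T, y_T, config_T>;
    };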
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recurrent.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recurrent.h
new file mode 100644
index 00000000..6e868148
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recurrent.h
@@ -0,0 +1,571 @@
+#ifndef NNET_RECURSIVE_H_
+#define NNET_RECURSIVE_H_
+
+#include "hls_stream.h"
+#include "nnet_activation.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recr_activations.h"
+
+namespace nnet {
+
+struct lstm_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 2;
+    static const unsigned n_parts = 20;
+    static const unsigned n_out = 2;
+    static const unsigned n_state = 2;
+    static const unsigned n_4state = 8;
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+    static const bool store_weights_in_bram = false;
+    static const bool use_static = true;
+
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+// Long Short-Term Memory NN (LSTM)
+// Resources:
+// https://github.com/nicodjimenez/lstm/blob/master/lstm.py
+// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb
+// https://en.wikipedia.org/wiki/Long_short-term_memory
+// Notes:
+//  - LSTM naming conventions adopted from the above links
+//      - s_newstate = activation(U*input + W*state)
+//      - h_output = activation(U*input + W*state)*activation(s_newstate)
+//  - If softmax is needed on the output, perform it *outside* this operation
+// Originally there was a version that allowed the state in each layer to be saved; that was moved above
+// (it requires a LARGE dense network at the end)
+template <class data_T, class res_T, typename CONFIG_T>
+void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+          res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in],
+          typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state],
+          typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4],
+          typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) {
+    // Initialize the state variable -- will maintain state between function calls
+
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3];   // activated i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state];         // activated c-matrix (keras notation)
+    typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state];       // c-matrix (keras notation)
+    typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state];
+
+    #pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=s_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_c complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_c complete
+    #pragma HLS ARRAY_PARTITION variable=s_actstate complete
+
+    nnet::dense<data_T, typename CONFIG_T::accum_t, typename CONFIG_T::mult_config1>(data, tmpres, param, param_b);
+    nnet::dense<res_T, typename CONFIG_T::accum_t, typename CONFIG_T::mult_config2>(h_newstate, tmpres_state, param_r, param_br);
+
+    for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc;
+        if (iacc > 2 * CONFIG_T::n_state - 1)
+            index = iacc + CONFIG_T::n_state;
+        inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index];
+    }
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_c[iacc] = tmpres[index] + tmpres_state[index];
+    }
+
+    CONFIG_T::template activation_recr<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
+                                       typename CONFIG_T::ACT_CONFIG_LSTM>::activation(inputacc_ifo, tmpres_ifo);
+
+    // Now for the confusion matrix
+    CONFIG_T::template activation<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
+                                  typename CONFIG_T::ACT_CONFIG_T>::activation(inputacc_c, tmpres_c);
+
+    // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues)
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)];
+    }
+    // Operation: h=act(s)*o
+    CONFIG_T::template activation<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
+                                  typename CONFIG_T::ACT_CONFIG_T>::activation(s_newstate, s_actstate);
+
+    for (int iacc = 0; iacc < CONFIG_T::n_state; 
iacc++) { + #pragma HLS UNROLL + h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + } +} + +template +void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + static res_T h_state[CONFIG_T::n_state]; + static res_T s_state[CONFIG_T::n_state]; + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=s_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + #pragma HLS ARRAY_PARTITION variable=tmpres_c complete + #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + #pragma HLS ARRAY_PARTITION variable=inputacc_c complete + #pragma HLS ARRAY_PARTITION variable=s_actstate complete + + if (reset_state) { + for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { + #pragma HLS UNROLL + s_state[i_state] = 0; + h_state[i_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state, param_r, + param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + #pragma HLS UNROLL + h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T 
res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +// Struct for the GRU template + +struct gru_config { + // Internal data type 
+
+// Struct for the GRU template
+
+struct gru_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 2;
+    static const unsigned n_out = 2;
+    static const unsigned n_state = 2;
+    static const unsigned n_sequence = 2;
+    static const unsigned n_4state = 8;
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const bool use_static = true;
+    static const unsigned n_zeros = 0;
+
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+         typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param
+                                                                                    // weights - refer page in copy!!
+         typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+         typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+         typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+    // Initialize the state variable -- will maintain state between function calls
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2];   // activated z,r gate vectors (keras notation)
+    typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state];        // activated candidate (h~) vector (keras notation)
+    typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r gate pre-activations (keras notation)
+    typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state];      // candidate pre-activation (keras notation)
+
+    #pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_h complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_h complete
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_newstate, tmpres_state_zr, param_zr,
+                param_br);
+
+    // Add the individual vectors from the multiplications tmpres = Wx*x(t) and
+    // tmpres_state_zr = Wh*h(t-1); tmpres is initialized with the biases
+    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc;
+        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
+    }
+
+    // Activation function Sub layer -- START
+    CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr);
+
+    // Activation function Sub layer -- END
+
+    // Hadamard product of r(t) = tmpres_zr[n_state:2*n_state] and the h(t-1)
+    // contribution tmpres_state_zr[2*n_state:3*n_state]
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+    }
+
+    // Assuming reset_after is false
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
+    }
+
+    // Apply the activation to the candidate state
+    CONFIG_T::template activation::activation(inputacc_h, tmpres_h);
+
+    // Mix the candidate state with the previous state
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]);
+    }
+}
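The loop that closes gru() above is the standard GRU blend h_new = (1 - z)*h_cand + z*h_prev, with the z gate in tmpres_zr[0:n_state] and the candidate h~ in tmpres_h. A plain-C++ reference of just that element-wise update (float types and names are illustrative, not part of the generated firmware):

#include <cstddef>

// Sketch only: per-element GRU state blend matching the final loop of nnet::gru.
void gru_blend_ref(const float *z, const float *h_cand, const float *h_prev,
                   float *h_new, std::size_t n_state) {
    for (std::size_t i = 0; i < n_state; i++) {
        h_new[i] = h_cand[i] * (1.0f - z[i]) + h_prev[i] * z[i];
    }
}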
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+                typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
+                typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+                typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+                typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+    // Initialize the state variable -- will maintain state between function calls
+
+    static res_T h_state[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2];   // activated z,r gate vectors (keras notation)
+    typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state];        // activated candidate (h~) vector (keras notation)
+    typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r gate pre-activations (keras notation)
+    typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state];      // candidate pre-activation (keras notation)
+
+    #pragma HLS ARRAY_PARTITION variable=h_state complete
+    #pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_h complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_h complete
+
+    if (reset_state) {
+        for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) {
+            #pragma HLS UNROLL
+            h_state[i_h_state] = 0;
+        }
+    }
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_state, tmpres_state_zr, param_zr,
+                param_br);
+
+    // Add the individual vectors from the multiplications tmpres = Wx*x(t) and
+    // tmpres_state_zr = Wh*h(t-1); tmpres is initialized with the biases
+    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc;
+        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
+    }
+
+    // Activation function Sub layer -- START
+    CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr);
+
+    // Activation function Sub layer -- END
+
+    // Hadamard product of r(t) = tmpres_zr[n_state:2*n_state] and the h(t-1)
+    // contribution tmpres_state_zr[2*n_state:3*n_state]
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+    }
+
+    // Assuming reset_after is false
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
+    }
+
+    // Apply the activation to the candidate state
+    CONFIG_T::template activation::activation(inputacc_h, tmpres_h);
+
+    // Mix the candidate state with the previous state
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]);
+        h_newstate[iacc] = h_state[iacc];
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru_stack(data_T data[CONFIG_T::n_sequence * 
CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + res_T h_state[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=data_in complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_state[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_state[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_state[i]; + } +} + +template +void gru_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::gru_static( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_newstate, + param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv1d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 00000000..254fc506 --- /dev/null +++ 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,119 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv1d_stream.h" +#include "nnet_sepconv_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } +} + +template +void depthwise_conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void depthwise_conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } +} + +template +void 
separable_conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv2d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 00000000..d56ed6d9 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,143 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv2d_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_types.h" + +namespace nnet { + +template +void depthwise_conv_2d_encoded_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + hls::stream data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } + } +} + +// Line Buffer Implementation (Phil's) +template +void depthwise_conv_2d_buffer_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 
0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1] + [CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void depthwise_conv_2d_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_2d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + +template +void separable_conv_2d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv_stream.h new file mode 100644 index 
00000000..9c16de19
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv_stream.h
@@ -0,0 +1,306 @@
+#ifndef NNET_SEPARABLE_CONV_STREAM_H_
+#define NNET_SEPARABLE_CONV_STREAM_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_conv_stream.h"
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan],
+                       typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                       typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE
+
+    typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan];
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_chan];
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=weights
+
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    #pragma HLS ARRAY_PARTITION variable=mult complete
+
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit
+
+// Do the matrix-multiply
+Product:
+    for (int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) {
+        #pragma HLS UNROLL
+        mult[ii] = CONFIG_T::mult_config::template product::product(
+            data[ii], weights[ii]);
+    }
+
+// Initialize accumulator with input biases
+ResetAccum:
+    for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+
+// Accumulate multiplication result
+Accum1:
+    for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) {
+    Accum2:
+        for (int jj = 0; jj < CONFIG_T::n_chan; jj++) {
+            int index = ii * CONFIG_T::n_chan + jj;
+            acc[jj] += mult[index];
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_chan; ires++) {
+        #pragma HLS UNROLL
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
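The Product/ResetAccum/Accum loops above compute one independent dot product per channel: output channel c accumulates kernel_size products data[k*n_chan + c] * weights[k*n_chan + c] on top of its bias, which is what makes the convolution depthwise. A plain-C++ reference of the same arithmetic (float types and names are illustrative, not part of the generated firmware):

// Sketch only: channel-wise multiply-accumulate equivalent to depthwise_product.
void depthwise_product_ref(const float *data, const float *weights, const float *biases,
                           float *res, int kernel_size, int n_chan) {
    for (int c = 0; c < n_chan; c++) {
        float acc = biases[c]; // each accumulator starts at the channel bias
        for (int k = 0; k < kernel_size; k++) {
            acc += data[k * n_chan + c] * weights[k * n_chan + c];
        }
        res[c] = acc;
    }
}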
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_mult_buffer(hls::stream<typename data_T::value_type> data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                           res_T &res_pack, hls::stream<res_T> &res_stream, unsigned &outputs_ready,
+                           typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                           typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE
+
+    typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=data complete
+    typename res_T::value_type res[CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=res complete
+
+InitData:
+    for (int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) {
+        #pragma HLS UNROLL
+        data[id] = data_window[id].read();
+    }
+
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) {
+        depthwise_product<typename data_T::value_type, typename res_T::value_type, CONFIG_T>(data, res, weights, biases);
+    } else {
+        assert("Resource strategy for DepthwiseConv2D is not supported." && false);
+    }
+
+CastLoop:
+    for (unsigned jj = 0; jj < CONFIG_T::n_chan; jj++) {
+        #pragma HLS UNROLL
+        if (res_T::size / CONFIG_T::n_chan == 1) {
+            res_pack[jj] = res[jj];
+        } else {
+            res_pack[outputs_ready * CONFIG_T::n_chan + jj] = res[jj];
+        }
+    }
+
+    if (res_T::size / CONFIG_T::n_chan == 1) {
+        res_stream.write(res_pack);
+    } else {
+        if (outputs_ready == (res_T::size / CONFIG_T::n_chan) - 1) {
+            res_stream.write(res_pack);
+            outputs_ready = 0;
+        } else {
+            outputs_ready++;
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void compute_depthwise_output_encoded(
+    const data_T &in_elem, hls::stream<typename data_T::value_type> data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+    hls::stream<res_T> &res, res_T &res_pack, unsigned &outputs_ready,
+    typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_chan], ap_uint<CONFIG_T::kernel_size> *pixel_idx) {
+    #pragma HLS INLINE
+
+MultLoop:
+    for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    CopyDataFilt:
+        for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) {
+            #pragma HLS UNROLL
+        CopyDataChan:
+            for (unsigned c = 0; c < CONFIG_T::n_chan; c++) {
+                #pragma HLS UNROLL
+                if (pixel_idx[p][f])
+                    data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]);
+            }
+        }
+        if (pixel_idx[p][CONFIG_T::kernel_size - 1]) {
+            depthwise_mult_buffer<data_T, res_T, CONFIG_T>(data_window, res_pack, res, outputs_ready, weights, biases);
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_mult_buffer(const data_T &data_pack, hls::stream<res_T> &res_stream,
+                           typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                           typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    #pragma HLS INLINE
+
+    typename data_T::value_type data[CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=data complete
+
+    typename res_T::value_type res[CONFIG_T::n_filt];
+    #pragma HLS ARRAY_PARTITION variable=res complete
+
+    res_T res_pack;
+    PRAGMA_DATA_PACK(res_pack)
+
+InitData:
+    for (int id = 0; id < CONFIG_T::n_chan; id++) {
+        #pragma HLS UNROLL
+        data[id] = data_pack[id];
+    }
+
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) {
+        dense_latency<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(
+            data, res, weights, biases);
+    } else {
+        dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(
+            data, res, weights, biases);
+    }
+
+CastLoop:
+    for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) {
+        #pragma HLS UNROLL
+        res_pack[jj] = res[jj];
+    }
+
+    res_stream.write(res_pack);
+}
+
+// Line Buffer Implementation (Phil's)
+template <class data_T, class res_T, typename CONFIG_T>
+void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream<res_T> &res_stream,
+                                        typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                                        typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE
+
+    // Thresholds
+    const static int lShiftX = CONFIG_T::filt_width - 1;
+
+    // Counters
+    static int pX = 0;
+    static int sX = 0;
+
+    static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=kernel_data complete
+
+    typename res_T::value_type res_out[CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0
+
+    res_T res_pack;
+    PRAGMA_DATA_PACK(res_pack)
+
+    // Add pixel to buffer
+    nnet::kernel_shift_1d<data_T, CONFIG_T>(in_elem, kernel_data);
+
+    // Check to see if we have a full kernel
+    if ((sX - lShiftX) == 0 && pX > lShiftX - 1) {
+        // Dense multiply
+        #pragma HLS INLINE recursive
+        if (CONFIG_T::strategy == nnet::latency) {
+            depthwise_product<typename data_T::value_type, typename res_T::value_type, CONFIG_T>(kernel_data, res_out,
+                                                                                                 weights, biases);
+        } else {
+            assert("Resource strategy for 
DepthwiseConv1D is not supported." && false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void compute_depthwise_output_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // counters + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + + static int sX = 0; // stride X + static int sY = 0; // stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + // Dense multiply + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_product(kernel_data, res_out, + weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." && false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_stream.h new file mode 100644 index 00000000..900db16c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_stream.h @@ -0,0 +1,207 @@ + +#ifndef NNET_STREAM_H +#define NNET_STREAM_H + +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; +}; + +template +void clone_stream(hls::stream &data, hls::stream &res1, hls::stream &res2) { +CloneLoop: + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + } +} + +template +void clone_stream(hls::stream &data, hls::stream &res1, hls::stream &res2, hls::stream &res3) { +CloneLoop: + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + res_T out_data3; + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + PRAGMA_DATA_PACK(out_data3) + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + res3.write(out_data3); + } +} + +template void repack_stream(hls::stream &data, hls::stream &res) { + if (data_T::size == res_T::size) { + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + for (int i = 0; i < N / data_T::size; i++) { + if (N / data_T::size > 1) { + #pragma HLS PIPELINE + } + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + for (int j = 0; j < pack_diff; j++) { + #pragma HLS PIPELINE + + res_T out_data; + for (int k = 0; k < res_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +template +void broadcast_stream_1x1xC(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == 
CONFIG_T::out_chan); + int n_dupl = (CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::out_chan) / + (CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < n_dupl; j++) { + #pragma HLS PIPELINE + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data[k]; + } + res.write(out_data); + } + } +} + +template +void broadcast_stream_HxWx1(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data[0]; + } + res.write(out_data); + } +} + +template +void broadcast_stream(hls::stream &data, hls::stream &res) { + if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { + broadcast_stream_1x1xC(data, res); + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width) { + broadcast_stream_HxWx1(data, res); + } +} + +template +void transpose_2d(hls::stream &data, hls::stream &res) { + typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + #pragma HLS ARRAY_PARTITION variable=data_array complete + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { + #pragma HLS PIPELINE + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int j = 0; j < res_T::size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + } + res.write(out_data); + } +} +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_types.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_types.h new file mode 100644 index 00000000..0fcac134 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_types.h @@ -0,0 +1,64 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include + +namespace nnet { + +// Fixed-size array +template struct array { + typedef T value_type; + static const unsigned size = N; + + T data[N]; + + T &operator[](size_t pos) { return data[pos]; } + + const T &operator[](size_t pos) const { return data[pos]; } + + array &operator=(const array &other) { + if (&other == this) + return *this; + + assert(N == other.size && "Array sizes must match."); + + for (unsigned i = 0; i < N; i++) { + #pragma HLS UNROLL + data[i] = other[i]; + } + return *this; + } +}; + +// Generic lookup-table implementation, for use in approximations of math functions +template class lookup_table { + public: + lookup_table(T from, T to) : range_start(from), range_end(to), base_div(ap_uint<16>(N) / T(to - 
from)) { + T step = (range_end - range_start) / ap_uint<16>(N); + for (size_t i = 0; i < N; i++) { + T num = range_start + ap_uint<16>(i) * step; + T sample = func(num); + samples[i] = sample; + } + } + + T operator()(T n) const { + int index = (n - range_start) * base_div; + if (index < 0) + index = 0; + else if (index > N - 1) + index = N - 1; + return samples[index]; + } + + private: + T samples[N]; + const T range_start, range_end; + ap_fixed<20, 16> base_div; +}; + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/parameters.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/parameters.h new file mode 100644 index 00000000..9d4d11a0 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/parameters.h @@ -0,0 +1,247 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include "ap_fixed.h" +#include "ap_int.h" + +#include "nnet_utils/nnet_code_gen.h" +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes +#include "nnet_utils/nnet_activation.h" +#include "nnet_utils/nnet_activation_stream.h" +#include "nnet_utils/nnet_conv1d.h" +#include "nnet_utils/nnet_embed.h" +#include "nnet_utils/nnet_embed_stream.h" +#include "nnet_utils/nnet_merge.h" +#include "nnet_utils/nnet_merge_stream.h" +#include "nnet_utils/nnet_pooling.h" +#include "nnet_utils/nnet_pooling_stream.h" +#include "nnet_utils/nnet_sepconv1d_stream.h" + +// hls-fpga-machine-learning insert weights +#include "weights/e3.h" +#include "weights/e4.h" +#include "weights/w22.h" +#include "weights/b22.h" +#include "weights/w23.h" +#include "weights/b23.h" +#include "weights/w24.h" +#include "weights/b24.h" + +// hls-fpga-machine-learning insert layer-config +// embedding0 +struct config3 : nnet::embed_config { + static const unsigned n_in = 100; + static const unsigned n_out = 2; + static const unsigned vocab_size = 6; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef embedding0_embeddings_t embeddings_t; +}; + +// embedding1 +struct config4 : nnet::embed_config { + static const unsigned n_in = 100; + static const unsigned n_out = 2; + static const unsigned vocab_size = 4; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef embedding1_embeddings_t embeddings_t; +}; + +// concatenate +struct config6 : nnet::concat_config { + static const unsigned n_elem1_0 = 100; + static const unsigned n_elem1_1 = 2; + static const unsigned n_elem1_2 = 0; + static const unsigned n_elem2_0 = 100; + static const unsigned n_elem2_1 = 2; + static const unsigned n_elem2_2 = 0; + + static const int axis = -1; +}; + +// concatenate_1 +struct config7 : nnet::concat_config { + static const unsigned n_elem1_0 = 100; + static const unsigned n_elem1_1 = 4; + static const unsigned n_elem1_2 = 0; + static const unsigned n_elem2_0 = 100; + static const unsigned n_elem2_1 = 4; + static const unsigned n_elem2_2 = 0; + + static const int axis = -1; +}; + +// dense +struct config22_mult : nnet::dense_config { + static const unsigned n_in = 8; + static const unsigned n_out = 12; + static const unsigned reuse_factor = 1; + static const unsigned strategy = nnet::latency; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef model_default_t accum_t; + typedef dense_bias_t bias_t; + typedef 
dense_weight_t weight_t; + template + using product = nnet::product::mult; +}; + +struct config22 : nnet::conv1d_config { + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 100; + static const unsigned n_chan = 8; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 12; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 100; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::latency; + static const nnet::conv_implementation implementation = nnet::conv_implementation::linebuffer; + static const unsigned min_width = 100; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = 100; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::fill_buffer_22; + typedef model_default_t accum_t; + typedef dense_bias_t bias_t; + typedef dense_weight_t weight_t; + typedef config22_mult mult_config; + template + using scale_index = nnet::scale_index_unscaled; +}; +const ap_uint config22::pixels[] = {0}; + +// activation +struct tanh_config11 : nnet::activ_config { + static const unsigned n_in = 1200; + static const unsigned table_size = 1024; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef activation_table_t table_t; +}; + +// dense_1 +struct config23_mult : nnet::dense_config { + static const unsigned n_in = 12; + static const unsigned n_out = 36; + static const unsigned reuse_factor = 1; + static const unsigned strategy = nnet::latency; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef model_default_t accum_t; + typedef dense_1_bias_t bias_t; + typedef dense_1_weight_t weight_t; + template + using product = nnet::product::mult; +}; + +struct config23 : nnet::conv1d_config { + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 100; + static const unsigned n_chan = 12; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 36; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 100; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::latency; + static const nnet::conv_implementation implementation = nnet::conv_implementation::linebuffer; + static const unsigned min_width = 100; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = 100; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::fill_buffer_23; + typedef model_default_t accum_t; + typedef dense_1_bias_t bias_t; + typedef dense_1_weight_t weight_t; + typedef config23_mult mult_config; + template + using scale_index = nnet::scale_index_unscaled; +}; +const ap_uint config23::pixels[] = {0}; + +// activation_1 +struct tanh_config15 : nnet::activ_config { + static const unsigned n_in = 3600; + static const unsigned table_size = 1024; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef 
activation_1_table_t table_t; +}; + +// met_weight +struct config24_mult : nnet::dense_config { + static const unsigned n_in = 36; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + static const unsigned strategy = nnet::latency; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef model_default_t accum_t; + typedef met_weight_bias_t bias_t; + typedef met_weight_weight_t weight_t; + template + using product = nnet::product::mult; +}; + +struct config24 : nnet::conv1d_config { + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 100; + static const unsigned n_chan = 36; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 100; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::latency; + static const nnet::conv_implementation implementation = nnet::conv_implementation::linebuffer; + static const unsigned min_width = 100; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = 100; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::fill_buffer_24; + typedef model_default_t accum_t; + typedef met_weight_bias_t bias_t; + typedef met_weight_weight_t weight_t; + typedef config24_mult mult_config; + template + using scale_index = nnet::scale_index_unscaled; +}; +const ap_uint config24::pixels[] = {0}; + +// multiply +struct config20 : nnet::merge_config { + static const unsigned n_elem = N_OUTPUTS_24*N_FILT_24; +}; + +// output +struct config21 : nnet::pooling1d_config { + static const unsigned n_in = 100; + static const unsigned n_filt = 2; + static const nnet::Pool_Op pool_op = nnet::Average; + static const unsigned reuse_factor = 1; + typedef model_default_t accum_t; +}; + + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.h new file mode 100644 index 00000000..e9c30326 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.h @@ -0,0 +1,15 @@ +//Numpy array shape [12] +//Min -0.455119371414 +//Max 0.398226708174 +//Number of zeros 0 + +#ifndef B22_H_ +#define B22_H_ + +#ifndef __SYNTHESIS__ +dense_bias_t b22[12]; +#else +dense_bias_t b22[12] = {-0.227416396141052, -0.321803480386734, -0.105886071920395, 0.004980653524399, -1.102990508079529, 1.840189456939697, -0.065355993807316, -0.420345693826675, -0.125013768672943, -0.633407652378082, 0.452038317918777, -0.057287767529488}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.txt new file mode 100644 index 00000000..c6c56a2f --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.txt @@ -0,0 +1 @@ +-0.227416396141052, -0.321803480386734, -0.105886071920395, 0.004980653524399, -1.102990508079529, 1.840189456939697, -0.065355993807316, -0.420345693826675, 
-0.125013768672943, -0.633407652378082, 0.452038317918777, -0.057287767529488 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.h new file mode 100644 index 00000000..2665bfe2 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.h @@ -0,0 +1,15 @@ +//Numpy array shape [36] +//Min -0.522930324078 +//Max 0.388318747282 +//Number of zeros 0 + +#ifndef B23_H_ +#define B23_H_ + +#ifndef __SYNTHESIS__ +dense_1_bias_t b23[36]; +#else +dense_1_bias_t b23[36] = {-28.527759552001953, -6.611515045166016, -14.351591110229492, -3.294915914535522, 14.957226753234863, -5.450253486633301, -5.768840312957764, 1.048536539077759, -1.573255777359009, -4.288578033447266, -2.320878744125366, 2.320586442947388, -2.193000793457031, 14.887507438659668, 2.135548591613770, -6.345302581787109, 1.965700864791870, -6.714401245117188, -1.507563710212708, -7.482578754425049, -5.760603904724121, -8.901734352111816, 4.178072929382324, -7.702874183654785, -5.517005920410156, 2.493387222290039, -5.700569152832031, 3.564873695373535, 1.121586322784424, 8.881909370422363, 6.257650375366211, -0.310464382171631, 1.509941101074219, 5.575150012969971, -4.270040988922119, 4.464414119720459}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.txt new file mode 100644 index 00000000..e14f7cf8 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.txt @@ -0,0 +1 @@ +-28.527759552001953, -6.611515045166016, -14.351591110229492, -3.294915914535522, 14.957226753234863, -5.450253486633301, -5.768840312957764, 1.048536539077759, -1.573255777359009, -4.288578033447266, -2.320878744125366, 2.320586442947388, -2.193000793457031, 14.887507438659668, 2.135548591613770, -6.345302581787109, 1.965700864791870, -6.714401245117188, -1.507563710212708, -7.482578754425049, -5.760603904724121, -8.901734352111816, 4.178072929382324, -7.702874183654785, -5.517005920410156, 2.493387222290039, -5.700569152832031, 3.564873695373535, 1.121586322784424, 8.881909370422363, 6.257650375366211, -0.310464382171631, 1.509941101074219, 5.575150012969971, -4.270040988922119, 4.464414119720459 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.h new file mode 100644 index 00000000..9daede16 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.h @@ -0,0 +1,15 @@ +//Numpy array shape [1] +//Min 3.417605638504 +//Max 3.417605638504 +//Number of zeros 0 + +#ifndef B24_H_ +#define B24_H_ + +#ifndef __SYNTHESIS__ +met_weight_bias_t b24[1]; +#else +met_weight_bias_t b24[1] = {2.417605638504028}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.txt new file mode 100644 index 00000000..42659b3e --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.txt @@ -0,0 +1 @@ +2.417605638504028 \ No newline at end of 
file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.h new file mode 100644 index 00000000..34773dd1 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.h @@ -0,0 +1,15 @@ +//Numpy array shape [6, 2] +//Min -2.672395467758 +//Max 2.548557043076 +//Number of zeros 0 + +#ifndef E3_H_ +#define E3_H_ + +#ifndef __SYNTHESIS__ +embedding0_embeddings_t e3[12]; +#else +embedding0_embeddings_t e3[12] = {1.620906114578247, -0.427226632833481, -2.672395467758179, -0.035970680415630, 2.548557043075562, 0.323681503534317, 1.538867950439453, 1.997532844543457, -0.704283535480499, 0.116950742900372, -0.906534552574158, 0.974053442478180}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.txt new file mode 100644 index 00000000..3c0038cb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.txt @@ -0,0 +1 @@ +1.620906114578247, -0.427226632833481, -2.672395467758179, -0.035970680415630, 2.548557043075562, 0.323681503534317, 1.538867950439453, 1.997532844543457, -0.704283535480499, 0.116950742900372, -0.906534552574158, 0.974053442478180 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.h new file mode 100644 index 00000000..5835c2a4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.h @@ -0,0 +1,15 @@ +//Numpy array shape [4, 2] +//Min -1.666811108589 +//Max 1.295734167099 +//Number of zeros 0 + +#ifndef E4_H_ +#define E4_H_ + +#ifndef __SYNTHESIS__ +embedding1_embeddings_t e4[8]; +#else +embedding1_embeddings_t e4[8] = {1.295734167098999, 0.254000633955002, -1.661195635795593, 0.048672962933779, -0.138032227754593, -0.875923097133636, -1.666811108589172, 0.035932607948780}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.txt new file mode 100644 index 00000000..d6d8ec98 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.txt @@ -0,0 +1 @@ +1.295734167098999, 0.254000633955002, -1.661195635795593, 0.048672962933779, -0.138032227754593, -0.875923097133636, -1.666811108589172, 0.035932607948780 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.h new file mode 100644 index 00000000..2716062b --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.h @@ -0,0 +1,15 @@ +//Numpy array shape [8, 12] +//Min -3.111060142517 +//Max 1.904985547066 +//Number of zeros 0 + +#ifndef W22_H_ +#define W22_H_ + +#ifndef __SYNTHESIS__ +dense_weight_t w22[96]; +#else +dense_weight_t w22[96] = {-0.005496513564140, -0.077705480158329, -0.291069507598877, 0.003703390946612, 0.009928826242685, 0.002178243128583, 0.007691420149058, -0.122642949223518, -0.004901545587927, 
0.184459403157234, 0.077915966510773, -0.002935178112239, -0.630976676940918, -0.129218742251396, 0.172030463814735, -0.613496303558350, -0.006485627032816, -0.007314948830754, -0.013219951651990, 0.035634342581034, 0.011636621318758, -0.013739237561822, -0.052910525351763, 0.007741326466203, -0.003901405725628, 0.006636092904955, 0.014814227819443, 0.002152013825253, -0.000235362793319, -0.003903909819201, 0.000954345799983, 0.004304980859160, -0.000281526794424, 0.014572271145880, -0.007630184758455, 0.001944163814187, -0.545304834842682, 0.224154502153397, 1.193614006042480, 0.672130286693573, 0.079618625342846, -0.729031324386597, 1.117634415626526, 0.088703706860542, 0.263513863086700, 0.946384370326996, 0.078555844724178, 0.085146762430668, -0.199110791087151, -0.093624226748943, 0.001692321849987, 0.204557403922081, 0.073481321334839, 0.260788798332214, -0.122535862028599, -0.085562728345394, 0.025333112105727, -0.131282433867455, -0.406875669956207, -0.066440477967262, -0.042630787938833, 0.427074193954468, 1.956082224845886, 0.046955518424511, 0.030683849006891, 0.232642397284508, -0.598365366458893, -0.853525161743164, -0.292229890823364, -2.031559944152832, 0.012307391501963, 0.127083599567413, 0.060593571513891, -0.268928855657578, -0.487386792898178, -0.127690494060516, -0.012389726005495, 0.656857013702393, 0.665676295757294, -0.315022528171539, -0.161770179867744, 0.515646219253540, -0.374600380659103, -0.031053755432367, 0.023489914834499, -0.527695715427399, -0.117961816489697, -0.055053103715181, -0.132891759276390, -0.345012873411179, -0.197673514485359, 0.346816360950470, 0.160986021161079, -0.146570160984993, -0.089796856045723, -0.088734544813633}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.txt new file mode 100644 index 00000000..e518ed71 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.txt @@ -0,0 +1 @@ +-0.005496513564140, -0.077705480158329, -0.291069507598877, 0.003703390946612, 0.009928826242685, 0.002178243128583, 0.007691420149058, -0.122642949223518, -0.004901545587927, 0.184459403157234, 0.077915966510773, -0.002935178112239, -0.630976676940918, -0.129218742251396, 0.172030463814735, -0.613496303558350, -0.006485627032816, -0.007314948830754, -0.013219951651990, 0.035634342581034, 0.011636621318758, -0.013739237561822, -0.052910525351763, 0.007741326466203, -0.003901405725628, 0.006636092904955, 0.014814227819443, 0.002152013825253, -0.000235362793319, -0.003903909819201, 0.000954345799983, 0.004304980859160, -0.000281526794424, 0.014572271145880, -0.007630184758455, 0.001944163814187, -0.545304834842682, 0.224154502153397, 1.193614006042480, 0.672130286693573, 0.079618625342846, -0.729031324386597, 1.117634415626526, 0.088703706860542, 0.263513863086700, 0.946384370326996, 0.078555844724178, 0.085146762430668, -0.199110791087151, -0.093624226748943, 0.001692321849987, 0.204557403922081, 0.073481321334839, 0.260788798332214, -0.122535862028599, -0.085562728345394, 0.025333112105727, -0.131282433867455, -0.406875669956207, -0.066440477967262, -0.042630787938833, 0.427074193954468, 1.956082224845886, 0.046955518424511, 0.030683849006891, 0.232642397284508, -0.598365366458893, -0.853525161743164, -0.292229890823364, -2.031559944152832, 0.012307391501963, 0.127083599567413, 0.060593571513891, -0.268928855657578, -0.487386792898178, 
-0.127690494060516, -0.012389726005495, 0.656857013702393, 0.665676295757294, -0.315022528171539, -0.161770179867744, 0.515646219253540, -0.374600380659103, -0.031053755432367, 0.023489914834499, -0.527695715427399, -0.117961816489697, -0.055053103715181, -0.132891759276390, -0.345012873411179, -0.197673514485359, 0.346816360950470, 0.160986021161079, -0.146570160984993, -0.089796856045723, -0.088734544813633 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.h new file mode 100644 index 00000000..ed36b365 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.h @@ -0,0 +1,15 @@ +//Numpy array shape [12, 36] +//Min -1.362776517868 +//Max 1.903477072716 +//Number of zeros 0 + +#ifndef W23_H_ +#define W23_H_ + +#ifndef __SYNTHESIS__ +dense_1_weight_t w23[432]; +#else +dense_1_weight_t w23[432] = {-34.272384643554688, -15.242276191711426, -12.153550148010254, -0.517302453517914, 16.459009170532227, -0.927374601364136, -16.420297622680664, 2.281730651855469, -10.332298278808594, 1.368979692459106, -14.852247238159180, 2.289415359497070, 4.450272083282471, 15.091560363769531, 13.234894752502441, 5.019698619842529, 0.659490466117859, -10.297682762145996, -6.819073200225830, 0.061284162104130, -17.845451354980469, -12.342288970947266, 9.018982887268066, -7.944165229797363, -9.916581153869629, -9.689590454101562, 1.593392252922058, -0.548580884933472, 2.834589481353760, 13.109604835510254, -13.948617935180664, 4.110248565673828, -2.018397331237793, -6.860967636108398, 2.082887887954712, -1.957600474357605, -0.321475505828857, 0.820872783660889, 2.677054405212402, -1.768133521080017, -1.177917003631592, -1.118692636489868, 3.211776494979858, -3.037288188934326, 5.288896083831787, -0.581319391727448, 0.694460690021515, 0.631229758262634, 3.280242681503296, 6.218200206756592, -2.129676103591919, 8.349854469299316, 1.471119880676270, 7.490054130554199, 0.753753662109375, -0.399599075317383, 2.416818141937256, 1.339800357818604, -0.718787252902985, 0.338143020868301, 1.145107865333557, 5.754922389984131, -4.704513549804688, -1.252747058868408, 0.465840101242065, -3.112505197525024, 4.487011432647705, -0.808801710605621, 7.409661293029785, -8.000079154968262, 0.356041222810745, -1.234661579132080, -13.651395797729492, 4.551627635955811, 3.547161102294922, -1.346346378326416, -6.481750965118408, 0.371593445539474, -0.909239649772644, 0.803896009922028, -0.864329278469086, -0.167551159858704, 1.271770358085632, 0.128098145127296, -0.319244086742401, -8.963575363159180, -4.575497150421143, -4.347470760345459, 0.099872648715973, 1.076389431953430, 1.537157297134399, -0.342850208282471, -3.088666439056396, 1.880550146102905, -2.499561071395874, 0.960815191268921, 1.989226579666138, 5.396582126617432, 4.611053466796875, 1.478802204132080, -0.381258249282837, -1.447740316390991, -0.485423654317856, 1.209582686424255, -6.765387535095215, 0.879579961299896, 3.126605033874512, -1.396452188491821, 35.301498413085938, 16.390518188476562, 10.991186141967773, 0.457286953926086, -16.055135726928711, 0.732447206974030, 12.833724975585938, -0.869582533836365, 5.935638427734375, 2.171858549118042, 15.994698524475098, -1.975315093994141, -0.577428340911865, -16.300628662109375, -12.036094665527344, -12.248717308044434, -0.296559274196625, 9.253703117370605, 7.236478328704834, 0.100461378693581, 
15.662371635437012, 13.149472236633301, -9.011061668395996, 9.156368255615234, 9.083997726440430, 8.143834114074707, 6.395058631896973, 0.768283843994141, -2.189213037490845, -12.856546401977539, 10.946484565734863, -3.122458934783936, 2.356916427612305, 10.203166007995605, 3.314955234527588, 2.006448984146118, -4.138628959655762, 9.784881591796875, -7.701581478118896, -2.161497592926025, 5.081796169281006, 0.722472250461578, -6.947623729705811, 0.428102672100067, -1.017104268074036, -5.616028785705566, 7.207549571990967, -3.425596952438354, -0.324499905109406, -1.508072257041931, -0.423026353120804, -6.807011127471924, -2.165873289108276, -6.257976055145264, -1.110751748085022, -0.680330693721771, -8.726241111755371, 6.876333713531494, 0.122669994831085, -6.020811080932617, -1.936614274978638, 7.679961681365967, 6.832388401031494, 2.089343547821045, -5.815147399902344, 1.034743905067444, 6.128062248229980, 3.326957702636719, -6.113448143005371, 0.656117796897888, -0.316450953483582, 0.792564570903778, 10.708021163940430, 10.246310234069824, -4.989016532897949, -2.966490268707275, 4.010641574859619, 1.000328898429871, -35.920978546142578, -1.863970279693604, 1.379239320755005, -0.364904999732971, 3.232958555221558, -0.646893203258514, 4.449232578277588, -6.601441383361816, 7.810013294219971, 0.764219939708710, -0.887412309646606, 4.851296424865723, -3.773882389068604, 0.953490376472473, -28.108135223388672, 7.164631843566895, 5.078193187713623, -4.744826793670654, -7.120871067047119, -7.749808311462402, 10.820018768310547, 0.171118795871735, -1.084927797317505, 1.892885923385620, -28.651664733886719, 10.953318595886230, -5.435957431793213, -21.623348236083984, 2.465915918350220, -8.539632797241211, -7.903433799743652, -3.474239349365234, 0.243321105837822, -0.380062937736511, 5.332633972167969, 2.151208877563477, 2.022930383682251, -1.463849902153015, -2.719141244888306, -1.985015749931335, 1.754704952239990, -3.623456001281738, 4.132822036743164, -2.998028755187988, 4.864254474639893, 7.019001007080078, -2.887226343154907, -2.157429456710815, -13.182174682617188, -0.038866952061653, -5.827670574188232, 5.151016235351562, 2.542974710464478, -12.306578636169434, -1.044925689697266, 11.257448196411133, -1.981187462806702, -1.172790408134460, -1.593691825866699, 5.988854408264160, 11.212390899658203, 6.184563636779785, 1.851197481155396, -7.376731395721436, 2.947922706604004, -3.116251468658447, 9.032855033874512, 11.189463615417480, -14.047230720520020, -1.882185339927673, 13.061312675476074, -3.894136667251587, -18.382831573486328, 5.108212947845459, 1.680236458778381, 5.855550289154053, 1.753978013992310, 8.817825317382812, 4.784208774566650, -9.456546783447266, 6.749723434448242, -3.826550960540771, 8.439210891723633, 2.381058931350708, -6.254682064056396, 0.979307055473328, -12.932164192199707, 6.614181041717529, 7.724326133728027, -8.186627388000488, -11.564584732055664, -5.705511093139648, 0.615724623203278, 2.294805049896240, 8.561786651611328, 10.862165451049805, -11.637836456298828, 8.550187110900879, -2.799665927886963, -4.847795963287354, 2.903936386108398, -6.381844997406006, 6.450922012329102, 26.763093948364258, 3.013844728469849, -0.924964666366577, -6.920816898345947, -2.560798168182373, -34.196998596191406, 1.623008966445923, 10.048088073730469, 0.985973894596100, 23.329315185546875, 1.722676992416382, -0.090961724519730, -5.953221797943115, 0.280752390623093, -14.521141052246094, 1.948345661163330, 6.979897975921631, 4.035674571990967, 1.044640779495239, 
-19.007211685180664, 21.142364501953125, -1.837882161140442, 2.050447940826416, 1.542031645774841, -7.565482616424561, 26.773376464843750, 0.171006053686142, 3.358534336090088, -8.234274864196777, -27.216566085815430, 7.212102413177490, -5.000186920166016, -20.917554855346680, 2.152885198593140, -3.181938886642456, 10.018072128295898, 6.692709445953369, 7.562778472900391, -0.397445559501648, -11.695134162902832, 1.699540257453918, 1.424039125442505, -1.628181338310242, 4.050493717193604, 0.047106776386499, -0.717159509658813, -3.311089277267456, -2.847960948944092, -9.831811904907227, -7.529915332794189, -2.769558668136597, -2.932808637619019, 6.914423465728760, 13.812906265258789, 0.466079294681549, -0.697627902030945, 0.283607631921768, -7.250504493713379, 13.122053146362305, 4.825413227081299, 2.828585863113403, 6.724539756774902, -0.596229493618011, -4.759947776794434, -9.946646690368652, -2.232836484909058, -3.401717901229858, -1.199927449226379, 3.097918748855591, 0.726092457771301, 1.352552175521851, -1.831664323806763, -6.564773082733154, 1.155098319053650, -2.088497400283813, -0.057716656476259, -0.293432414531708, -5.829917907714844, -2.137289047241211, 2.680857658386230, -3.795029640197754, 0.601609170436859, -2.534255266189575, 0.599966049194336, 9.936664581298828, -1.825383901596069, 2.551906108856201, -2.613932371139526, 5.252158164978027, -0.459596127271652, -1.080929756164551, -5.785776615142822, -4.251605510711670, 1.853045225143433, 1.728189826011658, -1.679710865020752, 3.655097484588623, -4.362958908081055, 1.981420755386353, -4.094293117523193, 0.941113770008087, -7.290192604064941, 2.577519655227661, 0.405787110328674, -5.861212253570557, -2.390504837036133, -4.859991073608398, 21.085351943969727, 2.001378059387207, -24.684366226196289, -4.457293987274170, 22.137004852294922, -1.187330603599548, -37.353851318359375, -1.755694746971130, 1.482097148895264, -1.574132204055786, 12.119773864746094, -4.834329605102539, 0.834708034992218, -21.962982177734375, 39.640460968017578, -9.078592300415039, -3.510553598403931, -16.044708251953125, -4.902245998382568, 0.224997147917747, -34.180931091308594, 7.509442806243896, 3.701504945755005, -11.197209358215332, -22.056798934936523, -27.012636184692383, 15.913613319396973, 4.364429473876953, -1.197503089904785, 8.571378707885742, -19.667821884155273, 22.258554458618164, -7.341328144073486, -17.936431884765625, -0.463554143905640, -5.726800918579102}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.txt new file mode 100644 index 00000000..d99bb2c7 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.txt @@ -0,0 +1 @@ +-34.272384643554688, -15.242276191711426, -12.153550148010254, -0.517302453517914, 16.459009170532227, -0.927374601364136, -16.420297622680664, 2.281730651855469, -10.332298278808594, 1.368979692459106, -14.852247238159180, 2.289415359497070, 4.450272083282471, 15.091560363769531, 13.234894752502441, 5.019698619842529, 0.659490466117859, -10.297682762145996, -6.819073200225830, 0.061284162104130, -17.845451354980469, -12.342288970947266, 9.018982887268066, -7.944165229797363, -9.916581153869629, -9.689590454101562, 1.593392252922058, -0.548580884933472, 2.834589481353760, 13.109604835510254, -13.948617935180664, 4.110248565673828, -2.018397331237793, -6.860967636108398, 2.082887887954712, 
-1.957600474357605, -0.321475505828857, 0.820872783660889, 2.677054405212402, -1.768133521080017, -1.177917003631592, -1.118692636489868, 3.211776494979858, -3.037288188934326, 5.288896083831787, -0.581319391727448, 0.694460690021515, 0.631229758262634, 3.280242681503296, 6.218200206756592, -2.129676103591919, 8.349854469299316, 1.471119880676270, 7.490054130554199, 0.753753662109375, -0.399599075317383, 2.416818141937256, 1.339800357818604, -0.718787252902985, 0.338143020868301, 1.145107865333557, 5.754922389984131, -4.704513549804688, -1.252747058868408, 0.465840101242065, -3.112505197525024, 4.487011432647705, -0.808801710605621, 7.409661293029785, -8.000079154968262, 0.356041222810745, -1.234661579132080, -13.651395797729492, 4.551627635955811, 3.547161102294922, -1.346346378326416, -6.481750965118408, 0.371593445539474, -0.909239649772644, 0.803896009922028, -0.864329278469086, -0.167551159858704, 1.271770358085632, 0.128098145127296, -0.319244086742401, -8.963575363159180, -4.575497150421143, -4.347470760345459, 0.099872648715973, 1.076389431953430, 1.537157297134399, -0.342850208282471, -3.088666439056396, 1.880550146102905, -2.499561071395874, 0.960815191268921, 1.989226579666138, 5.396582126617432, 4.611053466796875, 1.478802204132080, -0.381258249282837, -1.447740316390991, -0.485423654317856, 1.209582686424255, -6.765387535095215, 0.879579961299896, 3.126605033874512, -1.396452188491821, 35.301498413085938, 16.390518188476562, 10.991186141967773, 0.457286953926086, -16.055135726928711, 0.732447206974030, 12.833724975585938, -0.869582533836365, 5.935638427734375, 2.171858549118042, 15.994698524475098, -1.975315093994141, -0.577428340911865, -16.300628662109375, -12.036094665527344, -12.248717308044434, -0.296559274196625, 9.253703117370605, 7.236478328704834, 0.100461378693581, 15.662371635437012, 13.149472236633301, -9.011061668395996, 9.156368255615234, 9.083997726440430, 8.143834114074707, 6.395058631896973, 0.768283843994141, -2.189213037490845, -12.856546401977539, 10.946484565734863, -3.122458934783936, 2.356916427612305, 10.203166007995605, 3.314955234527588, 2.006448984146118, -4.138628959655762, 9.784881591796875, -7.701581478118896, -2.161497592926025, 5.081796169281006, 0.722472250461578, -6.947623729705811, 0.428102672100067, -1.017104268074036, -5.616028785705566, 7.207549571990967, -3.425596952438354, -0.324499905109406, -1.508072257041931, -0.423026353120804, -6.807011127471924, -2.165873289108276, -6.257976055145264, -1.110751748085022, -0.680330693721771, -8.726241111755371, 6.876333713531494, 0.122669994831085, -6.020811080932617, -1.936614274978638, 7.679961681365967, 6.832388401031494, 2.089343547821045, -5.815147399902344, 1.034743905067444, 6.128062248229980, 3.326957702636719, -6.113448143005371, 0.656117796897888, -0.316450953483582, 0.792564570903778, 10.708021163940430, 10.246310234069824, -4.989016532897949, -2.966490268707275, 4.010641574859619, 1.000328898429871, -35.920978546142578, -1.863970279693604, 1.379239320755005, -0.364904999732971, 3.232958555221558, -0.646893203258514, 4.449232578277588, -6.601441383361816, 7.810013294219971, 0.764219939708710, -0.887412309646606, 4.851296424865723, -3.773882389068604, 0.953490376472473, -28.108135223388672, 7.164631843566895, 5.078193187713623, -4.744826793670654, -7.120871067047119, -7.749808311462402, 10.820018768310547, 0.171118795871735, -1.084927797317505, 1.892885923385620, -28.651664733886719, 10.953318595886230, -5.435957431793213, -21.623348236083984, 2.465915918350220, -8.539632797241211, 
-7.903433799743652, -3.474239349365234, 0.243321105837822, -0.380062937736511, 5.332633972167969, 2.151208877563477, 2.022930383682251, -1.463849902153015, -2.719141244888306, -1.985015749931335, 1.754704952239990, -3.623456001281738, 4.132822036743164, -2.998028755187988, 4.864254474639893, 7.019001007080078, -2.887226343154907, -2.157429456710815, -13.182174682617188, -0.038866952061653, -5.827670574188232, 5.151016235351562, 2.542974710464478, -12.306578636169434, -1.044925689697266, 11.257448196411133, -1.981187462806702, -1.172790408134460, -1.593691825866699, 5.988854408264160, 11.212390899658203, 6.184563636779785, 1.851197481155396, -7.376731395721436, 2.947922706604004, -3.116251468658447, 9.032855033874512, 11.189463615417480, -14.047230720520020, -1.882185339927673, 13.061312675476074, -3.894136667251587, -18.382831573486328, 5.108212947845459, 1.680236458778381, 5.855550289154053, 1.753978013992310, 8.817825317382812, 4.784208774566650, -9.456546783447266, 6.749723434448242, -3.826550960540771, 8.439210891723633, 2.381058931350708, -6.254682064056396, 0.979307055473328, -12.932164192199707, 6.614181041717529, 7.724326133728027, -8.186627388000488, -11.564584732055664, -5.705511093139648, 0.615724623203278, 2.294805049896240, 8.561786651611328, 10.862165451049805, -11.637836456298828, 8.550187110900879, -2.799665927886963, -4.847795963287354, 2.903936386108398, -6.381844997406006, 6.450922012329102, 26.763093948364258, 3.013844728469849, -0.924964666366577, -6.920816898345947, -2.560798168182373, -34.196998596191406, 1.623008966445923, 10.048088073730469, 0.985973894596100, 23.329315185546875, 1.722676992416382, -0.090961724519730, -5.953221797943115, 0.280752390623093, -14.521141052246094, 1.948345661163330, 6.979897975921631, 4.035674571990967, 1.044640779495239, -19.007211685180664, 21.142364501953125, -1.837882161140442, 2.050447940826416, 1.542031645774841, -7.565482616424561, 26.773376464843750, 0.171006053686142, 3.358534336090088, -8.234274864196777, -27.216566085815430, 7.212102413177490, -5.000186920166016, -20.917554855346680, 2.152885198593140, -3.181938886642456, 10.018072128295898, 6.692709445953369, 7.562778472900391, -0.397445559501648, -11.695134162902832, 1.699540257453918, 1.424039125442505, -1.628181338310242, 4.050493717193604, 0.047106776386499, -0.717159509658813, -3.311089277267456, -2.847960948944092, -9.831811904907227, -7.529915332794189, -2.769558668136597, -2.932808637619019, 6.914423465728760, 13.812906265258789, 0.466079294681549, -0.697627902030945, 0.283607631921768, -7.250504493713379, 13.122053146362305, 4.825413227081299, 2.828585863113403, 6.724539756774902, -0.596229493618011, -4.759947776794434, -9.946646690368652, -2.232836484909058, -3.401717901229858, -1.199927449226379, 3.097918748855591, 0.726092457771301, 1.352552175521851, -1.831664323806763, -6.564773082733154, 1.155098319053650, -2.088497400283813, -0.057716656476259, -0.293432414531708, -5.829917907714844, -2.137289047241211, 2.680857658386230, -3.795029640197754, 0.601609170436859, -2.534255266189575, 0.599966049194336, 9.936664581298828, -1.825383901596069, 2.551906108856201, -2.613932371139526, 5.252158164978027, -0.459596127271652, -1.080929756164551, -5.785776615142822, -4.251605510711670, 1.853045225143433, 1.728189826011658, -1.679710865020752, 3.655097484588623, -4.362958908081055, 1.981420755386353, -4.094293117523193, 0.941113770008087, -7.290192604064941, 2.577519655227661, 0.405787110328674, -5.861212253570557, -2.390504837036133, -4.859991073608398, 
21.085351943969727, 2.001378059387207, -24.684366226196289, -4.457293987274170, 22.137004852294922, -1.187330603599548, -37.353851318359375, -1.755694746971130, 1.482097148895264, -1.574132204055786, 12.119773864746094, -4.834329605102539, 0.834708034992218, -21.962982177734375, 39.640460968017578, -9.078592300415039, -3.510553598403931, -16.044708251953125, -4.902245998382568, 0.224997147917747, -34.180931091308594, 7.509442806243896, 3.701504945755005, -11.197209358215332, -22.056798934936523, -27.012636184692383, 15.913613319396973, 4.364429473876953, -1.197503089904785, 8.571378707885742, -19.667821884155273, 22.258554458618164, -7.341328144073486, -17.936431884765625, -0.463554143905640, -5.726800918579102 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.h new file mode 100644 index 00000000..421ca42b --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.h @@ -0,0 +1,15 @@ +//Numpy array shape [36, 1] +//Min -16.967756271362 +//Max 12.259524345398 +//Number of zeros 0 + +#ifndef W24_H_ +#define W24_H_ + +#ifndef __SYNTHESIS__ +met_weight_weight_t w24[36]; +#else +met_weight_weight_t w24[36] = {-16.967756271362305, -3.760226726531982, 3.262881755828857, -9.485597610473633, -3.357334852218628, -15.149440765380859, 3.543870449066162, -2.800054788589478, 4.344166755676270, -2.786701679229736, 6.405607700347900, -3.039294004440308, -2.860914230346680, 2.979121685028076, -3.144270658493042, -3.578038454055786, -2.965110778808594, 3.106849431991577, 3.355989456176758, -2.746005535125732, 3.465666294097900, -3.180762529373169, -2.911018371582031, 3.576281547546387, 3.597542285919189, 3.606025695800781, -3.061075925827026, 12.259524345397949, -3.002163410186768, -3.301740884780884, 3.924034357070923, -3.431127548217773, 2.811718702316284, 2.879392385482788, -2.894979476928711, 3.125239610671997}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.txt new file mode 100644 index 00000000..5fab1cdb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.txt @@ -0,0 +1 @@ +-16.967756271362305, -3.760226726531982, 3.262881755828857, -9.485597610473633, -3.357334852218628, -15.149440765380859, 3.543870449066162, -2.800054788589478, 4.344166755676270, -2.786701679229736, 6.405607700347900, -3.039294004440308, -2.860914230346680, 2.979121685028076, -3.144270658493042, -3.578038454055786, -2.965110778808594, 3.106849431991577, 3.355989456176758, -2.746005535125732, 3.465666294097900, -3.180762529373169, -2.911018371582031, 3.576281547546387, 3.597542285919189, 3.606025695800781, -3.061075925827026, 12.259524345397949, -3.002163410186768, -3.301740884780884, 3.924034357070923, -3.431127548217773, 2.811718702316284, 2.879392385482788, -2.894979476928711, 3.125239610671997 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/hls4ml_config.yml b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/hls4ml_config.yml new file mode 100644 index 00000000..466ce955 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/hls4ml_config.yml @@ -0,0 +1,119 @@ +Backend: Vivado 
+ClockPeriod: 5 +HLSConfig: + LayerName: + activation: + Precision: + result: ap_fixed<32,16> + Trace: true + activation_1: + Precision: + result: ap_fixed<32,16> + Trace: true + batch_normalization: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + scale: ap_fixed<32,16> + Trace: true + batch_normalization_1: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + scale: ap_fixed<32,16> + Trace: true + concatenate: + Precision: + result: ap_fixed<32,16> + Trace: true + concatenate_1: + Precision: + result: ap_fixed<32,16> + Trace: true + dense: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + weight: ap_fixed<32,16> + Trace: true + dense_1: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + weight: ap_fixed<32,16> + Trace: true + dense_1_linear: + Precision: + result: ap_fixed<32,16> + Trace: true + dense_linear: + Precision: + result: ap_fixed<32,16> + Trace: true + embedding0: + Precision: + embeddings: ap_fixed<32,16> + result: ap_fixed<32,16> + Trace: true + embedding1: + Precision: + embeddings: ap_fixed<32,16> + result: ap_fixed<32,16> + Trace: true + input_cat0: + Precision: + result: ap_uint<4> + Trace: true + input_cat1: + Precision: + result: ap_uint<4> + Trace: true + input_cont: + Precision: + result: ap_fixed<32,16> + Trace: true + input_pxpy: + Precision: + result: ap_fixed<32,16> + Trace: true + met_weight: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + weight: ap_fixed<32,16> + Trace: true + met_weight_linear: + Precision: + result: ap_fixed<32,16> + Trace: true + met_weight_minus_one: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + scale: ap_fixed<32,16> + Trace: true + multiply: + Precision: + result: ap_fixed<32,16> + Trace: true + n_elem: 100 + output: + Precision: + result: ap_fixed<32,16> + Trace: true + n_filt: 2 + Model: + BramFactor: 1000000000 + Precision: ap_fixed<32,16> + ReuseFactor: 1 + Strategy: Latency + TraceOutput: false +IOType: io_parallel +InputData: null +KerasModel: !keras_model 'hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5' +OutputDir: hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16> +OutputPredictions: null +Part: xcvu13p-flga2577-2-e +ProjectName: L1METML_v1 +Stamp: 95715E3e +Version: 1.0.0 diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5 b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5 new file mode 100644 index 00000000..13a4d599 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5 differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model.h5 b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model.h5 new file mode 100644 index 00000000..13a4d599 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model.h5 differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model_hls4ml.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model_hls4ml.png new file mode 100644 index 00000000..f04b84a5 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model_hls4ml.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET.png new file mode 100644 index 
00000000..2545be2b Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_x.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_x.png new file mode 100644 index 00000000..633ee2cc Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_x.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_y.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_y.png new file mode 100644 index 00000000..beacc4dd Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_y.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation.png new file mode 100644 index 00000000..7886541a Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation_1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation_1.png new file mode 100644 index 00000000..5d5a9d33 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation_1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate.png new file mode 100644 index 00000000..0cf674a0 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate_1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate_1.png new file mode 100644 index 00000000..c22563c7 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate_1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense.png new file mode 100644 index 00000000..cda2f599 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense_1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense_1.png new file mode 100644 index 00000000..c10e1cad Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense_1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding0.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding0.png new file mode 100644 index 00000000..f0a1ab32 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding0.png differ diff --git 
a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding1.png new file mode 100644 index 00000000..e54c2a54 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_met_weight.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_met_weight.png new file mode 100644 index 00000000..cf1781d7 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_met_weight.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_multiply.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_multiply.png new file mode 100644 index 00000000..dc0aed93 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_multiply.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_output.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_output.png new file mode 100644 index 00000000..776dda69 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_output.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/project.tcl b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/project.tcl new file mode 100644 index 00000000..d5cf7610 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/project.tcl @@ -0,0 +1,12 @@ +variable project_name +set project_name "L1METML_v1" +variable backend +set backend "vivado" +variable part +set part "xcvu13p-flga2577-2-e" +variable clock_period +set clock_period 5 +variable clock_uncertainty +set clock_uncertainty 12.5% +variable version +set version "1.0.0" diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/response_MET.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/response_MET.png new file mode 100644 index 00000000..c6699875 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/response_MET.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/vivado_synth.tcl b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/vivado_synth.tcl new file mode 100644 index 00000000..4634b166 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/vivado_synth.tcl @@ -0,0 +1,6 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +add_files ${project_name}_prj/solution1/syn/vhdl +synth_design -top ${project_name} -part $part +report_utilization -file vivado_synth.rpt diff --git a/l1metml-job2.yml b/l1metml-job2.yml new file mode 100644 index 00000000..99616905 --- /dev/null +++ b/l1metml-job2.yml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: l1metml +spec: + template: + spec: + containers: + - name: gpu-container + image: gitlab-registry.nrp-nautilus.io/jmduarte/l1metml:latest + command: + - "/bin/bash" + - "-c" + - " git clone https://github.com/ucsd-hep-ex/L1METML.git -b gnn && + cd L1METML && + python train.py --workflowType dataGenerator --input 
/home/users/dprimosc/data/l1_trigger_ntuples/TTbar --mode 1 --epochs 500 --maxNPF 100 --batch-size 256 --units 12 36 --output models/quantized-dense-embedding/ --quantized 8 2 --model dense_embedding --compute-edge-feat 0 --model-output models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test --normFac 1" + volumeMounts: + - mountPath: /l1metmlvol + name: l1metmlvol + resources: + limits: + memory: 32Gi + cpu: "2" + nvidia.com/gpu: "1" + requests: + memory: 16Gi + cpu: "1" + nvidia.com/gpu: "1" + volumes: + - name: l1metmlvol + persistentVolumeClaim: + claimName: l1metmlvol + + restartPolicy: Never + backoffLimit: 0 diff --git a/loss.py b/loss.py index 581a96ae..b32670fe 100644 --- a/loss.py +++ b/loss.py @@ -4,7 +4,6 @@ def custom_loss_wrapper(normFac=1): by balancing the response above one and below one ''' - def custom_loss(y_true, y_pred): import tensorflow.keras.backend as K import tensorflow as tf @@ -16,8 +15,8 @@ def custom_loss(y_true, y_pred): pt_truth = K.sqrt(px_truth*px_truth + py_truth*py_truth) - #px_truth1 = px_truth / pt_truth - #py_truth1 = py_truth / pt_truth + # px_truth1 = px_truth / pt_truth + # py_truth1 = py_truth / pt_truth # using absolute response # upar_pred = (px_truth1 * px_pred + py_truth1 * py_pred)/pt_truth @@ -26,7 +25,7 @@ def custom_loss(y_true, y_pred): upar_pred = tf.boolean_mask(upar_pred, pt_cut) pt_truth_filtered = tf.boolean_mask(pt_truth, pt_cut) - #filter_bin0 = pt_truth_filtered < 50./normFac + # filter_bin0 = pt_truth_filtered < 50./normFac filter_bin0 = tf.logical_and(pt_truth_filtered > 50./normFac, pt_truth_filtered < 100./normFac) filter_bin1 = tf.logical_and(pt_truth_filtered > 100./normFac, pt_truth_filtered < 200./normFac) filter_bin2 = tf.logical_and(pt_truth_filtered > 200./normFac, pt_truth_filtered < 300./normFac) @@ -43,21 +42,21 @@ def custom_loss(y_true, y_pred): upar_pred_neg_bin3 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin3, upar_pred < 0.)) upar_pred_pos_bin4 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin4, upar_pred > 0.)) upar_pred_neg_bin4 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin4, upar_pred < 0.)) - #upar_pred_pos_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred > 0.)) - #upar_pred_neg_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred < 0.)) + # upar_pred_pos_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred > 0.)) + # upar_pred_neg_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred < 0.)) norm = tf.reduce_sum(pt_truth_filtered) dev = tf.abs(tf.reduce_sum(upar_pred_pos_bin0) + tf.reduce_sum(upar_pred_neg_bin0)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin1) + tf.reduce_sum(upar_pred_neg_bin1)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin2) + tf.reduce_sum(upar_pred_neg_bin2)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin3) + tf.reduce_sum(upar_pred_neg_bin3)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin4) + tf.reduce_sum(upar_pred_neg_bin4)) - #dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin5) + tf.reduce_sum(upar_pred_neg_bin5)) + # dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin5) + tf.reduce_sum(upar_pred_neg_bin5)) dev /= norm loss = 0.5*normFac**2*K.mean((px_pred - px_truth)**2 + (py_pred - py_truth)**2) - #loss += 200.*dev + # loss += 200.*dev loss += 5000.*dev return loss - + return custom_loss diff --git a/micromamba_setup.sh b/micromamba_setup.sh new file mode 100644 index 00000000..d9ef8f2e --- /dev/null +++ b/micromamba_setup.sh @@ -0,0 +1,2 @@ +micromamba create --file 
environment.yml --name l1metml +micromamba activate l1metml diff --git a/models/quantized-dense-embedding/MET_pt.png b/models/quantized-dense-embedding/MET_pt.png new file mode 100644 index 00000000..735e5ff1 Binary files /dev/null and b/models/quantized-dense-embedding/MET_pt.png differ diff --git a/models/quantized-dense-embedding/MET_response.png b/models/quantized-dense-embedding/MET_response.png new file mode 100644 index 00000000..ec94cb55 Binary files /dev/null and b/models/quantized-dense-embedding/MET_response.png differ diff --git a/models/quantized-dense-embedding/MET_x.png b/models/quantized-dense-embedding/MET_x.png new file mode 100644 index 00000000..5f321c1c Binary files /dev/null and b/models/quantized-dense-embedding/MET_x.png differ diff --git a/models/quantized-dense-embedding/MET_y.png b/models/quantized-dense-embedding/MET_y.png new file mode 100644 index 00000000..c27b3845 Binary files /dev/null and b/models/quantized-dense-embedding/MET_y.png differ diff --git a/models/quantized-dense-embedding/Phi_abs_err.png b/models/quantized-dense-embedding/Phi_abs_err.png new file mode 100644 index 00000000..d0adbe09 Binary files /dev/null and b/models/quantized-dense-embedding/Phi_abs_err.png differ diff --git a/models/quantized-dense-embedding/PrVSGen.png b/models/quantized-dense-embedding/PrVSGen.png new file mode 100644 index 00000000..53599066 Binary files /dev/null and b/models/quantized-dense-embedding/PrVSGen.png differ diff --git a/models/quantized-dense-embedding/Pt_abs_error.png b/models/quantized-dense-embedding/Pt_abs_error.png new file mode 100644 index 00000000..3646eecd Binary files /dev/null and b/models/quantized-dense-embedding/Pt_abs_error.png differ diff --git a/models/quantized-dense-embedding/TTbar_feature_array_MLMET.npy b/models/quantized-dense-embedding/TTbar_feature_array_MLMET.npy new file mode 100644 index 00000000..ad976743 Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_feature_array_MLMET.npy differ diff --git a/models/quantized-dense-embedding/TTbar_feature_array_PUMET.npy b/models/quantized-dense-embedding/TTbar_feature_array_PUMET.npy new file mode 100644 index 00000000..bb232dd5 Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_feature_array_PUMET.npy differ diff --git a/models/quantized-dense-embedding/TTbar_target_array_MLMET.npy b/models/quantized-dense-embedding/TTbar_target_array_MLMET.npy new file mode 100644 index 00000000..fd349a9b Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_target_array_MLMET.npy differ diff --git a/models/quantized-dense-embedding/TTbar_target_array_PUMET.npy b/models/quantized-dense-embedding/TTbar_target_array_PUMET.npy new file mode 100644 index 00000000..fd349a9b Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_target_array_PUMET.npy differ diff --git a/models/quantized-dense-embedding/XY_resolution_plots.png b/models/quantized-dense-embedding/XY_resolution_plots.png new file mode 100644 index 00000000..ccd6a61e Binary files /dev/null and b/models/quantized-dense-embedding/XY_resolution_plots.png differ diff --git a/models/quantized-dense-embedding/loss_history.log b/models/quantized-dense-embedding/loss_history.log new file mode 100644 index 00000000..b1d96145 --- /dev/null +++ b/models/quantized-dense-embedding/loss_history.log @@ -0,0 +1,97 @@ +epoch,loss,lr,mean_absolute_error,mean_squared_error,val_loss,val_mean_absolute_error,val_mean_squared_error 
+0,5365.49755859375,0.0003,36.625160217285156,2374.283447265625,2642.266845703125,34.32746124267578,2102.33837890625 +1,2478.82275390625,0.0003,32.99213790893555,1934.72412109375,2403.994873046875,31.916332244873047,1810.71142578125 +2,2352.487060546875,0.0003,32.17837142944336,1842.6265869140625,2313.86962890625,32.05146408081055,1838.9298095703125 +3,2296.711181640625,0.0003,31.804044723510742,1801.236083984375,2271.6572265625,31.789581298828125,1806.3812255859375 +4,2270.8251953125,0.0003,31.612276077270508,1779.804931640625,2257.845947265625,31.16737174987793,1735.5069580078125 +5,2258.5390625,0.0003,31.538549423217773,1771.061279296875,2253.50732421875,31.11001205444336,1727.9681396484375 +6,2249.881103515625,0.0003,31.480377197265625,1765.043701171875,2246.50927734375,31.05948829650879,1725.2110595703125 +7,2245.15283203125,0.0003,31.43949317932129,1761.727294921875,2232.66845703125,31.450876235961914,1769.3106689453125 +8,2243.132080078125,0.0003,31.429136276245117,1762.26416015625,2232.498046875,31.162405014038086,1736.4827880859375 +9,2245.653076171875,0.0003,31.421859741210938,1761.437744140625,2226.985595703125,31.18681526184082,1740.423583984375 +10,2245.4873046875,0.0003,31.400266647338867,1760.3131103515625,2225.4541015625,31.112810134887695,1733.8387451171875 +11,2241.443115234375,0.0003,31.413358688354492,1761.822021484375,2239.147705078125,30.872859954833984,1705.24267578125 +12,2238.0751953125,0.0003,31.35942840576172,1756.65625,2220.971435546875,31.09974479675293,1733.426513671875 +13,2233.3056640625,0.0003,31.350557327270508,1755.82958984375,2251.1357421875,30.87445640563965,1705.130126953125 +14,2231.28759765625,0.0003,31.316877365112305,1752.25634765625,2260.2607421875,30.62598419189453,1676.5386962890625 +15,2228.76171875,0.0003,31.30657196044922,1751.412841796875,2219.97412109375,31.45556640625,1772.76025390625 +16,2227.719482421875,0.0003,31.29245376586914,1750.1439208984375,2221.25048828125,31.31220054626465,1755.291259765625 +17,2225.812255859375,0.0003,31.29678726196289,1751.6265869140625,2214.671630859375,31.47364616394043,1776.827392578125 +18,2223.05078125,0.0003,31.272422790527344,1749.5374755859375,2209.434814453125,30.99241828918457,1719.739990234375 +19,2218.818115234375,0.0003,31.24805450439453,1746.823974609375,2209.388671875,31.06031036376953,1731.7838134765625 +20,2218.491943359375,0.0003,31.245458602905273,1747.4522705078125,2269.787109375,30.592639923095703,1676.3988037109375 +21,2216.6357421875,0.0003,31.25051498413086,1748.0118408203125,2218.935546875,30.852811813354492,1710.2021484375 +22,2215.057861328125,0.0003,31.226913452148438,1746.06103515625,2243.838623046875,31.873579025268555,1820.3507080078125 +23,2216.621337890625,0.0003,31.237436294555664,1747.4488525390625,2210.60400390625,31.3365478515625,1761.2696533203125 +24,2214.016845703125,0.0003,31.231857299804688,1746.2939453125,2228.604736328125,30.702482223510742,1688.9754638671875 +25,2211.27490234375,0.0003,31.219966888427734,1744.6165771484375,2205.664306640625,31.038711547851562,1723.0206298828125 +26,2207.486328125,0.0003,31.195711135864258,1742.8887939453125,2194.477294921875,30.95041275024414,1719.947265625 +27,2210.222412109375,0.0003,31.204673767089844,1743.8411865234375,2200.50927734375,31.29070281982422,1757.7618408203125 +28,2211.854248046875,0.0003,31.204816818237305,1743.28076171875,2227.3544921875,30.58995819091797,1679.155029296875 +29,2213.34765625,0.0003,31.25057601928711,1747.6446533203125,2206.81494140625,30.798568725585938,1700.878173828125 
+30,2212.620849609375,0.0003,31.226085662841797,1743.9351806640625,2213.621826171875,30.753026962280273,1697.3277587890625 +31,2210.94921875,0.0003,31.220905303955078,1744.9833984375,2286.822998046875,30.77813148498535,1682.1134033203125 +32,2209.916748046875,0.0003,31.20001792907715,1742.58544921875,2209.856201171875,30.90184783935547,1712.124267578125 +33,2212.094970703125,0.0003,31.221921920776367,1744.3253173828125,2207.29931640625,30.911880493164062,1712.8521728515625 +34,2210.8544921875,0.0003,31.212121963500977,1742.1922607421875,2209.007568359375,30.852182388305664,1704.3905029296875 +35,2208.66650390625,0.0003,31.210289001464844,1743.2523193359375,2215.027587890625,30.700241088867188,1692.73486328125 +36,2209.481201171875,0.0003,31.19826316833496,1743.426513671875,2215.21923828125,31.48198890686035,1778.38330078125 +37,2210.159423828125,0.0003,31.214448928833008,1744.8856201171875,2220.15966796875,30.610177993774414,1681.4605712890625 +38,2210.488525390625,0.0003,31.215967178344727,1744.15771484375,2313.10498046875,32.419185638427734,1888.31103515625 +39,2210.69482421875,0.0003,31.20148468017578,1741.19287109375,2201.347412109375,31.019954681396484,1725.19677734375 +40,2207.256103515625,0.0003,31.194265365600586,1741.663818359375,2195.50732421875,31.203859329223633,1751.430419921875 +41,2204.386474609375,0.0003,31.183460235595703,1743.04345703125,2198.1875,30.85906410217285,1708.7384033203125 +42,2202.40478515625,0.0003,31.156558990478516,1739.5374755859375,2208.386474609375,30.73008918762207,1691.7432861328125 +43,2199.3857421875,0.0003,31.13861083984375,1737.30615234375,2193.83056640625,30.826841354370117,1705.76806640625 +44,2199.031494140625,0.0003,31.13396644592285,1736.3363037109375,2196.921142578125,30.907873153686523,1710.51025390625 +45,2201.519287109375,0.0003,31.154233932495117,1739.555419921875,2194.886474609375,31.024913787841797,1729.93701171875 +46,2201.720458984375,0.0003,31.153945922851562,1738.5020751953125,2200.976806640625,30.806833267211914,1699.372314453125 +47,2201.043212890625,0.0003,31.148744583129883,1737.4951171875,2198.75927734375,30.850374221801758,1708.83447265625 +48,2204.1826171875,0.0003,31.15572738647461,1737.10693359375,2208.6689453125,30.689353942871094,1687.5474853515625 +49,2203.097900390625,0.0003,31.155651092529297,1736.9141845703125,2193.733154296875,30.915498733520508,1712.6759033203125 +50,2200.683837890625,0.0003,31.16202735900879,1737.0792236328125,2192.33544921875,31.115854263305664,1736.875732421875 +51,2201.66357421875,0.0003,31.151227951049805,1735.9931640625,2207.9580078125,30.661632537841797,1682.8206787109375 +52,2203.215576171875,0.0003,31.143098831176758,1735.8565673828125,2196.600830078125,30.820405960083008,1703.871826171875 +53,2201.114013671875,0.0003,31.127216339111328,1736.264404296875,2195.122802734375,31.004093170166016,1726.9163818359375 +54,2199.779296875,0.0003,31.147878646850586,1738.2113037109375,2194.158203125,31.19964599609375,1750.343017578125 +55,2201.775146484375,0.0003,31.15053367614746,1738.500244140625,2190.744140625,30.997722625732422,1726.93310546875 +56,2204.49267578125,0.0003,31.181379318237305,1741.8153076171875,2204.71142578125,30.76068878173828,1698.86474609375 +57,2202.5205078125,0.0003,31.1776065826416,1740.7110595703125,2207.032470703125,30.753690719604492,1696.7855224609375 +58,2203.484130859375,0.0003,31.152755737304688,1737.710205078125,2196.21435546875,30.987199783325195,1721.74169921875 
+59,2204.532958984375,0.0003,31.17318344116211,1739.4593505859375,2194.90087890625,30.944604873657227,1718.6607666015625 +60,2205.913818359375,0.0003,31.170658111572266,1738.680419921875,2200.27880859375,30.946495056152344,1720.03759765625 +61,2205.884033203125,0.0003,31.18505859375,1740.677001953125,2195.715576171875,31.04903793334961,1728.323486328125 +62,2207.506591796875,0.0003,31.178726196289062,1739.660400390625,2202.005126953125,30.92181396484375,1710.3450927734375 +63,2205.39697265625,0.0003,31.189022064208984,1739.7410888671875,2346.319091796875,30.911865234375,1687.64453125 +64,2201.806884765625,0.0003,31.17337417602539,1738.7607421875,2193.619384765625,31.078243255615234,1731.8031005859375 +65,2204.08642578125,0.0003,31.168397903442383,1739.8743896484375,2200.516845703125,30.913759231567383,1714.22412109375 +66,2203.070068359375,0.0003,31.17813491821289,1739.9871826171875,2212.877685546875,30.637502670288086,1680.600830078125 +67,2203.95458984375,0.0003,31.193883895874023,1742.715576171875,2200.10888671875,30.81962776184082,1707.2403564453125 +68,2207.74951171875,0.0003,31.195785522460938,1743.4793701171875,2211.089111328125,30.718929290771484,1690.9881591796875 +69,2206.971435546875,0.0003,31.193485260009766,1743.16796875,2215.083251953125,31.532989501953125,1789.220703125 +70,2206.98388671875,0.0003,31.192052841186523,1742.7562255859375,2197.8544921875,31.161483764648438,1738.7413330078125 +71,2209.69873046875,0.0003,31.207612991333008,1744.2576904296875,2203.982421875,30.88957405090332,1713.370361328125 +72,2210.47314453125,0.0003,31.24172019958496,1748.81640625,2205.13037109375,31.064043045043945,1731.40966796875 +73,2206.58642578125,0.0003,31.204240798950195,1743.779541015625,2203.295654296875,31.299882888793945,1757.2547607421875 +74,2210.520263671875,0.0003,31.234872817993164,1748.2420654296875,2194.092041015625,31.166475296020508,1744.5223388671875 +75,2208.08203125,0.0003,31.211593627929688,1744.8179931640625,2201.28857421875,31.07516860961914,1735.3389892578125 +76,2209.787841796875,0.0003,31.226259231567383,1746.712646484375,2265.71875,32.07892990112305,1854.2498779296875 +77,2208.861328125,0.0003,31.226646423339844,1746.5357666015625,2217.926513671875,30.675195693969727,1688.148193359375 +78,2207.03125,0.0003,31.2060604095459,1744.0958251953125,2195.196044921875,31.022859573364258,1725.2369384765625 +79,2207.662841796875,0.0003,31.2130069732666,1745.017333984375,2199.871826171875,30.941226959228516,1715.8800048828125 +80,2208.997802734375,0.0003,31.231077194213867,1747.3485107421875,2204.09130859375,30.840715408325195,1705.426513671875 +81,2209.9228515625,0.0003,31.214202880859375,1745.3836669921875,2196.96923828125,31.148229598999023,1743.1463623046875 +82,2209.47314453125,0.0003,31.219953536987305,1746.4140625,2200.6884765625,31.20162582397461,1746.3389892578125 +83,2210.9306640625,0.0003,31.211957931518555,1745.69970703125,2207.14599609375,30.89093589782715,1716.6256103515625 +84,2211.049560546875,0.0003,31.225465774536133,1747.16357421875,2202.51220703125,30.883039474487305,1713.4200439453125 +85,2212.10107421875,0.0003,31.219194412231445,1747.7564697265625,2203.7255859375,31.25891876220703,1749.3304443359375 +86,2212.598876953125,0.0003,31.23923110961914,1749.4521484375,2238.5771484375,31.83201026916504,1819.1221923828125 +87,2210.39501953125,0.0003,31.237064361572266,1748.4912109375,2207.60107421875,30.849815368652344,1704.5650634765625 +88,2211.44775390625,0.0003,31.229516983032227,1748.939208984375,2210.279541015625,31.19259262084961,1746.7022705078125 
+89,2210.99072265625,0.0003,31.244524002075195,1750.119384765625,2227.881591796875,30.589513778686523,1679.3865966796875 +90,2211.454833984375,0.0003,31.226518630981445,1749.206787109375,2197.9755859375,31.094615936279297,1737.0526123046875 +91,2209.25732421875,0.0003,31.223045349121094,1748.1630859375,2199.78173828125,30.957643508911133,1720.9564208984375 +92,2209.30908203125,0.0003,31.23832130432129,1749.5999755859375,2210.670654296875,30.77962303161621,1699.195068359375 +93,2209.58154296875,0.0003,31.230850219726562,1749.1483154296875,2232.6435546875,31.708250045776367,1807.9969482421875 +94,2211.325927734375,0.0003,31.207908630371094,1746.332763671875,2208.9716796875,30.927019119262695,1717.478759765625 +95,2211.26318359375,0.0003,31.248823165893555,1751.1241455078125,2198.548828125,31.21225929260254,1749.599365234375 diff --git a/models/quantized-dense-embedding/model.h5 b/models/quantized-dense-embedding/model.h5 new file mode 100644 index 00000000..f3337314 Binary files /dev/null and b/models/quantized-dense-embedding/model.h5 differ diff --git a/models/quantized-dense-embedding/pt_resolution_plots.png b/models/quantized-dense-embedding/pt_resolution_plots.png new file mode 100644 index 00000000..03c1655c Binary files /dev/null and b/models/quantized-dense-embedding/pt_resolution_plots.png differ diff --git a/models/quantized-dense-embedding/rel_error_opaque.png b/models/quantized-dense-embedding/rel_error_opaque.png new file mode 100644 index 00000000..cedd7ca8 Binary files /dev/null and b/models/quantized-dense-embedding/rel_error_opaque.png differ diff --git a/models/quantized-dense-embedding/time.txt b/models/quantized-dense-embedding/time.txt new file mode 100644 index 00000000..ec9f99a9 --- /dev/null +++ b/models/quantized-dense-embedding/time.txt @@ -0,0 +1 @@ +Working Time (s) : 19636.019178152084Working Time (m) : 327.26698630253475 \ No newline at end of file diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_tes.h5 b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_tes.h5 new file mode 100644 index 00000000..fbdc91c8 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_tes.h5 differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/fingerprint.pb b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/fingerprint.pb new file mode 100644 index 00000000..57fbb672 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/fingerprint.pb differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/keras_metadata.pb b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/keras_metadata.pb new file mode 100644 index 00000000..970106c8 --- /dev/null +++ b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/keras_metadata.pb @@ -0,0 +1,24 @@ + +èËroot"_tf_keras_network*ÅË{"name": "model", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Functional", "config": {"name": "model", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat0"}, "name": "input_cat0", "inbound_nodes": []}, {"class_name": "InputLayer", "config": {"batch_input_shape": 
{"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat1"}, "name": "input_cat1", "inbound_nodes": []}, {"class_name": "Embedding", "config": {"name": "embedding0", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 6, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding0", "inbound_nodes": [[["input_cat0", 0, 0, {}]]]}, {"class_name": "Embedding", "config": {"name": "embedding1", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 4, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding1", "inbound_nodes": [[["input_cat1", 0, 0, {}]]]}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cont"}, "name": "input_cont", "inbound_nodes": []}, {"class_name": "Concatenate", "config": {"name": "concatenate", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate", "inbound_nodes": [[["embedding0", 0, 0, {}], ["embedding1", 0, 0, {}]]]}, {"class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate_1", "inbound_nodes": [[["input_cont", 0, 0, {}], ["concatenate", 0, 0, {}]]]}, {"class_name": "QDense", "config": {"name": "q_dense", "trainable": true, "dtype": "float32", "units": 12, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 10}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 11}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 12}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 13}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 14}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense", "inbound_nodes": [[["concatenate_1", 0, 0, {}]]]}, 
{"class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "batch_normalization", "inbound_nodes": [[["q_dense", 0, 0, {}]]]}, {"class_name": "QActivation", "config": {"name": "q_activation", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation", "inbound_nodes": [[["batch_normalization", 0, 0, {}]]]}, {"class_name": "QDense", "config": {"name": "q_dense_1", "trainable": true, "dtype": "float32", "units": 36, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 23}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 24}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 25}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 26}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 27}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense_1", "inbound_nodes": [[["q_activation", 0, 0, {}]]]}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "batch_normalization_1", "inbound_nodes": [[["q_dense_1", 0, 0, {}]]]}, {"class_name": "QActivation", "config": {"name": "q_activation_1", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": 
{"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation_1", "inbound_nodes": [[["batch_normalization_1", 0, 0, {}]]]}, {"class_name": "QDense", "config": {"name": "met_weight", "trainable": true, "dtype": "float32", "units": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "VarianceScaling", "config": {"scale": 0.02, "mode": "fan_in", "distribution": "truncated_normal", "seed": null}, "__passive_serialization__": true, "shared_object_id": 35}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 36}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 37}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 38}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 39}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "met_weight", "inbound_nodes": [[["q_activation_1", 0, 0, {}]]]}, {"class_name": "BatchNormalization", "config": {"name": "met_weight_minus_one", "trainable": false, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": false, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "met_weight_minus_one", "inbound_nodes": [[["met_weight", 0, 0, {}]]]}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_pxpy"}, "name": "input_pxpy", "inbound_nodes": []}, {"class_name": "Multiply", "config": {"name": "multiply", "trainable": true, "dtype": "float32"}, "name": "multiply", "inbound_nodes": [[["met_weight_minus_one", 0, 0, {}], ["input_pxpy", 0, 0, {}]]]}, {"class_name": "GlobalAveragePooling1D", "config": {"name": "output", "trainable": true, "dtype": "float32", "data_format": "channels_last", "keepdims": false}, "name": "output", "inbound_nodes": [[["multiply", 0, 0, {}]]]}], "input_layers": [["input_cont", 0, 0], ["input_pxpy", 0, 0], ["input_cat0", 0, 0], ["input_cat1", 0, 0]], "output_layers": [["output", 0, 0]]}, "shared_object_id": 49, "input_spec": [{"class_name": "InputSpec", 
"config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}}, {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}}, {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100]}, "ndim": 2, "max_ndim": null, "min_ndim": null, "axes": {}}}, {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100]}, "ndim": 2, "max_ndim": null, "min_ndim": null, "axes": {}}}], "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 4]}, {"class_name": "TensorShape", "items": [null, 100, 2]}, {"class_name": "TensorShape", "items": [null, 100]}, {"class_name": "TensorShape", "items": [null, 100]}], "is_graph_network": true, "full_save_spec": {"class_name": "__tuple__", "items": [[[{"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 4]}, "float32", "input_cont"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 2]}, "float32", "input_pxpy"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat0"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat1"]}]], {}]}, "save_spec": [{"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 4]}, "float32", "input_cont"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 2]}, "float32", "input_pxpy"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat0"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat1"]}], "keras_version": "2.11.0", "backend": "tensorflow", "model_config": {"class_name": "Functional", "config": {"name": "model", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat0"}, "name": "input_cat0", "inbound_nodes": [], "shared_object_id": 0}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat1"}, "name": "input_cat1", "inbound_nodes": [], "shared_object_id": 1}, {"class_name": "Embedding", "config": {"name": "embedding0", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 6, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 2}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding0", "inbound_nodes": [[["input_cat0", 0, 0, {}]]], "shared_object_id": 3}, {"class_name": "Embedding", "config": {"name": "embedding1", "trainable": true, 
"dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 4, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 4}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding1", "inbound_nodes": [[["input_cat1", 0, 0, {}]]], "shared_object_id": 5}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cont"}, "name": "input_cont", "inbound_nodes": [], "shared_object_id": 6}, {"class_name": "Concatenate", "config": {"name": "concatenate", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate", "inbound_nodes": [[["embedding0", 0, 0, {}], ["embedding1", 0, 0, {}]]], "shared_object_id": 7}, {"class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate_1", "inbound_nodes": [[["input_cont", 0, 0, {}], ["concatenate", 0, 0, {}]]], "shared_object_id": 8}, {"class_name": "QDense", "config": {"name": "q_dense", "trainable": true, "dtype": "float32", "units": 12, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 10}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 11}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 12}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 13}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 14}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense", "inbound_nodes": [[["concatenate_1", 0, 0, {}]]], "shared_object_id": 15}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 16}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 17}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 18}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 19}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, 
"gamma_constraint": null}, "name": "batch_normalization", "inbound_nodes": [[["q_dense", 0, 0, {}]]], "shared_object_id": 20}, {"class_name": "QActivation", "config": {"name": "q_activation", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation", "inbound_nodes": [[["batch_normalization", 0, 0, {}]]], "shared_object_id": 22}, {"class_name": "QDense", "config": {"name": "q_dense_1", "trainable": true, "dtype": "float32", "units": 36, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 23}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 24}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 25}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 26}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 27}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense_1", "inbound_nodes": [[["q_activation", 0, 0, {}]]], "shared_object_id": 28}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 29}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 30}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 31}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 32}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "batch_normalization_1", "inbound_nodes": [[["q_dense_1", 0, 0, {}]]], "shared_object_id": 33}, {"class_name": "QActivation", "config": {"name": "q_activation_1", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation_1", "inbound_nodes": [[["batch_normalization_1", 0, 0, {}]]], "shared_object_id": 34}, {"class_name": 
"QDense", "config": {"name": "met_weight", "trainable": true, "dtype": "float32", "units": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "VarianceScaling", "config": {"scale": 0.02, "mode": "fan_in", "distribution": "truncated_normal", "seed": null}, "__passive_serialization__": true, "shared_object_id": 35}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 36}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 37}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 38}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 39}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "met_weight", "inbound_nodes": [[["q_activation_1", 0, 0, {}]]], "shared_object_id": 40}, {"class_name": "BatchNormalization", "config": {"name": "met_weight_minus_one", "trainable": false, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": false, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 41}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 42}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 43}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 44}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "met_weight_minus_one", "inbound_nodes": [[["met_weight", 0, 0, {}]]], "shared_object_id": 45}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_pxpy"}, "name": "input_pxpy", "inbound_nodes": [], "shared_object_id": 46}, {"class_name": "Multiply", "config": {"name": "multiply", "trainable": true, "dtype": "float32"}, "name": "multiply", "inbound_nodes": [[["met_weight_minus_one", 0, 0, {}], ["input_pxpy", 0, 0, {}]]], "shared_object_id": 47}, {"class_name": "GlobalAveragePooling1D", "config": {"name": "output", "trainable": true, "dtype": "float32", "data_format": "channels_last", "keepdims": false}, "name": "output", "inbound_nodes": [[["multiply", 0, 0, {}]]], "shared_object_id": 48}], "input_layers": [["input_cont", 0, 0], ["input_pxpy", 0, 0], ["input_cat0", 0, 0], ["input_cat1", 0, 0]], "output_layers": [["output", 0, 0]]}}, "training_config": {"loss": "custom_loss", "metrics": [[{"class_name": "MeanMetricWrapper", "config": {"name": "mean_absolute_error", "dtype": "float32", "fn": 
"mean_absolute_error"}, "shared_object_id": 54}, {"class_name": "MeanMetricWrapper", "config": {"name": "mean_squared_error", "dtype": "float32", "fn": "mean_squared_error"}, "shared_object_id": 55}]], "weighted_metrics": null, "loss_weights": null, "optimizer_config": {"class_name": "Custom>Adam", "config": {"name": "Adam", "weight_decay": null, "clipnorm": 1.0, "global_clipnorm": null, "clipvalue": null, "use_ema": false, "ema_momentum": 0.99, "ema_overwrite_frequency": null, "jit_compile": false, "is_legacy_optimizer": false, "learning_rate": 0.0003000000142492354, "beta_1": 0.9, "beta_2": 0.999, "epsilon": 1e-07, "amsgrad": false}}}}2 +€ root.layer-0"_tf_keras_input_layer*Ð{"class_name": "InputLayer", "name": "input_cat0", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat0"}}2 +€ root.layer-1"_tf_keras_input_layer*Ð{"class_name": "InputLayer", "name": "input_cat1", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat1"}}2 +¼root.layer_with_weights-0"_tf_keras_layer*…{"name": "embedding0", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Embedding", "config": {"name": "embedding0", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 6, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 2}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "inbound_nodes": [[["input_cat0", 0, 0, {}]]], "shared_object_id": 3, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100]}}2 +¼root.layer_with_weights-1"_tf_keras_layer*…{"name": "embedding1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Embedding", "config": {"name": "embedding1", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 4, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 4}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "inbound_nodes": [[["input_cat1", 0, 0, {}]]], "shared_object_id": 5, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100]}}2 +† root.layer-4"_tf_keras_input_layer*Ö{"class_name": "InputLayer", "name": "input_cont", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "config": {"batch_input_shape": 
{"class_name": "__tuple__", "items": [null, 100, 4]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cont"}}2 +ñ root.layer-5"_tf_keras_layer*Ç{"name": "concatenate", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Concatenate", "config": {"name": "concatenate", "trainable": true, "dtype": "float32", "axis": -1}, "inbound_nodes": [[["embedding0", 0, 0, {}], ["embedding1", 0, 0, {}]]], "shared_object_id": 7, "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 2]}, {"class_name": "TensorShape", "items": [null, 100, 2]}]}2 +ö root.layer-6"_tf_keras_layer*Ì{"name": "concatenate_1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "dtype": "float32", "axis": -1}, "inbound_nodes": [[["input_cont", 0, 0, {}], ["concatenate", 0, 0, {}]]], "shared_object_id": 8, "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 4]}, {"class_name": "TensorShape", "items": [null, 100, 4]}]}2 +žroot.layer_with_weights-2"_tf_keras_layer*ç{"name": "q_dense", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QDense", "config": {"name": "q_dense", "trainable": true, "dtype": "float32", "units": 12, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 10}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 11}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 12}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 13}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 14}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "inbound_nodes": [[["concatenate_1", 0, 0, {}]]], "shared_object_id": 15, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 8}}, "shared_object_id": 56}, "build_input_shape": {"class_name": "TensorShape", "items": 
[null, 100, 8]}}2 +÷  root.layer_with_weights-3"_tf_keras_layer*À {"name": "batch_normalization", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 16}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 17}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 18}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 19}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "inbound_nodes": [[["q_dense", 0, 0, {}]]], "shared_object_id": 20, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {"2": 12}}, "shared_object_id": 57}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 12]}}2 +© + root.layer-9"_tf_keras_layer*ÿ{"name": "q_activation", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QActivation", "config": {"name": "q_activation", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "inbound_nodes": [[["batch_normalization", 0, 0, {}]]], "shared_object_id": 22, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 12]}}2 +£ root.layer_with_weights-4"_tf_keras_layer*ì{"name": "q_dense_1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QDense", "config": {"name": "q_dense_1", "trainable": true, "dtype": "float32", "units": 36, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 23}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 24}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 25}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 26}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 27}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, 
"use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "inbound_nodes": [[["q_activation", 0, 0, {}]]], "shared_object_id": 28, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 12}}, "shared_object_id": 58}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 12]}}2 +ý  root.layer_with_weights-5"_tf_keras_layer*Æ {"name": "batch_normalization_1", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 29}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 30}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 31}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 32}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "inbound_nodes": [[["q_dense_1", 0, 0, {}]]], "shared_object_id": 33, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {"2": 36}}, "shared_object_id": 59}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 36]}}2 +°  root.layer-12"_tf_keras_layer*…{"name": "q_activation_1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QActivation", "config": {"name": "q_activation_1", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "inbound_nodes": [[["batch_normalization_1", 0, 0, {}]]], "shared_object_id": 34, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 36]}}2 +îroot.layer_with_weights-6"_tf_keras_layer*·{"name": "met_weight", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QDense", "config": {"name": "met_weight", "trainable": true, "dtype": "float32", "units": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "VarianceScaling", "config": {"scale": 0.02, "mode": "fan_in", "distribution": "truncated_normal", "seed": null}, "__passive_serialization__": true, "shared_object_id": 35}, "use_scale": true, "quantizer": 
{"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 36}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 37}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 38}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 39}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "inbound_nodes": [[["q_activation_1", 0, 0, {}]]], "shared_object_id": 40, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 36}}, "shared_object_id": 60}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 36]}}2 +ü root.layer_with_weights-7"_tf_keras_layer*Å {"name": "met_weight_minus_one", "trainable": false, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "BatchNormalization", "config": {"name": "met_weight_minus_one", "trainable": false, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": false, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 41}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 42}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 43}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 44}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "inbound_nodes": [[["met_weight", 0, 0, {}]]], "shared_object_id": 45, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {"2": 1}}, "shared_object_id": 61}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 1]}}2 +‡ root.layer-15"_tf_keras_input_layer*Ö{"class_name": "InputLayer", "name": "input_pxpy", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_pxpy"}}2 +è root.layer-16"_tf_keras_layer*½{"name": "multiply", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Multiply", "config": {"name": "multiply", "trainable": true, "dtype": 
"float32"}, "inbound_nodes": [[["met_weight_minus_one", 0, 0, {}], ["input_pxpy", 0, 0, {}]]], "shared_object_id": 47, "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 1]}, {"class_name": "TensorShape", "items": [null, 100, 2]}]}2 +í root.layer-17"_tf_keras_layer*Â{"name": "output", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "GlobalAveragePooling1D", "config": {"name": "output", "trainable": true, "dtype": "float32", "data_format": "channels_last", "keepdims": false}, "inbound_nodes": [[["multiply", 0, 0, {}]]], "shared_object_id": 48, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}, "shared_object_id": 62}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 2]}}2 +º‰root.keras_api.metrics.0"_tf_keras_metric*‚{"class_name": "Mean", "name": "loss", "dtype": "float32", "config": {"name": "loss", "dtype": "float32"}, "shared_object_id": 63}2 +‚Šroot.keras_api.metrics.1"_tf_keras_metric*Ê{"class_name": "MeanMetricWrapper", "name": "mean_absolute_error", "dtype": "float32", "config": {"name": "mean_absolute_error", "dtype": "float32", "fn": "mean_absolute_error"}, "shared_object_id": 54}2 +ÿ‹root.keras_api.metrics.2"_tf_keras_metric*Ç{"class_name": "MeanMetricWrapper", "name": "mean_squared_error", "dtype": "float32", "config": {"name": "mean_squared_error", "dtype": "float32", "fn": "mean_squared_error"}, "shared_object_id": 55}2 \ No newline at end of file diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/saved_model.pb b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/saved_model.pb new file mode 100644 index 00000000..d3b371bd Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/saved_model.pb differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.data-00000-of-00001 b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000..6b749f73 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.data-00000-of-00001 differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.index b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.index new file mode 100644 index 00000000..5077d741 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.index differ diff --git a/train.py b/train.py index 9b1d2ecd..01dd340c 100644 --- a/train.py +++ b/train.py @@ -11,7 +11,7 @@ import matplotlib.pyplot as plt import argparse import math -#import setGPU +# import setGPU import time import os import pathlib diff --git a/utils.py b/utils.py index 6fc43146..a0575396 100644 --- a/utils.py +++ b/utils.py @@ -101,7 +101,7 @@ def MakePlots(trueXY, mlXY, puppiXY, path_out): # width of a distribution at 1 standard deviation def resolqt(y): - return(np.percentile(y, 84)-np.percentile(y, 16))/2.0 + return (np.percentile(y, 84)-np.percentile(y, 16))/2.0 # response correction factors # the events are split into 20 bins based on true_pt and get 
assigned the corresponding `truth_means/ml_means` of all events in that bin
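For context on this hunk, a self-contained sketch of resolqt and of the per-bin response correction the comment describes. The binning here (20 quantile bins of true_pt) is a hypothetical reading of the comment; the repository's own implementation may use equal-width bins:

import numpy as np

def resolqt(y):
    # width of a distribution at 1 standard deviation (half the 16th-84th percentile spread)
    return (np.percentile(y, 84) - np.percentile(y, 16)) / 2.0

def response_correction(true_pt, ml_pt, nbins=20):
    # split events into `nbins` bins of true_pt and rescale each prediction by the
    # truth_means/ml_means ratio of its bin (hypothetical helper, not from utils.py)
    edges = np.quantile(true_pt, np.linspace(0.0, 1.0, nbins + 1))
    idx = np.clip(np.digitize(true_pt, edges) - 1, 0, nbins - 1)
    truth_means = np.array([true_pt[idx == i].mean() for i in range(nbins)])
    ml_means = np.array([ml_pt[idx == i].mean() for i in range(nbins)])
    return ml_pt * (truth_means / ml_means)[idx]

# toy usage: a biased, smeared "prediction" gets its per-bin response corrected
rng = np.random.default_rng(0)
true_pt = rng.uniform(20, 500, size=10_000)
ml_pt = 0.9 * true_pt + rng.normal(0, 15, size=10_000)
corrected = response_correction(true_pt, ml_pt)
print(resolqt(ml_pt - true_pt), resolqt(corrected - true_pt))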