diff --git a/Write_MET_binned_histogram.py b/Write_MET_binned_histogram.py index d152e838..0f963a94 100644 --- a/Write_MET_binned_histogram.py +++ b/Write_MET_binned_histogram.py @@ -332,8 +332,8 @@ def MET_rel_error_bad(predict_met, gen_met, name='Met_res.pdf'): # for i in range(rel_err.shape[0]): # std += (mean - rel_err[i]) **2 - #std = std/rel_err.shape[0] - #std = math.sqrt(std) + # std = std/rel_err.shape[0] + # std = math.sqrt(std) mean = mean * 1000 mean = int(mean) @@ -467,8 +467,8 @@ def Phi_abs_error(predict_met, gen_met, name='Met_res.pdf'): def Pt_abs_error_opaque(puppi_met, ml_met, gen_met, name='Met_res.pdf'): puppi_err = (puppi_met - gen_met) ml_err = (ml_met - gen_met) - #minErr = min(np.array([rel_err, rel_err2]).flatten()) - #maxErr = max(np.array([rel_err, rel_err2]).flatten()) + # minErr = min(np.array([rel_err, rel_err2]).flatten()) + # maxErr = max(np.array([rel_err, rel_err2]).flatten()) plt.figure() plt.hist(puppi_err, bins=np.linspace(-250, 250, 50+1), alpha=0.5, label='puppi') plt.hist(ml_err, bins=np.linspace(-250, 250, 50+1), alpha=0.5, label='ML') @@ -573,7 +573,7 @@ def MET_binned_predict_mean(predict_met, gen_met, binning, mini, maxi, genMET_cu plt.xlim(mini, maxi) plt.ylim(mini, 700) plt.xlabel('Gen MET mean [GeV]', fontsize=16) - #plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) + # plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) plt.ylabel('predicted MET mean [GeV]', fontsize=16) plt.legend() plt.savefig(name) @@ -621,7 +621,7 @@ def MET_binned_predict_mean_opaque(predict_met, predict_met2, gen_met, binning, plt.xlim(mini, maxi) plt.ylim(mini, maxi) plt.xlabel('Gen MET mean [GeV]', fontsize=16) - #plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) + # plt.ylabel('PUPPI MET mean [GeV]', fontsize = 16) plt.ylabel('predicted MET mean [GeV]', fontsize=16) plt.legend() plt.savefig(name) @@ -673,9 +673,9 @@ def extract_result(feat_array, targ_array, path, name, mode): def histo_2D(predict_pT, gen_pT, min_, max_, name='2D_histo.png'): X_hist = np.arange(0, 500, 20) Y_hist = X_hist # 1.25*X_hist - #Y_hist_1 = 0.75*X_hist + # Y_hist_1 = 0.75*X_hist plt.plot(X_hist, Y_hist, '-r') - #plt.plot(X_hist, Y_hist_1, '-r') + # plt.plot(X_hist, Y_hist_1, '-r') x_bins = np.linspace(min_, max_, 50) y_bins = np.linspace(min_, max_, 50) plt.hist2d(gen_pT, predict_pT, bins=[x_bins, y_bins], cmap=plt.cm.jet) diff --git a/convertNanoToHDF5_L1triggerToDeepMET.py b/convertNanoToHDF5_L1triggerToDeepMET.py index 33a6a21f..7bb9cdc7 100644 --- a/convertNanoToHDF5_L1triggerToDeepMET.py +++ b/convertNanoToHDF5_L1triggerToDeepMET.py @@ -6,7 +6,7 @@ import numpy as np import awkward as ak import h5py -#import progressbar +# import progressbar from tqdm import tqdm import os diff --git a/convert_full_model.py b/convert_full_model.py index f54971a2..ef42d3de 100644 --- a/convert_full_model.py +++ b/convert_full_model.py @@ -1,3 +1,4 @@ +import argparse import tensorflow from models import dense_embedding from tensorflow.keras.layers import Input, Concatenate @@ -10,7 +11,11 @@ from utils import preProcessing import h5py import scipy +import seaborn +import pandas as pd +import matplotlib.pyplot as plt +# TODO: what does this do? 
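+# (answer, as far as I can tell: _add_supported_quantized_objects comes from qkeras.utils and populates co with the QKeras custom layers and quantizers (e.g. QDense, QActivation, quantized_bits) so that load_model(..., custom_objects=co) can deserialize the quantized models)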
co = {} _add_supported_quantized_objects(co) @@ -26,140 +31,239 @@ def print_dict(d, indent=0): print(':' + ' ' * (20 - len(key) - 2 * indent) + str(value)) -# load full model: -model_name = 'trained_DeepMET' -# model_name = 'trained_quantized_DeepMET' -# model_name = 'trained_quantized_DeepMET_normfac1000' -model = tensorflow.keras.models.load_model(f'models/baseline_DeepMET{"_quantized" if "quantized" in model_name else ""}/{model_name}.h5', compile=False, custom_objects=co) - -reuse_factor = 1 -precision = 'ap_fixed<32,16>' -io_type = 'io_parallel' -strategy = 'Latency' -output_dir = 'hls_output_{}_{}_{}_rf{}_{}'.format(model_name ,io_type, strategy, reuse_factor, precision) -batch_size = 1 -synth = False -trace = True -normFac = 1 - -# check everthing works -model.summary() -model.save('{}/model.h5'.format(output_dir)) - -config = hls4ml.utils.config_from_keras_model(model, - granularity='name', - default_reuse_factor=reuse_factor, - default_precision=precision) -config['Model']['Strategy'] = strategy -for name in config['LayerName'].keys(): - config['LayerName'][name]['Trace'] = trace -config['LayerName']['input_cat0']['Precision']['result'] = 'ap_uint<4>' -config['LayerName']['input_cat1']['Precision']['result'] = 'ap_uint<4>' -# config['LayerName']['input_cont']['Precision']['result'] = 'ap_fixed<20,10>' -#if 'q_dense' in config['LayerName']: -# config['LayerName']['q_dense']['Precision']['accum'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense']['Precision']['weight'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense']['Precision']['bias'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense_1']['Precision']['accum'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense_1']['Precision']['weight'] = 'ap_fixed<32,16>' -# config['LayerName']['q_dense_1']['Precision']['bias'] = 'ap_fixed<32,16>' -config['LayerName']['multiply']['n_elem'] = 100 -config['LayerName']['output']['n_filt'] = 2 -# skip optimize_pointwise_conv -# config['SkipOptimizers'] = ['optimize_pointwise_conv'] -# for layer in config['LayerName'].keys(): -# config['LayerName'][layer]['Trace'] = True - -print("-----------------------------------") -print_dict(config) -print("-----------------------------------") -hls_model = hls4ml.converters.convert_from_keras_model(model, - hls_config=config, - io_type=io_type, - output_dir=output_dir, - part='xcvu13p-flga2577-2-e', - clock_period=5, - project_name='L1METML_v1', -) -hls_model.compile() - -hls4ml.utils.plot_model(hls_model, show_shapes=True, show_precision=True, to_file='{}/model_hls4ml.png'.format(output_dir)) - -if synth: - hls_model.build(synth=synth) - hls4ml.report.read_vivado_report(output_dir) - -f = h5py.File('data/test_data.h5') -# 1000 test events is good enough -X = f['X'][:1000] -y = -f['Y'][:1000] - -# preprocessing -X_pre = list(preProcessing(X, normFac=normFac)) -X_pre = [np.ascontiguousarray(x) for x in X_pre] - -y_pred = model.predict(X_pre) -y_hls = hls_model.predict(X_pre) - -met = np.hypot(y[:, 0], y[:, 1]) -met_pred = np.hypot(y_pred[:, 0], y_pred[:, 1]) * normFac -met_hls = np.hypot(y_hls[:, 0], y_hls[:, 1]) * normFac -met_pup_x = np.sum(X[:, :, 1], axis=-1) -met_pup_y = np.sum(X[:, :, 2], axis=-1) -met_pup = np.hypot(met_pup_x, met_pup_y) +def load_model(model_name): + if 'quantized' in model_name: + model = tensorflow.keras.models.load_model(f'models/baseline_DeepMET_quantized/{model_name}.h5', compile=False, custom_objects=co) + elif 'test' in model_name: + model = tensorflow.keras.models.load_model('test_12_36/model.h5', compile=False, 
custom_objects=co) + else: + model = tensorflow.keras.models.load_model(f'models/baseline_DeepMET/{model_name}.h5', compile=False) + return model -import seaborn -import pandas as pd -import matplotlib.pyplot as plt -df = pd.DataFrame.from_dict({'Gen MET': met, 'PUPPI MET': met_pup, 'QKeras MET': met_pred, 'hls4ml MET': met_hls}) -plt.figure() -seaborn.pairplot(df, corner=True) -plt.savefig(f'{output_dir}/profiling_MET.png', dpi=300) - -df = pd.DataFrame.from_dict({'Gen MET x': y[:, 0], 'PUPPI MET x': met_pup_x, 'QKeras MET x': y_pred[:, 0], 'hls4ml MET x': y_hls[:, 0]}) -plt.figure() -seaborn.pairplot(df, corner=True) -plt.savefig(f'{output_dir}/profiling_MET_x.png', dpi=300) - -df = pd.DataFrame.from_dict({'Gen MET y': y[:, 1], 'PUPPI MET y': met_pup_y, 'QKeras MET y': y_pred[:, 1], 'hls4ml MET y': y_hls[:, 1]}) -plt.figure() -seaborn.pairplot(df, corner=True) -plt.savefig(f'{output_dir}/profiling_MET_y.png', dpi=300) - -response_pup = met_pup / met -response_pred = met_pred / met -response_hls = met_hls / met -bins = np.linspace(0, 2, 25) -plt.figure(figsize=(12, 5)) -plt.subplot(1, 3, 1) -plt.hist(response_pup, bins=bins, label=f'PUPPI, median={np.median(response_pup):0.2f}, IQR={scipy.stats.iqr(response_pup):0.2f}') -plt.legend() -plt.xlabel("MET response $\hat{y}/y$") -plt.ylabel("Events") -plt.subplot(1, 3, 2) -plt.hist(response_pred, bins=bins, label=f'QKeras, median={np.median(response_pred):0.2f}, IQR={scipy.stats.iqr(response_pred):0.2f}') -plt.legend() -plt.xlabel("MET response $\hat{y}/y$") -plt.ylabel("Events") -plt.subplot(1, 3, 3) -plt.hist(response_hls, bins=bins, label=f'hls4ml, median={np.median(response_hls):0.2f}, IQR={scipy.stats.iqr(response_hls):0.2f}') -plt.legend() -plt.xlabel("MET response $\hat{y}/y$") -plt.ylabel("Events") -plt.tight_layout() -plt.savefig(f"{output_dir}/response_MET.png", dpi=300) - -y_hls, hls4ml_trace = hls_model.trace(X_pre) -keras_trace = hls4ml.model.profiling.get_ymodel_keras(model, X_pre) - -for layer in hls4ml_trace.keys(): +def configure_hls_model(model, config_params): + config = hls4ml.utils.config_from_keras_model( + model, + granularity='name', + default_reuse_factor=config_params['reuse-factor'], + default_precision=config_params['precision']) + config['Model']['Strategy'] = config_params['strategy'] + for name in config['LayerName'].keys(): + config['LayerName'][name]['Trace'] = config_params['trace'] + config['LayerName']['input_cat0']['Precision']['result'] = 'ap_uint<4>' + config['LayerName']['input_cat1']['Precision']['result'] = 'ap_uint<4>' + # config['LayerName']['input_cont']['Precision']['result'] = 'ap_fixed<20,10>' + # if 'q_dense' in config['LayerName']: + # config['LayerName']['q_dense']['Precision']['accum'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense']['Precision']['weight'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense']['Precision']['bias'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense_1']['Precision']['accum'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense_1']['Precision']['weight'] = 'ap_fixed<32,16>' + # config['LayerName']['q_dense_1']['Precision']['bias'] = 'ap_fixed<32,16>' + config['LayerName']['multiply']['n_elem'] = 100 + config['LayerName']['output']['n_filt'] = 2 + # skip optimize_pointwise_conv + # config['SkipOptimizers'] = ['optimize_pointwise_conv'] + # for layer in config['LayerName'].keys(): + # config['LayerName'][layer]['Trace'] = True + + print("-----------------------------------") + print_dict(config) + return config + + +def convert_to_hls_model(model, config, 
output_dir, io_type, part, clock_period, project_name): + print("-----------------------------------") + hls_model = hls4ml.converters.convert_from_keras_model(model, + hls_config=config, + io_type=io_type, + output_dir=output_dir, + part=part, + clock_period=clock_period, + project_name=project_name, + ) + hls_model.compile() + return hls_model + + +def preprocess_data(file_path, norm_factor): + with h5py.File(file_path, 'r') as f: + # 1000 test events is good enough + X = f['X'][:1000] + y = -f['Y'][:1000] + X_preprocessed = list(preProcessing(X, normFac=norm_factor)) + return [np.ascontiguousarray(x) for x in X_preprocessed], X, y + + +def plot_metrics(data_to_plot, hls_model, model, output_dir): + met = data_to_plot['met'] + met_pred = data_to_plot['met_pred'] + met_hls = data_to_plot['met_hls'] + met_pup = data_to_plot['met_pup'] + met_pup_x = data_to_plot['met_pup_x'] + met_pup_y = data_to_plot['met_pup_y'] + y_pred = data_to_plot['y_pred'] + y_hls = data_to_plot['y_hls'] + y = data_to_plot['y'] + X_pre = data_to_plot['x_pre'] + + df = pd.DataFrame.from_dict({ + 'Gen MET': met, + 'PUPPI MET': met_pup, + 'QKeras MET': met_pred, + 'hls4ml MET': met_hls, + }) plt.figure() - if layer not in keras_trace: continue - plt.scatter(hls4ml_trace[layer].flatten(), keras_trace[layer].flatten(), s=0.2) - min_x = min(np.amin(hls4ml_trace[layer]), np.amin(keras_trace[layer])) - max_x = max(np.amax(hls4ml_trace[layer]), np.amax(keras_trace[layer])) - plt.plot([min_x, max_x], [min_x, max_x], c='gray') - plt.xlabel(f'hls4ml {layer}') - plt.ylabel(f'QKeras {layer}') - plt.savefig(f'{output_dir}/profiling_{layer}.png', dpi=300) + seaborn.pairplot(df, corner=True) + plt.savefig(f'{output_dir}/profiling_MET.png', dpi=300) + plt.close() + + df = pd.DataFrame.from_dict( + {'Gen MET x': y[:, 0], + 'PUPPI MET x': met_pup_x, + 'QKeras MET x': y_pred[:, 0], + 'hls4ml MET x': y_hls[:, 0], + }) + plt.figure() + seaborn.pairplot(df, corner=True) + plt.savefig(f'{output_dir}/profiling_MET_x.png', dpi=300) + + df = pd.DataFrame.from_dict({ + 'Gen MET y': y[:, 1], + 'PUPPI MET y': met_pup_y, + 'QKeras MET y': y_pred[:, 1], + 'hls4ml MET y': y_hls[:, 1] + }) + plt.figure() + seaborn.pairplot(df, corner=True) + plt.savefig(f'{output_dir}/profiling_MET_y.png', dpi=300) + + response_pup = met_pup / met + response_pred = met_pred / met + response_hls = met_hls / met + bins = np.linspace(0, 2, 25) + plt.figure(figsize=(12, 5)) + plt.subplot(1, 3, 1) + plt.hist(response_pup, bins=bins, label=f'PUPPI, median={np.median(response_pup):0.2f}, IQR={scipy.stats.iqr(response_pup):0.2f}') + plt.legend() + plt.xlabel("MET response $\\hat{y}/y$") + plt.ylabel("Events") + plt.subplot(1, 3, 2) + plt.hist(response_pred, bins=bins, label=f'QKeras, median={np.median(response_pred):0.2f}, IQR={scipy.stats.iqr(response_pred):0.2f}') + plt.legend() + plt.xlabel("MET response $\\hat{y}/y$") + plt.ylabel("Events") + plt.subplot(1, 3, 3) + plt.hist(response_hls, bins=bins, label=f'hls4ml, median={np.median(response_hls):0.2f}, IQR={scipy.stats.iqr(response_hls):0.2f}') + plt.legend() + plt.xlabel("MET response $\\hat{y}/y$") + plt.ylabel("Events") + plt.tight_layout() + plt.savefig(f"{output_dir}/response_MET.png", dpi=300) + + y_hls, hls4ml_trace = hls_model.trace(X_pre) + keras_trace = hls4ml.model.profiling.get_ymodel_keras(model, X_pre) + + for layer in hls4ml_trace.keys(): + plt.figure() + if layer not in keras_trace: + continue + plt.scatter(hls4ml_trace[layer].flatten(), keras_trace[layer].flatten(), s=0.2) + min_x = 
min(np.amin(hls4ml_trace[layer]), np.amin(keras_trace[layer])) + max_x = max(np.amax(hls4ml_trace[layer]), np.amax(keras_trace[layer])) + plt.plot([min_x, max_x], [min_x, max_x], c='gray') + plt.xlabel(f'hls4ml {layer}') + plt.ylabel(f'QKeras {layer}') + plt.savefig(f'{output_dir}/profiling_{layer}.png', dpi=300) + + +def main(args): + model_name = args.model_name + + model = load_model(model_name) + + config_params = { + 'reuse-factor': 1, + 'strategy': 'Latency', + 'precision': 'ap_fixed<32,16>', + 'trace': True, + } + io_type = 'io_parallel' + output_dir = 'hls_output_{}_{}_{}_rf{}_{}'.format( + model_name, + io_type, + config_params['strategy'], + config_params['reuse-factor'], + config_params['precision'] + ) + batch_size = 1 + synth = False + trace = True + normFac = 1 # identify where NormFac is used (and how) and if it can be fed via argument + + # check everything works + model.summary() + model.save('{}/model.h5'.format(output_dir)) + + # create hls model + config = configure_hls_model(model, config_params) + hls_model = convert_to_hls_model(model, config, output_dir, io_type, 'xcvu13p-flga2577-2-e', 5, 'L1METML_v1') + + hls4ml.utils.plot_model(hls_model, show_shapes=True, show_precision=True, to_file='{}/model_hls4ml.png'.format(output_dir)) + + if synth: + hls_model.build(synth=synth) + hls4ml.report.read_vivado_report(output_dir) + + # load and preprocess data + X_pre, X, y = preprocess_data(args.data_path, norm_factor=normFac) + + y_pred = model.predict(X_pre) + y_hls = hls_model.predict(X_pre) + + met = np.hypot(y[:, 0], y[:, 1]) + met_pred = np.hypot(y_pred[:, 0], y_pred[:, 1]) * normFac + met_hls = np.hypot(y_hls[:, 0], y_hls[:, 1]) * normFac + met_pup_x = np.sum(X[:, :, 1], axis=-1) # does this need to be X_pre? previously X + met_pup_y = np.sum(X[:, :, 2], axis=-1) # does this need to be X_pre? 
previously X + met_pup = np.hypot(met_pup_x, met_pup_y) + + data_to_plot = { + 'met': met, + 'met_pred': met_pred, + 'met_hls': met_hls, + 'met_pup': met_pup, + 'met_pup_x': met_pup_x, + 'met_pup_y': met_pup_y, + 'y_pred': y_pred, + 'y_hls': y_hls, + 'y': y, + 'x_pre': X_pre, + } + + plot_metrics(data_to_plot, hls_model, model, output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + '--model-name', + type=str, + default='trained_DeepMET', + choices=[ + 'trained_DeepMET', + 'trained_quantized_DeepMET', + 'trained_quantized_DeepMET_normfac1000', + 'test_12_36'], + help='Model name') + parser.add_argument( + '--data-path', + type=str, + default='data/test_data.h5', + help='Location of data file (.h5 format)') + + args = parser.parse_args() + # TODO: figure what knobs are tuned here by the user and pass them as arguments + # TODO: refactor commented part of hls_config, potentially adding args or default values + main(args) diff --git a/hls_conversion_config.yaml b/hls_conversion_config.yaml new file mode 100644 index 00000000..203884c3 --- /dev/null +++ b/hls_conversion_config.yaml @@ -0,0 +1,19 @@ +model_name: "trained_DeepMET" # Choose from available models + +# not implemented yet + +# HLS config parameters +config_params: + reuse-factor: 1 + strategy: "Latency" + precision: "ap_fixed<32,16>" + trace: true + + +io_type: "io_parallel" +part: "xcvu13p-flga2577-2-e" +clock_period: 5 +project_name: "L1METML_v1" +batch_size: 1 +synth: false +normFac: 1 # Identify where NormFac is used and if it can be fed via argument diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>.tar.gz b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>.tar.gz new file mode 100644 index 00000000..68bcd4a2 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>.tar.gz differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_bridge.cpp b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_bridge.cpp new file mode 100644 index 00000000..ed18b460 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_bridge.cpp @@ -0,0 +1,104 @@ +#ifndef L1METML_V1_BRIDGE_H_ +#define L1METML_V1_BRIDGE_H_ + +#include "firmware/L1METML_v1.h" +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + nnet::trace_outputs->insert(std::pair("embedding0", (void *) malloc(N_LAYER_1_3*N_LAYER_2_3 * element_size))); + nnet::trace_outputs->insert(std::pair("embedding1", (void *) malloc(N_LAYER_1_4*N_LAYER_2_4 * element_size))); + nnet::trace_outputs->insert(std::pair("concatenate", (void *) malloc(OUT_CONCAT_0_6*OUT_CONCAT_1_6 * element_size))); + nnet::trace_outputs->insert(std::pair("concatenate_1", (void *) malloc(OUT_CONCAT_0_7*OUT_CONCAT_1_7 * element_size))); + nnet::trace_outputs->insert(std::pair("dense", (void *) malloc(N_OUTPUTS_22*N_FILT_22 * element_size))); + nnet::trace_outputs->insert(std::pair("activation", (void 
*) malloc(N_LAYER_1_8*N_LAYER_2_8 * element_size))); + nnet::trace_outputs->insert(std::pair("dense_1", (void *) malloc(N_OUTPUTS_23*N_FILT_23 * element_size))); + nnet::trace_outputs->insert(std::pair("activation_1", (void *) malloc(N_LAYER_1_12*N_LAYER_2_12 * element_size))); + nnet::trace_outputs->insert(std::pair("met_weight", (void *) malloc(N_OUTPUTS_24*N_FILT_24 * element_size))); + nnet::trace_outputs->insert(std::pair("multiply", (void *) malloc(N_INPUT_1_19*N_INPUT_2_19 * element_size))); + nnet::trace_outputs->insert(std::pair("output", (void *) malloc(N_FILT_21 * element_size))); +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void L1METML_v1_float( + float input_cont[N_INPUT_1_5*N_INPUT_2_5], float input_pxpy[N_INPUT_1_19*N_INPUT_2_19], float input_cat0[N_INPUT_1_1], float input_cat1[N_INPUT_1_2], + float layer21_out[N_FILT_21] +) { + + input5_t input_cont_ap[N_INPUT_1_5*N_INPUT_2_5]; + nnet::convert_data(input_cont, input_cont_ap); + input19_t input_pxpy_ap[N_INPUT_1_19*N_INPUT_2_19]; + nnet::convert_data(input_pxpy, input_pxpy_ap); + input_t input_cat0_ap[N_INPUT_1_1]; + nnet::convert_data(input_cat0, input_cat0_ap); + input2_t input_cat1_ap[N_INPUT_1_2]; + nnet::convert_data(input_cat1, input_cat1_ap); + + result_t layer21_out_ap[N_FILT_21]; + + L1METML_v1(input_cont_ap,input_pxpy_ap,input_cat0_ap,input_cat1_ap,layer21_out_ap); + + nnet::convert_data(layer21_out_ap, layer21_out); +} + +void L1METML_v1_double( + double input_cont[N_INPUT_1_5*N_INPUT_2_5], double input_pxpy[N_INPUT_1_19*N_INPUT_2_19], double input_cat0[N_INPUT_1_1], double input_cat1[N_INPUT_1_2], + double layer21_out[N_FILT_21] +) { + input5_t input_cont_ap[N_INPUT_1_5*N_INPUT_2_5]; + nnet::convert_data(input_cont, input_cont_ap); + input19_t input_pxpy_ap[N_INPUT_1_19*N_INPUT_2_19]; + nnet::convert_data(input_pxpy, input_pxpy_ap); + input_t input_cat0_ap[N_INPUT_1_1]; + nnet::convert_data(input_cat0, input_cat0_ap); + input2_t input_cat1_ap[N_INPUT_1_2]; + nnet::convert_data(input_cat1, input_cat1_ap); + + result_t layer21_out_ap[N_FILT_21]; + + L1METML_v1(input_cont_ap,input_pxpy_ap,input_cat0_ap,input_cat1_ap,layer21_out_ap); + + nnet::convert_data(layer21_out_ap, layer21_out); +} +} + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_test.cpp b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_test.cpp new file mode 100644 index 00000000..1c452f68 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/L1METML_v1_test.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/L1METML_v1.h" +#include "firmware/nnet_utils/nnet_helpers.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +int 
main(int argc, char **argv) { + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5]; + nnet::copy_data(in, input_cont); + input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19]; + nnet::copy_data(in, input_pxpy); + input_t input_cat0[N_INPUT_1_1]; + nnet::copy_data(in, input_cat0); + input2_t input_cat1[N_INPUT_1_2]; + nnet::copy_data(in, input_cat1); + result_t layer21_out[N_FILT_21]; + + // hls-fpga-machine-learning insert top-level-function + L1METML_v1(input_cont,input_pxpy,input_cat0,input_cat1,layer21_out); + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for(int i = 0; i < N_FILT_21; i++) { + std::cout << pr[i] << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + nnet::print_result(layer21_out, std::cout, true); + } + e++; + + // hls-fpga-machine-learning insert tb-output + nnet::print_result(layer21_out, fout); + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." 
<< std::endl; + + // hls-fpga-machine-learning insert zero + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5]; + nnet::fill_zero(input_cont); + input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19]; + nnet::fill_zero(input_pxpy); + input_t input_cat0[N_INPUT_1_1]; + nnet::fill_zero(input_cat0); + input2_t input_cat1[N_INPUT_1_2]; + nnet::fill_zero(input_cat1); + result_t layer21_out[N_FILT_21]; + + // hls-fpga-machine-learning insert top-level-function + L1METML_v1(input_cont,input_pxpy,input_cat0,input_cat1,layer21_out); + + // hls-fpga-machine-learning insert output + nnet::print_result(layer21_out, std::cout, true); + + // hls-fpga-machine-learning insert tb-output + nnet::print_result(layer21_out, fout); + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_lib.sh b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_lib.sh new file mode 100644 index 00000000..d60a2dd3 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_lib.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +LDFLAGS= +INCFLAGS="-Ifirmware/ap_types/" +PROJECT=L1METML_v1 +LIB_STAMP=95715E3e + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_prj.tcl b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_prj.tcl new file mode 100644 index 00000000..82b3c5a6 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/build_prj.tcl @@ -0,0 +1,250 @@ +################# +# HLS4ML +################# +array set opt { + reset 0 + csim 1 + synth 1 + cosim 1 + validation 1 + export 0 + vsynth 0 + fifo_opt 0 +} + +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +proc remove_recursive_log_wave {} { + set tcldir [file dirname [info script]] + source [file join $tcldir project.tcl] + + set filename ${project_name}_prj/solution1/sim/verilog/${project_name}.tcl + set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}] + set temp $filename.new.$timestamp + # set backup $filename.bak.$timestamp + + set in [open $filename r] + set out [open $temp w] + + # line-by-line, read the original file + while {[gets $in line] != -1} { + if {[string equal "$line" "log_wave -r /"]} { + set line { } + } + puts $out $line + } + + close $in + close $out + + # move the new data to the proper filename + file delete -force $filename + file rename -force $temp $filename +} + +proc add_vcd_instructions_tcl {} { + set tcldir [file dirname [info script]] + source [file join $tcldir project.tcl] + + set filename ${project_name}_prj/solution1/sim/verilog/${project_name}.tcl + set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}] + set temp $filename.new.$timestamp + # set backup $filename.bak.$timestamp + + set in [open $filename r] + set out [open $temp w] + + # line-by-line, read the original file + while {[gets $in line] != -1} { + if {[string equal "$line" "log_wave -r /"]} { + set line {source 
"../../../../project.tcl" + if {[string equal "$backend" "vivadoaccelerator"]} { + current_scope [get_scopes -regex "/apatb_${project_name}_axi_top/AESL_inst_${project_name}_axi/${project_name}_U0.*"] + set scopes [get_scopes -regexp {layer(\d*)_.*data_0_V_U.*}] + append scopes { } + current_scope "/apatb_${project_name}_axi_top/AESL_inst_${project_name}_axi" + append scopes [get_scopes -regexp {(in_local_V_data.*_0_.*)}] + append scopes { } + append scopes [get_scopes -regexp {(out_local_V_data.*_0_.*)}] + } else { + current_scope [get_scopes -regex "/apatb_${project_name}_top/AESL_inst_${project_name}"] + set scopes [get_scopes -regexp {layer(\d*)_.*data_0_V_U.*}] + } + open_vcd fifo_opt.vcd + foreach scope $scopes { + current_scope $scope + if {[catch [get_objects usedw]] == 0} { + puts "$scope skipped" + continue + } + set usedw [get_objects usedw] + set depth [get_objects DEPTH] + add_wave $usedw + log_vcd $usedw + log_wave $usedw + add_wave $depth + log_vcd $depth + log_wave $depth + } + } + } + + if {[string equal "$line" "quit"]} { + set line {flush_vcd + close_vcd + quit + } + } + # then write the transformed line + puts $out $line + } + + close $in + close $out + + # move the new data to the proper filename + file delete -force $filename + file rename -force $temp $filename +} + +foreach arg $::argv { + foreach o [lsort [array names opt]] { + regexp "$o=+(\\w+)" $arg unused opt($o) + } +} + +proc report_time { op_name time_start time_end } { + set time_taken [expr $time_end - $time_start] + set time_s [expr ($time_taken / 1000) % 60] + set time_m [expr ($time_taken / (1000*60)) % 60] + set time_h [expr ($time_taken / (1000*60*60)) % 24] + puts "***** ${op_name} COMPLETED IN ${time_h}h${time_m}m${time_s}s *****" +} + +# Compare file content: 1 = same, 0 = different +proc compare_files {file_1 file_2} { + # Check if files exist, error otherwise + if {! 
([file exists $file_1] && [file exists $file_2])} { + return 0 + } + # Files with different sizes are obviously different + if {[file size $file_1] != [file size $file_2]} { + return 0 + } + + # String compare the content of the files + set fh_1 [open $file_1 r] + set fh_2 [open $file_2 r] + set equal [string equal [read $fh_1] [read $fh_2]] + close $fh_1 + close $fh_2 + return $equal +} + +file mkdir tb_data +set CSIM_RESULTS "./tb_data/csim_results.log" +set RTL_COSIM_RESULTS "./tb_data/rtl_cosim_results.log" + +if {$opt(reset)} { + open_project -reset ${project_name}_prj +} else { + open_project ${project_name}_prj +} +set_top ${project_name} +add_files firmware/${project_name}.cpp -cflags "-std=c++0x" +add_files -tb ${project_name}_test.cpp -cflags "-std=c++0x" +add_files -tb firmware/weights +add_files -tb tb_data +if {$opt(reset)} { + open_solution -reset "solution1" +} else { + open_solution "solution1" +} +catch {config_array_partition -maximum_size 8192} +config_compile -name_max_length 80 +set_part $part +config_schedule -enable_dsp_full_reg=false +create_clock -period $clock_period -name default +set_clock_uncertainty $clock_uncertainty default + + +if {$opt(csim)} { + puts "***** C SIMULATION *****" + set time_start [clock clicks -milliseconds] + csim_design + set time_end [clock clicks -milliseconds] + report_time "C SIMULATION" $time_start $time_end +} + +if {$opt(synth)} { + puts "***** C/RTL SYNTHESIS *****" + set time_start [clock clicks -milliseconds] + csynth_design + set time_end [clock clicks -milliseconds] + report_time "C/RTL SYNTHESIS" $time_start $time_end +} + +if {$opt(cosim)} { + puts "***** C/RTL SIMULATION *****" + # TODO: This is a workaround (Xilinx defines __RTL_SIMULATION__ only for SystemC testbenches). + add_files -tb ${project_name}_test.cpp -cflags "-std=c++0x -DRTL_SIM" + set time_start [clock clicks -milliseconds] + + cosim_design -trace_level all -setup + + if {$opt(fifo_opt)} { + puts "\[hls4ml\] - FIFO optimization started" + add_vcd_instructions_tcl + } + + remove_recursive_log_wave + set old_pwd [pwd] + cd ${project_name}_prj/solution1/sim/verilog/ + source run_sim.tcl + cd $old_pwd + + set time_end [clock clicks -milliseconds] + puts "INFO:" + if {[string equal "$backend" "vivadoaccelerator"]} { + puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] + } else { + puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] + } + report_time "C/RTL SIMULATION" $time_start $time_end +} + +if {$opt(validation)} { + puts "***** C/RTL VALIDATION *****" + if {[compare_files $CSIM_RESULTS $RTL_COSIM_RESULTS]} { + puts "INFO: Test PASSED" + } else { + puts "ERROR: Test failed" + puts "ERROR: - csim log: $CSIM_RESULTS" + puts "ERROR: - RTL-cosim log: $RTL_COSIM_RESULTS" + exit 1 + } +} + +if {$opt(export)} { + puts "***** EXPORT IP *****" + set time_start [clock clicks -milliseconds] + export_design -format ip_catalog -version $version + set time_end [clock clicks -milliseconds] + report_time "EXPORT IP" $time_start $time_end +} + +if {$opt(vsynth)} { + puts "***** VIVADO SYNTHESIS *****" + if {[file exist ${project_name}_prj/solution1/syn/vhdl]} { + set time_start [clock clicks -milliseconds] + exec vivado -mode batch -source vivado_synth.tcl >@ stdout + set time_end [clock clicks -milliseconds] + report_time "VIVADO SYNTHESIS" $time_start $time_end + } else { + puts "ERROR: Cannot find generated VHDL files. Did you run C synthesis?" 
+ exit 1 + } +} + +exit diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-2Bd4CD9f.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-2Bd4CD9f.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-2Bd4CD9f.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-87B65ff2.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-87B65ff2.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-87B65ff2.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-8aEF503a.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-8aEF503a.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-8aEF503a.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-95715E3e.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-95715E3e.so new file mode 100755 index 00000000..7b00d3fa Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-95715E3e.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-B1BDE0dd.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-B1BDE0dd.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-B1BDE0dd.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-CEB54420.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-CEB54420.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-CEB54420.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-F1DF32D7.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-F1DF32D7.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-F1DF32D7.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-dDAfeD3b.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-dDAfeD3b.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-dDAfeD3b.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-ecB7D1bC.so b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-ecB7D1bC.so new file mode 100755 index 00000000..2a7b45ec Binary files /dev/null and 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1-ecB7D1bC.so differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.cpp b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.cpp new file mode 100644 index 00000000..18ef7438 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.cpp @@ -0,0 +1,117 @@ +#include + +#include "L1METML_v1.h" +#include "parameters.h" + +void L1METML_v1( + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5], input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19], input_t input_cat0[N_INPUT_1_1], input2_t input_cat1[N_INPUT_1_2], + result_t layer21_out[N_FILT_21] +) { + + // hls-fpga-machine-learning insert IO + #pragma HLS ARRAY_RESHAPE variable=input_cont complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=input_pxpy complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=input_cat0 complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=input_cat1 complete dim=0 + #pragma HLS ARRAY_PARTITION variable=layer21_out complete dim=0 + #pragma HLS INTERFACE ap_vld port=input_cont,input_pxpy,input_cat0,input_cat1,layer21_out + #pragma HLS DATAFLOW + +#ifndef __SYNTHESIS__ + static bool loaded_weights = false; + if (!loaded_weights) { + // hls-fpga-machine-learning insert load weights + nnet::load_weights_from_txt(e3, "e3.txt"); + nnet::load_weights_from_txt(e4, "e4.txt"); + nnet::load_weights_from_txt(w22, "w22.txt"); + nnet::load_weights_from_txt(b22, "b22.txt"); + nnet::load_weights_from_txt(w23, "w23.txt"); + nnet::load_weights_from_txt(b23, "b23.txt"); + nnet::load_weights_from_txt(w24, "w24.txt"); + nnet::load_weights_from_txt(b24, "b24.txt"); + loaded_weights = true; + } +#endif + + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning insert layers + + layer3_t layer3_out[N_LAYER_1_3*N_LAYER_2_3]; + #pragma HLS ARRAY_PARTITION variable=layer3_out complete dim=0 + nnet::embedding(input_cat0, layer3_out, e3); // embedding0 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer3_out, "embedding0", N_LAYER_1_3*N_LAYER_2_3); +#endif + + layer4_t layer4_out[N_LAYER_1_4*N_LAYER_2_4]; + #pragma HLS ARRAY_PARTITION variable=layer4_out complete dim=0 + nnet::embedding(input_cat1, layer4_out, e4); // embedding1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer4_out, "embedding1", N_LAYER_1_4*N_LAYER_2_4); +#endif + + layer6_t layer6_out[OUT_CONCAT_0_6*OUT_CONCAT_1_6]; + #pragma HLS ARRAY_PARTITION variable=layer6_out complete dim=0 + nnet::concatenate2d(layer3_out, layer4_out, layer6_out); // concatenate +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer6_out, "concatenate", OUT_CONCAT_0_6*OUT_CONCAT_1_6); +#endif + + layer7_t layer7_out[OUT_CONCAT_0_7*OUT_CONCAT_1_7]; + #pragma HLS ARRAY_PARTITION variable=layer7_out complete dim=0 + nnet::concatenate2d(input_cont, layer6_out, layer7_out); // concatenate_1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer7_out, "concatenate_1", OUT_CONCAT_0_7*OUT_CONCAT_1_7); +#endif + + layer22_t layer22_out[N_OUTPUTS_22*N_FILT_22]; + #pragma HLS ARRAY_PARTITION variable=layer22_out complete dim=0 + nnet::pointwise_conv_1d_cl(layer7_out, layer22_out, w22, b22); // dense +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer22_out, "dense", N_OUTPUTS_22*N_FILT_22); +#endif + + layer11_t layer11_out[N_LAYER_1_8*N_LAYER_2_8]; + #pragma HLS ARRAY_PARTITION variable=layer11_out 
complete dim=0 + nnet::tanh(layer22_out, layer11_out); // activation +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer11_out, "activation", N_LAYER_1_8*N_LAYER_2_8); +#endif + + layer23_t layer23_out[N_OUTPUTS_23*N_FILT_23]; + #pragma HLS ARRAY_PARTITION variable=layer23_out complete dim=0 + nnet::pointwise_conv_1d_cl(layer11_out, layer23_out, w23, b23); // dense_1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer23_out, "dense_1", N_OUTPUTS_23*N_FILT_23); +#endif + + layer15_t layer15_out[N_LAYER_1_12*N_LAYER_2_12]; + #pragma HLS ARRAY_PARTITION variable=layer15_out complete dim=0 + nnet::tanh(layer23_out, layer15_out); // activation_1 +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer15_out, "activation_1", N_LAYER_1_12*N_LAYER_2_12); +#endif + + layer24_t layer24_out[N_OUTPUTS_24*N_FILT_24]; + #pragma HLS ARRAY_PARTITION variable=layer24_out complete dim=0 + nnet::pointwise_conv_1d_cl(layer15_out, layer24_out, w24, b24); // met_weight +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer24_out, "met_weight", N_OUTPUTS_24*N_FILT_24); +#endif + + layer20_t layer20_out[N_INPUT_1_19*N_INPUT_2_19]; + #pragma HLS ARRAY_PARTITION variable=layer20_out complete dim=0 + nnet::multiply(layer24_out, input_pxpy, layer20_out); // multiply +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer20_out, "multiply", N_INPUT_1_19*N_INPUT_2_19); +#endif + + nnet::global_pooling1d_cl(layer20_out, layer21_out); // output +#ifndef __SYNTHESIS__ + nnet::save_layer_output(layer21_out, "output", N_FILT_21); +#endif + +} diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.h new file mode 100644 index 00000000..69dd92ca --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/L1METML_v1.h @@ -0,0 +1,16 @@ +#ifndef L1METML_V1_H_ +#define L1METML_V1_H_ + +#include "ap_fixed.h" +#include "ap_int.h" +#include "hls_stream.h" + +#include "defines.h" + +// Prototype of top level function for C-synthesis +void L1METML_v1( + input5_t input_cont[N_INPUT_1_5*N_INPUT_2_5], input19_t input_pxpy[N_INPUT_1_19*N_INPUT_2_19], input_t input_cat0[N_INPUT_1_1], input2_t input_cat1[N_INPUT_1_2], + result_t layer21_out[N_FILT_21] +); + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_common.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_common.h new file mode 100644 index 00000000..4d2886cb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_common.h @@ -0,0 +1,376 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_COMMON_H__ +#define __AP_COMMON_H__ + +// ---------------------------------------------------------------------- + +// Forward declaration of all AP types. 
+#include + + +#ifdef __SYNTHESIS__ +#error "The open-source version of AP types does not support synthesis." +#endif // ifdef __SYNTHESIS__ +#define _AP_ENABLE_HALF_ 0 + + +#if _AP_ENABLE_HALF_ == 1 +// Before ap_private definition. +#ifdef __SYNTHESIS__ +#define _HLS_HALF_DEFINED_ +typedef __fp16 half; +#else +class half; +#endif // __SYNTHESIS__ +#endif // _AP_ENABLE_HALF_ + +// ---------------------------------------------------------------------- + +// Macro functions +#define AP_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define AP_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define AP_ABS(a) ((a) >= 0 ? (a) : -(a)) + +#ifndef AP_ASSERT +#ifndef __SYNTHESIS__ +#include +#define AP_ASSERT(cond, msg) assert((cond) && (msg)) +#else +#define AP_ASSERT(cond, msg) +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_ASSERT + +#ifndef __SYNTHESIS__ +// for fprintf messages. +#include +// for exit on error. +#include +#endif + +// same disable condition as assert. +#if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define _AP_DEBUG(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "DEBUG: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_WARNING(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "WARNING: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_ERROR(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "ERROR: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + abort(); \ + } \ + } while (0) + +#else // if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define __AP_VOID_CAST static_cast +#define _AP_DEBUG(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_WARNING(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_ERROR(cond, ...) (__AP_VOID_CAST(0)) + +#endif // if !defined(__SYNTHESIS__) && !defined(NDEBUG) else + +// ---------------------------------------------------------------------- + +// Attribute only for synthesis +#ifdef __SYNTHESIS__ +#define INLINE inline __attribute__((always_inline)) +//#define INLINE inline __attribute__((noinline)) +#else +#define INLINE inline +#endif + +#define AP_WEAK +// __attribute__((weak)) + +#ifndef AP_INT_MAX_W +#define AP_INT_MAX_W 1024 +#endif + +#define BIT_WIDTH_UPPER_LIMIT (1 << 15) +#if AP_INT_MAX_W > BIT_WIDTH_UPPER_LIMIT +#error "Bitwidth exceeds 32768 (1 << 15), the maximum allowed value" +#endif + +#define MAX_MODE(BITS) ((BITS + 1023) / 1024) + +// ---------------------------------------------------------------------- + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +// for overload operator<< +#include +#endif +#endif // ifndef AP_AUTOCC + +#ifndef __SYNTHESIS__ +// for string format. +#include +// for string. +#include +#endif + +// for detecting if char is signed. +enum { CHAR_IS_SIGNED = (char)-1 < 0 }; + +// TODO we have similar traits in x_hls_utils.h, should consider unify. 
+namespace _ap_type { +template +struct is_signed { + static const bool value = _Tp(-1) < _Tp(1); +}; + +template +struct is_integral { + static const bool value = false; +}; +#define DEF_IS_INTEGRAL(CTYPE) \ + template <> \ + struct is_integral { \ + static const bool value = true; \ + }; +DEF_IS_INTEGRAL(bool) +DEF_IS_INTEGRAL(char) +DEF_IS_INTEGRAL(signed char) +DEF_IS_INTEGRAL(unsigned char) +DEF_IS_INTEGRAL(short) +DEF_IS_INTEGRAL(unsigned short) +DEF_IS_INTEGRAL(int) +DEF_IS_INTEGRAL(unsigned int) +DEF_IS_INTEGRAL(long) +DEF_IS_INTEGRAL(unsigned long) +DEF_IS_INTEGRAL(ap_slong) +DEF_IS_INTEGRAL(ap_ulong) +#undef DEF_IS_INTEGRAL + +template +struct enable_if {}; +// partial specialization for true +template +struct enable_if { + typedef _Tp type; +}; + +template +struct remove_const { + typedef _Tp type; +}; + +template +struct remove_const<_Tp const> { + typedef _Tp type; +}; +} // namespace _ap_type + +// ---------------------------------------------------------------------- + +// Define ssdm_int and _ssdm_op. +// XXX deleted in open-source version + +#ifndef NON_C99STRING +#define _AP_C99 true +#else +#define _AP_C99 false +#endif + +static inline unsigned char guess_radix(const char* s) { + unsigned char rd = 10; ///< default radix + const char* p = s; + // skip neg sign if it exists + if (p[0] == '-' || p[0] == '+') ++p; + // guess based on following two bits. + if (p[0] == '0') { + if (p[1] == 'b' || p[1] == 'B') { + rd = 2; + } else if (p[1] == 'o' || p[1] == 'O') { + rd = 8; + } else if (p[1] == 'x' || p[1] == 'X') { + rd = 16; + } else if (p[1] == 'd' || p[1] == 'D') { + rd = 10; + } + } + return rd; +} + +// ---------------------------------------------------------------------- + +// Basic integral struct upon which ap_int and ap_fixed are defined. +#ifdef __SYNTHESIS__ +// Use ssdm_int, a compiler dependent, attribute constrained integeral type as +// basic data type. +#define _AP_ROOT_TYPE ssdm_int +// Basic ops. +#define _AP_ROOT_op_concat(Ret, X, Y) _ssdm_op_concat(Ret, X, Y) +#define _AP_ROOT_op_get_bit(Val, Bit) _ssdm_op_get_bit(Val, Bit) +#define _AP_ROOT_op_set_bit(Val, Bit, Repl) _ssdm_op_set_bit(Val, Bit, Repl) +#define _AP_ROOT_op_get_range(Val, Lo, Hi) _ssdm_op_get_range(Val, Lo, Hi) +#define _AP_ROOT_op_set_range(Val, Lo, Hi, Repl) \ + _ssdm_op_set_range(Val, Lo, Hi, Repl) +#define _AP_ROOT_op_reduce(Op, Val) _ssdm_op_reduce(Op, Val) +#else // ifdef __SYNTHESIS__ +// Use ap_private for compiler-independent basic data type +template +class ap_private; +/// model ssdm_int in standard C++ for simulation. +template +struct ssdm_int_sim { + /// integral type with template-specified width and signedness. + ap_private<_AP_W, _AP_S> V; + ssdm_int_sim() {} +}; +#define _AP_ROOT_TYPE ssdm_int_sim +// private's ref uses _AP_ROOT_TYPE. +#include +// XXX The C-sim model cannot use GCC-extension +// Basic ops. Ret and Val are ap_private. +template +inline _Tp1 _AP_ROOT_op_concat(const _Tp1& Ret, const _Tp2& X, const _Tp3& Y) { + _Tp1 r = (X).operator,(Y); + return r; +} +#define _AP_ROOT_op_get_bit(Val, Bit) (Val).get_bit((Bit)) +template +inline _Tp1& _AP_ROOT_op_set_bit(_Tp1& Val, const _Tp2& Bit, const _Tp3& Repl) { + (Val).set_bit((Bit), (Repl)); + return Val; +} +// notice the order of high and low index is different in ssdm call and +// ap_private.range()... 
+#define _AP_ROOT_op_get_range(Val, Lo, Hi) (Val).range((Hi), (Lo)) +template +inline _Tp1& _AP_ROOT_op_set_range(_Tp1& Val, const _Tp2& Lo, const _Tp3& Hi, + const _Tp4& Repl) { + (Val).range((Hi), (Lo)) = Repl; + return (Val); +} +#define _AP_ROOT_op_and_reduce(Val) (Val).and_reduce() +#define _AP_ROOT_op_nand_reduce(Val) (Val).nand_reduce() +#define _AP_ROOT_op_or_reduce(Val) (Val).or_reduce() +#define _AP_ROOT_op_xor_reduce(Val) (Val).xor_reduce() +// ## is the concatenation in preprocessor: +#define _AP_ROOT_op_reduce(Op, Val) _AP_ROOT_op_##Op##_reduce(Val) +#endif // ifdef __SYNTHESIS__ else + +// ---------------------------------------------------------------------- + +// Constants for half, single, double pricision floating points +#define HALF_MAN 10 +#define FLOAT_MAN 23 +#define DOUBLE_MAN 52 + +#define HALF_EXP 5 +#define FLOAT_EXP 8 +#define DOUBLE_EXP 11 + +#define BIAS(e) ((1L << (e - 1L)) - 1L) +#define HALF_BIAS BIAS(HALF_EXP) +#define FLOAT_BIAS BIAS(FLOAT_EXP) +#define DOUBLE_BIAS BIAS(DOUBLE_EXP) + +#define APFX_IEEE_DOUBLE_E_MAX DOUBLE_BIAS +#define APFX_IEEE_DOUBLE_E_MIN (-DOUBLE_BIAS + 1) + +INLINE ap_ulong doubleToRawBits(double pf) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +INLINE unsigned int floatToRawBits(float pf) { + union { + unsigned int __L; + float __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +#if _AP_ENABLE_HALF_ == 1 +INLINE unsigned short halfToRawBits(half pf) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__D = pf; + return LD.__L; +#else + return pf.get_bits(); +#endif +} +#endif + +// usigned long long is at least 64-bit +INLINE double rawBitsToDouble(ap_ulong pi) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +// long is at least 32-bit +INLINE float rawBitsToFloat(unsigned long pi) { + union { + unsigned int __L; + float __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +#if _AP_ENABLE_HALF_ == 1 +// short is at least 16-bit +INLINE half rawBitsToHalf(unsigned short pi) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__L = pi; + return LD.__D; +#else + // sim model of half has a non-trivial constructor + half __D; + __D.set_bits(pi); + return __D; +#endif +} +#endif + +#endif // ifndef __AP_COMMON_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_decl.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_decl.h new file mode 100644 index 00000000..ddd00f1c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_decl.h @@ -0,0 +1,212 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __AP_DECL_H__ +#define __AP_DECL_H__ + +// ---------------------------------------------------------------------- + +#if !defined(__AP_FIXED_H__) && !defined(__AP_INT_H__) && !defined(__AUTOPILOT_CBE_H__) && !defined(__HLS_HALF_H__) +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// Test __SYNTHESIS__ only for mode +#if !defined(__SYNTHESIS__) && (defined(AESL_SYN) || defined(__HLS_SYN__)) +//#pragma message "AESL_SYN and __HLS_SYN__ should be replaced by __SYNTHESIS__" +#define __SYNTHESIS__ +#endif + +/* for safety*/ +#if (defined(_AP_N) || defined(_AP_C)) +#error One or more of the following is defined: _AP_N, _AP_C. Definition conflicts with their usage as template parameters. +#endif + +/* for safety*/ +#if (defined(_AP_W) || defined(_AP_I) || defined(_AP_S) || defined(_AP_Q) || \ + defined(_AP_O) || defined(_AP_W2) || defined(_AP_I2) || \ + defined(_AP_S2) || defined(_AP_Q2) || defined(_AP_O2) || \ + defined(_AP_N) || defined(_AP_N2)) +#error \ + "One or more of the following is defined: _AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N, _AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2. Definition conflicts with their usage as template parameters." +#endif + +/*for safety*/ +#if (defined(_AP_W3) || defined(_AP_S3) || defined(_AP_W4) || defined(_AP_S4)) +#error \ + "One or more of the following is defined: _AP_W3, _AP_S3, _AP_W4,_AP_S4. Definition conflicts with their usage as template parameters." +#endif + +#if (defined(_AP_W1) || defined(_AP_S1) || defined(_AP_T) || \ + defined(_AP_T1) || defined(_AP_T2) || defined(_AP_T3) || defined(_AP_T4)) +#error \ + "One or more of the following is defined: _AP_W1, _AP_S1, _AP_T, _AP_T1, _AP_T2, _AP_T3, _AP_T4. Definition conflicts with their usage as template parameters." +#endif + +#ifndef __cplusplus +#error "AP data type can only be used in C++" +#endif + +// ---------------------------------------------------------------------- + +#ifndef __SC_COMPATIBLE__ +/// ap_fixed quantification mode +enum ap_q_mode { + AP_RND, //< rounding to plus infinity + AP_RND_ZERO, //< rounding to zero + AP_RND_MIN_INF, //< rounding to minus infinity + AP_RND_INF, //< rounding to infinity + AP_RND_CONV, //< convergent rounding + AP_TRN, //< truncation + AP_TRN_ZERO, //< truncation to zero +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_RND AP_RND +#define SC_RND_ZERO AP_RND_ZERO +#define SC_RND_MIN_INF AP_RND_MIN_INF +#define SC_RND_INF AP_RND_INF +#define SC_RND_CONV AP_RND_CONV +#define SC_TRN AP_TRN +#define SC_TRN_ZERO AP_TRN_ZERO +#endif // !defined(SYSTEMC_INCLUDED) + +/// ap_fixed saturation mode +enum ap_o_mode { + AP_SAT, //< saturation + AP_SAT_ZERO, //< saturation to zero + AP_SAT_SYM, //< symmetrical saturation + AP_WRAP, //< wrap-around (*) + AP_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_SAT AP_SAT +#define SC_SAT_ZERO AP_SAT_ZERO +#define SC_SAT_SYM AP_SAT_SYM +#define SC_WRAP AP_WRAP +#define SC_WRAP_SM AP_WRAP_SM +#endif // !defined(SYSTEMC_INCLUDED) + +#else // defined(__SC_COMPATIBLE__) + +// There will not be sc_fxdefs.h, and the emu should be defined by ap_fixed. 
+ +/// ap_fixed quantification mode +enum ap_q_mode { + SC_RND, //< rounding to plus infinity + SC_RND_ZERO, //< rounding to zero + SC_RND_MIN_INF, //< rounding to minus infinity + SC_RND_INF, //< rounding to infinity + SC_RND_CONV, //< convergent rounding + SC_TRN, //< truncation + SC_TRN_ZERO, //< truncation to zero +}; + +#define AP_RND SC_RND +#define AP_RND_ZERO SC_RND_ZERO +#define AP_RND_MIN_INF SC_RND_MIN_INF +#define AP_RND_INF SC_RND_INF +#define AP_RND_CONV SC_RND_CONV +#define AP_TRN SC_TRN +#define AP_TRN_ZERO SC_TRN_ZERO + +/// ap_fixed saturation mode +enum ap_o_mode { + SC_SAT, //< saturation + SC_SAT_ZERO, //< saturation to zero + SC_SAT_SYM, //< symmetrical saturation + SC_WRAP, //< wrap-around (*) + SC_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +#define AP_SAT SC_SAT +#define AP_SAT_ZERO SC_SAT_ZERO +#define AP_SAT_SYM SC_SAT_SYM +#define AP_WRAP SC_WRAP +#define AP_WRAP_SM SC_WRAP_SM + +#endif // defined(__SC_COMPATIBLE__) + +template +struct ap_int_base; + +template +struct ap_int; + +template +struct ap_uint; + +template +struct ap_range_ref; + +template +struct ap_bit_ref; + +template +struct ap_concat_ref; + +template +struct ap_fixed_base; + +template +struct ap_fixed; + +template +struct ap_ufixed; + +template +struct af_range_ref; + +template +struct af_bit_ref; + +/// string base mode +enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; + +#ifndef SYSTEMC_INCLUDED +#define SC_BIN 2 +#define SC_OCT 8 +#define SC_DEC 10 +#define SC_HEX 16 +#endif // !defined(SYSTEMC_INCLUDED) + +// Alias C data types +#ifdef _MSC_VER +typedef signed __int64 ap_slong; +typedef unsigned __int64 ap_ulong; +#else // !defined(_MSC_VER) +typedef signed long long ap_slong; +typedef unsigned long long ap_ulong; +#endif // !defined(_MSC_VER) + +enum { + _AP_SIZE_char = 8, + _AP_SIZE_short = sizeof(short) * 8, + _AP_SIZE_int = sizeof(int) * 8, + _AP_SIZE_long = sizeof(long) * 8, + _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 +}; + +#endif // !defined(__AP_DECL_H__) + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed.h new file mode 100644 index 00000000..cd0192bc --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed.h @@ -0,0 +1,360 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_H__ +#define __AP_FIXED_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Signed Arbitrary Precision Fixed-Point Type. 
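// NOTE (editorial annotation, not part of the vendored header): ap_fixed
// below and ap_ufixed further down differ only in the signedness flag they
// pass to ap_fixed_base.  For orientation, the value grids they describe:
//
//   ap_fixed<8, 3>   // step 2^-5, range [-4, 4 - 2^-5]
//   ap_ufixed<8, 3>  // step 2^-5, range [ 0, 8 - 2^-5]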
+// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_fixed() : Base() {} + + /// default copy ctor + INLINE ap_fixed(const ap_fixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base. + template + INLINE ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //// from ap_fixed + //template + //INLINE ap_fixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_fixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //// from ap_ufixed. + //template + //INLINE ap_fixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_fixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. + template + INLINE ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //// from ap_int. + //template + //INLINE ap_fixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //// from ap_uint. + //template + //INLINE ap_fixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + // from ap_bit_ref. + template + INLINE ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_range_ref. + template + INLINE ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_concat_ref. + template + INLINE ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + // from af_bit_ref. + template + INLINE ap_fixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + // from af_range_ref. + template + INLINE ap_fixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +// from c types. +#define CTOR(TYPE) \ + INLINE ap_fixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_fixed(const char* s) : Base(s) {} + + INLINE ap_fixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + // The assignment operator is technically inherited; however, it is always + // hidden by an explicitly or implicitly defined assignment operator for the + // derived class. + /* XXX ctor will be used when right is not of proper type. 
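     (Editorial note: for instance, `a = 5` on an ap_fixed<8, 3> first builds
     a temporary ap_fixed from 5 through the CTOR(int) converting constructor
     above, and only then runs the same-type operator= below.)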
*/ + INLINE ap_fixed& operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_fixed& operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_fixed. + +//------------------------------------------------------------------- + +// Unsigned Arbitrary Precision Fixed-Point Type. +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_ufixed : ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_ufixed() : Base() {} + + /// default copy ctor + INLINE ap_ufixed(const ap_ufixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //template + //INLINE ap_ufixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_ufixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. 
+ template + INLINE ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //template + //INLINE ap_ufixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + template + INLINE ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_ufixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_ufixed(const char* s) : Base(s) {} + + INLINE ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + INLINE ap_ufixed& operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_ufixed& operator=( + const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, + _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_ufixed + + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +// Specialization of std containers, so that std::complex can have its +// image part automatically zero-initialized when only real part is provided. +#include + +#endif // ifndef __AP_FIXED_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_base.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_base.h new file mode 100644 index 00000000..1d94b938 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_base.h @@ -0,0 +1,2354 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_BASE_H__ +#define __AP_FIXED_BASE_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// for ap_int_base and its reference types. +#include +#ifndef __SYNTHESIS__ +#if _AP_ENABLE_HALF_ == 1 +// for half type +#include +#endif +// for std io +#include +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" +#else // __cplusplus + +// for warning on unsupported rounding mode in conversion to float/double. +#if !defined(__SYNTHESIS__) && __cplusplus >= 201103L && \ + (defined(__gnu_linux__) || defined(_WIN32)) +#define AP_FIXED_ENABLE_CPP_FENV 1 +#include +#endif + +// ---------------------------------------------------------------------- + +/* Major TODO + long double support: constructor, assign and other operators. + binary operators with ap_fixed_base and const char*. + return ap_fixed/ap_ufixed when result signedness is known. +*/ + +// Helper function in conversion to floating point types. + +#ifdef __SYNTHESIS__ +#define _AP_ctype_op_get_bit(var, index) _AP_ROOT_op_get_bit(var, index) +#define _AP_ctype_op_set_bit(var, index, x) _AP_ROOT_op_set_bit(var, index, x) +#define _AP_ctype_op_get_range(var, low, high) \ + _AP_ROOT_op_get_range(var, low, high) +#define _AP_ctype_op_set_range(var, low, high, x) \ + _AP_ROOT_op_set_range(var, low, high, x) +#else // ifdef __SYNTHESIS__ +template +inline bool _AP_ctype_op_get_bit(_Tp1& var, const _Tp2& index) { + return !!(var & (1ull << (index))); +} +template +inline _Tp1 _AP_ctype_op_set_bit(_Tp1& var, const _Tp2& index, const _Tp3& x) { + var |= (((x) ? 1ull : 0ull) << (index)); + return var; +} +template +inline _Tp1 _AP_ctype_op_get_range(_Tp1& var, const _Tp2& low, + const _Tp3& high) { + _Tp1 r = var; + ap_ulong mask = -1ll; + mask >>= (sizeof(_Tp1) * 8 - ((high) - (low) + 1)); + r >>= (low); + r &= mask; + return r; +} +template +inline _Tp1 _AP_ctype_op_set_range(_Tp1& var, const _Tp2& low, const _Tp3& high, + const _Tp4& x) { + ap_ulong mask = -1ll; + mask >>= (_AP_SIZE_ap_slong - ((high) - (low) + 1)); + var &= ~(mask << (low)); + var |= ((mask & x) << (low)); + return var; +} +#endif // ifdef __SYNTHESIS__ + + +// trait for letting base class to return derived class. +// Notice that derived class template is incomplete, and we cannot use +// the member of the derived class. +template +struct _ap_fixed_factory; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, true> { + typedef ap_fixed<_AP_W2, _AP_I2> type; +}; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, false> { + typedef ap_ufixed<_AP_W2, _AP_I2> type; +}; + +/// ap_fixed_base: AutoPilot fixed point. +/** partial specialization of signed. + @tparam _AP_W width. + @tparam _AP_I integral part width. + @tparam _AP_S signed. + @tparam _AP_Q quantization mode. Default is AP_TRN. + @tparam _AP_O saturation mode. Default is AP_WRAP. + @tparam _AP_N saturation wrap value. Default is 0. 
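   Illustrative instantiation (editorial note, not in the original header):
   ap_fixed_base<8, 3, true, AP_RND, AP_SAT> is an 8-bit signed value with
   3 integer bits that rounds toward plus infinity on quantization and
   saturates on overflow.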
+ */ +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed_base : _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + static const int width = _AP_W; + static const int iwidth = _AP_I; + static const ap_q_mode qmode = _AP_Q; + static const ap_o_mode omode = _AP_O; + + /// Return type trait. + template + struct RType { + enum { + _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + mult_w = _AP_W + _AP_W2, + mult_i = _AP_I + _AP_I2, + mult_s = _AP_S || _AP_S2, + plus_w = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + + 1 + AP_MAX(_AP_F, F2), + plus_i = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1 + + AP_MAX(_AP_F, F2), + minus_i = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, +#ifndef __SC_COMPATIBLE__ + div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0), +#else + div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), +#endif + div_i = _AP_S2 + _AP_I + F2, + div_s = _AP_S || _AP_S2, + logic_w = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + + AP_MAX(_AP_F, F2), + logic_i = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs; + typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs; + + typedef ap_fixed_base mult_base; + typedef ap_fixed_base plus_base; + typedef ap_fixed_base minus_base; + typedef ap_fixed_base logic_base; + typedef ap_fixed_base div_base; + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base; + + typedef typename _ap_fixed_factory::type mult; + typedef typename _ap_fixed_factory::type plus; + typedef typename _ap_fixed_factory::type minus; + typedef typename _ap_fixed_factory::type logic; + typedef typename _ap_fixed_factory::type div; + typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1; + }; + + private: +#ifndef __SYNTHESIS__ + // This cannot handle hex float format string. + void fromString(const std::string& val, unsigned char radix) { + _AP_ERROR(!(radix == 2 || radix == 8 || radix == 10 || radix == 16), + "ap_fixed_base::fromString(%s, %d)", val.c_str(), radix); + + Base::V = 0; + int startPos = 0; + int endPos = val.length(); + int decPos = val.find("."); + if (decPos == -1) decPos = endPos; + + // handle sign + bool isNegative = false; + if (val[0] == '-') { + isNegative = true; + ++startPos; + } else if (val[0] == '+') + ++startPos; + + // If there are no integer bits, e.g.: + // .0000XXXX, then keep at least one bit. + // If the width is greater than the number of integer bits, e.g.: + // XXXX.XXXX, then we keep the integer bits + // if the number of integer bits is greater than the width, e.g.: + // XXX000 then we keep the integer bits. + // Always keep one bit. + ap_fixed_base + integer_bits = 0; + + // Figure out if we can shift instead of multiply + unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); + + //std::cout << "\n\n" << val << "\n"; + //std::cout << startPos << " " << decPos << " " << endPos << "\n"; + + bool sticky_int = false; + + // Traverse the integer digits from the MSD, multiplying by radix as we go. 
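    // NOTE (editorial): classic positional accumulation; for "25" in
    // radix 10 the loop computes ((0 * 10) + 2) * 10 + 5 = 25.  The
    // sticky_int flag below appears to record high-order bits that are
    // about to be shifted out, so overflow is still detectable afterwards.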
+ for (int i = startPos; i < decPos; i++) { + // Get a digit + char cdigit = val[i]; + if (cdigit == '\0') continue; + unsigned digit = ap_private_ops::decode_digit(cdigit, radix); + + sticky_int |= integer_bits[AP_MAX(_AP_I, 4) + 4 - 1] | + integer_bits[AP_MAX(_AP_I, 4) + 4 - 2] | + integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | + integer_bits[AP_MAX(_AP_I, 4) + 4 - 4]; + // Shift or multiply the value by the radix + if (shift) + integer_bits <<= shift; + else + integer_bits *= radix; + + // Add in the digit we just interpreted + integer_bits += digit; + //std::cout << "idigit = " << digit << " " << integer_bits.to_string() + // << " " << sticky_int << "\n"; + } + integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] = + integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | sticky_int; + + ap_fixed_base fractional_bits = 0; + bool sticky = false; + + // Traverse the fractional digits from the LSD, dividing by radix as we go. + for (int i = endPos - 1; i >= decPos + 1; i--) { + // Get a digit + char cdigit = val[i]; + if (cdigit == '\0') continue; + unsigned digit = ap_private_ops::decode_digit(cdigit, radix); + // Add in the digit we just interpreted + fractional_bits += digit; + + sticky |= fractional_bits[0] | fractional_bits[1] | fractional_bits[2] | + fractional_bits[3]; + // Shift or divide the value by the radix + if (shift) + fractional_bits >>= shift; + else + fractional_bits /= radix; + + //std::cout << "fdigit = " << digit << " " << fractional_bits.to_string() + // << " " << sticky << "\n"; + } + + //std::cout << "Int =" << integer_bits.to_string() << " " << + // fractional_bits.to_string() << "\n"; + + fractional_bits[0] = fractional_bits[0] | sticky; + + if (isNegative) + *this = -(integer_bits + fractional_bits); + else + *this = integer_bits + fractional_bits; + + //std::cout << "end = " << this->to_string(16) << "\n"; + } + + /// report invalid constrction of ap_fixed_base + INLINE void report() { + if (!_AP_S && _AP_O == AP_WRAP_SM) { + fprintf(stderr, "ap_ufxied<...> cannot support AP_WRAP_SM.\n"); + exit(1); + } + if (_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024) { + fprintf(stderr, + "[E] ap_%sfixed<%d, ...>: Bitwidth exceeds the " + "default max value %d. Please use macro " + "AP_INT_MAX_W to set a larger max value.\n", + _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024); + exit(1); + } + } +#else + INLINE void report() {} +#endif // ifdef __SYNTHESIS__ + + /// @name helper functions. 
+ // @{ + INLINE void overflow_adjust(bool underflow, bool overflow, bool lD, + bool sign) { + if (!underflow && !overflow) return; + if (_AP_O == AP_WRAP) { + if (_AP_N == 0) return; + if (_AP_S) { + // signed AP_WRAP + // n_bits == 1 + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + if (_AP_N > 1) { + // n_bits > 1 + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + } else { + // unsigned AP_WRAP + ap_int_base<_AP_W, false> mask(-1); + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 1, mask.V); + } + } else if (_AP_O == AP_SAT_ZERO) { + Base::V = 0; + } else if (_AP_O == AP_WRAP_SM && _AP_S) { + bool Ro = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_N == 0) { + if (lD != Ro) { + Base::V = ~Base::V; + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, lD); + } + } else { + if (_AP_N == 1 && sign != Ro) { + Base::V = ~Base::V; + } else if (_AP_N > 1) { + bool lNo = _AP_ROOT_op_get_bit(Base::V, _AP_W - _AP_N); + if (lNo == sign) Base::V = ~Base::V; + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + } + } else { + if (_AP_S) { + if (overflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + Base::V = ~Base::V; + } else if (underflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + if (_AP_O == AP_SAT_SYM) Base::V |= 1; + } + } else { + if (overflow) + Base::V = ~(ap_int_base<_AP_W, false>(0).V); + else if (underflow) + Base::V = 0; + } + } + } + + INLINE bool quantization_adjust(bool qb, bool r, bool s) { + bool carry = (bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_Q == AP_TRN) return false; + if (_AP_Q == AP_RND_ZERO) + qb &= s || r; + else if (_AP_Q == AP_RND_MIN_INF) + qb &= r; + else if (_AP_Q == AP_RND_INF) + qb &= !s || r; + else if (_AP_Q == AP_RND_CONV) + qb &= _AP_ROOT_op_get_bit(Base::V, 0) || r; + else if (_AP_Q == AP_TRN_ZERO) + qb = s && (qb || r); + Base::V += qb; + return carry && (!(bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } + // @} + + public: + /// @name constructors. + // @{ + /// default ctor. + INLINE ap_fixed_base() {} + + /// copy ctor. 
+ template + INLINE ap_fixed_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + INLINE ap_fixed_base( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + INLINE ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } + + template + INLINE ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } + +#ifndef __SYNTHESIS__ +#ifndef NON_C99STRING + INLINE ap_fixed_base(const char* s, signed char rd = 0) { + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(s, radix); // will guess rd, default 10 + _AP_ERROR(radix == 0, "ap_fixed_base(const char* \"%s\", %d), str=%s, radix = %d", + s, rd, str.c_str(), radix); // TODO remove this check + fromString(str, radix); + } +#else + INLINE ap_fixed_base(const char* s, signed char rd = 10) { + ap_int_base<_AP_W, _AP_S> t(s, rd); + Base::V = t.V; + } +#endif // ifndef NON_C99STRING +#else // ifndef __SYNTHESIS__ + // XXX _ssdm_string2bits only takes const string and const radix. + // It seems XFORM will do compile time processing of the string. + INLINE ap_fixed_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, _AP_C99); + Base::V = t; + } + INLINE ap_fixed_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, _AP_C99); + Base::V = t; + } +#endif // ifndef __SYNTHESIS__ else + + template + INLINE ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + *this = ((bool)op); + report(); + } + + template + INLINE ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + template + INLINE ap_fixed_base( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) { + *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op)); + report(); + } + + template + INLINE ap_fixed_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (bool(op)); + report(); + } + + template + INLINE ap_fixed_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + // ctors from c types. + // make a temp ap_fixed_base first, and use ap_fixed_base.operator= +#define CTOR_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_fixed_base(const C_TYPE x) { \ + ap_fixed_base<(_AP_W2), (_AP_W2), (_AP_S2)> tmp; \ + tmp.V = x; \ + *this = tmp; \ + } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT +/* + * TODO: + *Theere used to be several funtions which were AP_WEAK. 
+ *Now they're all INLINE expect ap_fixed_base(double d) + *Maybe we can use '#pragma HLS inline' instead of INLINE. + */ + AP_WEAK ap_fixed_base(double d) { + ap_int_base<64, false> ireg; + ireg.V = doubleToRawBits(d); + bool isneg = _AP_ROOT_op_get_bit(ireg.V, 63); + + ap_int_base exp; + ap_int_base exp_tmp; + exp_tmp.V = + _AP_ROOT_op_get_range(ireg.V, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1); + exp = exp_tmp - DOUBLE_BIAS; + ap_int_base man; + man.V = _AP_ROOT_op_get_range(ireg.V, 0, DOUBLE_MAN - 1); + // do not support NaN + _AP_WARNING(exp == APFX_IEEE_DOUBLE_E_MAX + 1 && man.V != 0, + "assign NaN to fixed point value"); + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + if (isneg) man = -man; + if ((ireg.V & 0x7fffffffffffffffLL) == 0) { + Base::V = 0; + } else { + int _AP_W2 = DOUBLE_MAN + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2; + bool _AP_S2 = true, + QUAN_INC = F2 > _AP_F && + !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + bool carry = false; + // handle quantization + unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2; + if (F2 == _AP_F) + Base::V = man.V; + else if (F2 > _AP_F) { + if (sh_amt < DOUBLE_MAN + 2) + Base::V = man.V >> sh_amt; + else { + Base::V = isneg ? -1 : 0; + } + if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) { + bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)_AP_ROOT_op_get_bit( + man.V, F2 - _AP_F - 1); + bool r = + (F2 > _AP_F + 1) + ? _AP_ROOT_op_get_range(man.V, 0, (F2 - _AP_F - 2 < _AP_W2) + ? (F2 - _AP_F - 2) + : (_AP_W2 - 1)) != 0 + : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { // no quantization + Base::V = man.V; + if (sh_amt < _AP_W) + Base::V = Base::V << sh_amt; + else + Base::V = 0; + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || + (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true; + bool neg_src = isneg; + bool lD = false; + int pos1 = F2 - _AP_F + _AP_W; + int pos2 = F2 - _AP_F + _AP_W + 1; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (pos1 < _AP_W2 && pos1 >= 0) + // lD = _AP_ROOT_op_get_bit(man.V, pos1); + lD = (man.V >> pos1) & 1; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base Range2; + ap_int_base all_ones(-1); + + if (pos2 >= 0 && pos2 < _AP_W2) { + // Range2.V = _AP_ROOT_op_get_range(man.V, + // pos2, _AP_W2 - 1); + Range2.V = man.V; + Range2.V >>= pos2; + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) + Range2_all_ones = false; + if (pos1 >= 0 && pos2 < _AP_W2) { + Range1_all_ones = Range2_all_ones && lD; + Range1_all_zeros = !Range2.V && !lD; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !man.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? 
_AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + overflow_adjust(underflow, overflow, lD, neg_src); + } + } + report(); + } + + // TODO more optimized implementation. + INLINE ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } + +#if _AP_ENABLE_HALF_ == 1 + // TODO more optimized implementation. + INLINE ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } +#endif + // @} + + /// @name assign operator + /// assign, using another ap_fixed_base of same template parameters. + /* + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + */ + + template + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + + const int _AP_F = _AP_W - _AP_I; + const int F2 = _AP_W2 - _AP_I2; + const int QUAN_INC = + F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + + if (!op) Base::V = 0; + bool carry = false; + bool signbit = _AP_ROOT_op_get_bit(op.V, _AP_W2 - 1); + bool isneg = signbit && _AP_S2; + if (F2 == _AP_F) + Base::V = op.V; + else if (F2 > _AP_F) { + unsigned int sh_amt = F2 - _AP_F; + // moves bits right, handle quantization. + if (sh_amt < _AP_W2) { + Base::V = op.V >> sh_amt; + } else { + Base::V = isneg ? -1 : 0; + } + if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { + bool qbit = _AP_ROOT_op_get_bit(op.V, F2 - _AP_F - 1); + // bit after LSB. + bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; + enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; + // bits after qb. + bool r = (F2 > _AP_F + 1) ? (_AP_ROOT_op_get_range(op.V, 0, hi) != 0) : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { + unsigned sh_amt = _AP_F - F2; + // moves bits left, no quantization + if (sh_amt < _AP_W) { + if (_AP_W > _AP_W2) { + // extend and then shift, avoid losing bits. + Base::V = op.V; + Base::V <<= sh_amt; + } else { + // shift and truncate. + Base::V = op.V << sh_amt; + } + } else { + Base::V = 0; + } + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry; + bool deleted_ones = true; + bool neg_src = isneg; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; + bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? _AP_ROOT_op_get_bit(op.V, pos1) + : false; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base<_AP_W2, false> all_ones(-1); + + if (pos2 < _AP_W2 && pos2 >= 0) { + ap_int_base<_AP_W2, false> Range2; + Range2.V = _AP_ROOT_op_get_range(op.V, pos2, _AP_W2 - 1); + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) { + Range2_all_ones = false; + } + + if (pos1 >= 0 && pos2 < _AP_W2) { + ap_int_base<_AP_W2, false> Range1; + Range1.V = _AP_ROOT_op_get_range(op.V, pos1, _AP_W2 - 1); + Range1_all_ones = Range1 == (all_ones >> pos1); + Range1_all_zeros = !Range1.V; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !op.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? 
Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + + overflow_adjust(underflow, overflow, lD, neg_src); + } + return *this; + } // operator= + + template + INLINE ap_fixed_base& operator=( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(const_cast&>(op)); + return *this; + } + + /// Set this ap_fixed_base with ULL. + INLINE ap_fixed_base& setBits(ap_ulong bv) { + // TODO when ull is not be long enough... + Base::V = bv; + return *this; + } + + /// Return a ap_fixed_base object whose this->V is assigned by bv. + static INLINE ap_fixed_base bitsToFixed(ap_ulong bv) { + // TODO fix when ull is not be long enough... + ap_fixed_base t; +#ifdef __SYNTHESIS__ + t.V = bv; +#else + t.V.set_bits(bv); +#endif + return t; + } + + // Explicit conversion functions to ap_int_base. + /** Captures all integer bits, in truncate mode. + * @param[in] Cnative follow conversion from double to int. + */ + INLINE ap_int_base to_ap_int_base( + bool Cnative = true) const { + ap_int_base ret; + if (_AP_I == 0) { + ret.V = 0; + } else if (_AP_I > 0 && _AP_I <= _AP_W) { + ret.V = _AP_ROOT_op_get_range(Base::V, _AP_W - _AP_I, _AP_W - 1); + } else if (_AP_I > _AP_W) { + ret.V = _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 1); + ret.V <<= (_AP_I - _AP_W); + } + /* Consider the following case + * float f = -7.5f; + * ap_fixed<8,4> t = f; // -8 0 0 0 . 0.5 + * int i = t.to_int(); + * the result should be -7 instead of -8. + * Therefore, after truncation, the value should be increated by 1. + * For (-1, 0), carry to MSB will happen, but result 0 is still correct. + */ + if (Cnative && _AP_I < _AP_W) { + // Follow C native data type, conversion from double to int + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1) && (_AP_I < _AP_W) && + (_AP_ROOT_op_get_range( + Base::V, 0, _AP_I < 0 ? _AP_W - 1 : _AP_W - _AP_I - 1) != 0)) + ++ret; + } else { + // Follow OSCI library, conversion from sc_fixed to sc_int + } + return ret; + }; + + public: + template + INLINE operator ap_int_base<_AP_W2, _AP_S2>() const { + return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base()); + } + + // Explicit conversion function to C built-in integral type. + INLINE char to_char() const { return to_ap_int_base().to_char(); } + + INLINE int to_int() const { return to_ap_int_base().to_int(); } + + INLINE unsigned to_uint() const { return to_ap_int_base().to_uint(); } + + INLINE ap_slong to_int64() const { return to_ap_int_base().to_int64(); } + + INLINE ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); } + + /// covert function to double. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE double to_double() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = DOUBLE_MAN + DOUBLE_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. 
+ else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + DOUBLE_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - DOUBLE_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ull is at least 64-bit + ap_ulong m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) + : (ap_ulong)(tmp.V << (1 - lsb_index)); + } else { + m = (ap_ulong)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + //std::cout << '\n' << std::hex << m << '\n'; // TODO delete this + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, DOUBLE_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + //std::cout << m << '\n'; // TODO delete this + m = _AP_ctype_op_set_range(m, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1, e); + //std::cout << std::hex << m << std::dec << std::endl; // TODO delete this + // cast to fp + return rawBitsToDouble(m); + } + + /// convert function to float. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE float to_float() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = FLOAT_MAN + FLOAT_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + FLOAT_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - FLOAT_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ul is at least 32-bit + unsigned long m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) + : (unsigned long)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned long)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, FLOAT_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + m = _AP_ctype_op_set_range(m, FLOAT_MAN, FLOAT_MAN + FLOAT_EXP - 1, e); + // cast to fp + return rawBitsToFloat(m); + } + +#if _AP_ENABLE_HALF_ == 1 + /// convert function to half. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE half to_half() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = HALF_MAN + HALF_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. 
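    // NOTE (editorial): with l leading zeros, the highest set bit sits at
    // position _AP_W - 1 - l and carries weight 2^(_AP_I - l - 1), which is
    // the unbiased exponent that HALF_BIAS is added to on the next line.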
+ int e = _AP_I - l - 1 + HALF_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - HALF_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // short is at least 16-bit + unsigned short m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1)) + : (unsigned short)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned short)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, HALF_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + m = _AP_ctype_op_set_range(m, HALF_MAN, HALF_MAN + HALF_EXP - 1, e); + // cast to fp + return rawBitsToHalf(m); + } +#endif + + // FIXME inherited from old code, this may loose precision! + INLINE operator long double() const { return (long double)to_double(); } + + INLINE operator double() const { return to_double(); } + + INLINE operator float() const { return to_float(); } + +#if _AP_ENABLE_HALF_ == 1 + INLINE operator half() const { return to_half(); } +#endif + + INLINE operator bool() const { return (bool)Base::V != 0; } + + INLINE operator char() const { return (char)to_int(); } + + INLINE operator signed char() const { return (signed char)to_int(); } + + INLINE operator unsigned char() const { return (unsigned char)to_uint(); } + + INLINE operator short() const { return (short)to_int(); } + + INLINE operator unsigned short() const { return (unsigned short)to_uint(); } + + INLINE operator int() const { return to_int(); } + + INLINE operator unsigned int() const { return to_uint(); } + +// FIXME don't assume data width... +#ifdef __x86_64__ + INLINE operator long() const { return (long)to_int64(); } + + INLINE operator unsigned long() const { return (unsigned long)to_uint64(); } +#else + INLINE operator long() const { return (long)to_int(); } + + INLINE operator unsigned long() const { return (unsigned long)to_uint(); } +#endif // ifdef __x86_64__ else + + INLINE operator ap_ulong() const { return to_uint64(); } + + INLINE operator ap_slong() const { return to_int64(); } + + INLINE int length() const { return _AP_W; }; + + // bits_to_int64 deleted. +#ifndef __SYNTHESIS__ + // Used in autowrap, when _AP_W < 64. + INLINE ap_ulong bits_to_uint64() const { + return (Base::V).to_uint64(); + } +#endif + + // Count the number of zeros from the most significant bit + // to the first one bit. Note this is only for ap_fixed_base whose + // _AP_W <= 64, otherwise will incur assertion. + INLINE int countLeadingZeros() { +#ifdef __SYNTHESIS__ + // TODO: used llvm.ctlz intrinsic ? + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1ULL); + t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); + return __builtin_ctz(t.V); + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); + return __builtin_ctzll(t.V); + } else { + enum {__N = (_AP_W + 63) / 64}; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 
0 : __builtin_clzll(t.V); + hitNonZero |= (t != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return Base::V.countLeadingZeros(); +#endif + } + + // Arithmetic : Binary + // ------------------------------------------------------------------------- + template + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r, t; + r.V = Base::V; + t.V = op2.V; + r.V *= op2.V; + return r; + } + + // multiply function deleted. + + template + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r; +#ifndef __SYNTHESIS__ + enum {F2 = _AP_W2-_AP_I2, + _W1=AP_MAX(_AP_W + AP_MAX(F2, 0) + ((_AP_S2 && !_AP_S) ? 1 : 0), _AP_W2 + ((_AP_S && !_AP_S2) ? 1 : 0))}; + ap_int_base<_W1,_AP_S||_AP_S2> dividend,divisior; + ap_int_base<_W1,_AP_S> tmp1; + ap_int_base<_W1,_AP_S2> tmp2; + tmp1.V = Base::V; + tmp1.V <<= AP_MAX(F2,0); + tmp2.V = op2.V; + dividend = tmp1; + divisior = tmp2; + r.V = ((_AP_S||_AP_S2) ? dividend.V.sdiv(divisior.V): dividend.V.udiv(divisior.V)); +#else + #ifndef __SC_COMPATIBLE__ + ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0),_AP_I, _AP_S> t(*this); + #else + ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0) + AP_MAX(_AP_I2, 0),_AP_I, _AP_S> t(*this); + #endif + r.V = t.V / op2.V; +#endif +/* + enum { + F2 = _AP_W2 - _AP_I2, + shl = AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), +#ifndef __SC_COMPATIBLE__ + shr = AP_MAX(_AP_I2, 0), +#else + shr = 0, +#endif + W3 = _AP_S2 + _AP_W + shl, + S3 = _AP_S || _AP_S2, + }; + ap_int_base dividend, t; + dividend.V = Base::V; + // multiply both by (1 << F2), and than do integer division. + dividend.V <<= (int) shl; +#ifdef __SYNTHESIS__ + // .V's have right signedness, and will have right extending. + t.V = dividend.V / op2.V; +#else + // XXX op2 may be wider than dividend, and sdiv and udiv takes the same with + // as left hand operand, so data might be truncated by mistake if not + // handled here. + t.V = S3 ? dividend.V.sdiv(op2.V) : dividend.V.udiv(op2.V); +#endif + r.V = t.V >> (int) shr; +*/ + return r; + } + +#define OP_BIN_AF(Sym, Rty) \ + template \ + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty operator Sym( \ + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ + op2) const { \ + typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty##_base ret, lhs(*this), \ + rhs(op2); \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + + OP_BIN_AF(+, plus) + OP_BIN_AF(-, minus) + OP_BIN_AF(&, logic) + OP_BIN_AF(|, logic) + OP_BIN_AF(^, logic) + +// Arithmetic : assign +// ------------------------------------------------------------------------- +#define OP_ASSIGN_AF(Sym) \ + template \ + INLINE ap_fixed_base& operator Sym##=( \ + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ + op2) { \ + *this = operator Sym(op2); \ + return *this; \ + } + + OP_ASSIGN_AF(*) + OP_ASSIGN_AF(/) + OP_ASSIGN_AF(+) + OP_ASSIGN_AF(-) + OP_ASSIGN_AF(&) + OP_ASSIGN_AF(|) + OP_ASSIGN_AF(^) + + // Prefix and postfix increment and decrement. 
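  // NOTE (editorial): the operators below step by an exact 1, not by one
  // LSB; the operand is a 1-integer-bit unsigned fixed-point value holding
  // 1, so for example:
  //
  //   ap_fixed<8, 3> v = 1.5;
  //   ++v;                        // v == 2.5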
+ // ------------------------------------------------------------------------- + + /// Prefix increment + INLINE ap_fixed_base& operator++() { + operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); + return *this; + } + + /// Prefix decrement. + INLINE ap_fixed_base& operator--() { + operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); + return *this; + } + + /// Postfix increment + INLINE const ap_fixed_base operator++(int) { + ap_fixed_base r(*this); + operator++(); + return r; + } + + /// Postfix decrement + INLINE const ap_fixed_base operator--(int) { + ap_fixed_base r(*this); + operator--(); + return r; + } + + // Unary arithmetic. + // ------------------------------------------------------------------------- + INLINE ap_fixed_base operator+() { return *this; } + + INLINE ap_fixed_base<_AP_W + 1, _AP_I + 1, true> operator-() const { + ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this); + r.V = -r.V; + return r; + } + + INLINE ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() { + ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this); + r.V = -r.V; + return r; + } + + // Not (!) + // ------------------------------------------------------------------------- + INLINE bool operator!() const { return Base::V == 0; } + + // Bitwise complement + // ------------------------------------------------------------------------- + // XXX different from Mentor's ac_fixed. + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const { + ap_fixed_base<_AP_W, _AP_I, _AP_S> r; + r.V = ~Base::V; + return r; + } + + // Shift + // ------------------------------------------------------------------------- + // left shift is the same as moving point right, i.e. increate I. + template + INLINE ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const { + ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r; + r.V = Base::V; + return r; + } + + template + INLINE ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const { + ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r; + r.V = Base::V; + return r; + } + + // Because the return type is the type of the the first operand, shift assign + // operators do not carry out any quantization or overflow + // While systemc, shift assigns for sc_fixed/sc_ufixed will result in + // quantization or overflow (depending on the mode of the first operand) + INLINE ap_fixed_base operator<<(unsigned int sh) const { + ap_fixed_base r; + r.V = Base::V << sh; +// TODO check shift overflow? +#ifdef __SC_COMPATIBLE__ + if (sh == 0) return r; + if (_AP_O != AP_WRAP || _AP_N != 0) { + bool neg_src = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + bool allones, allzeros; + ap_int_base<_AP_W, false> ones(-1); + if (sh <= _AP_W) { + ap_int_base<_AP_W, false> range1; + range1.V = _AP_ROOT_op_get_range( + const_cast(this)->Base::V, _AP_W - sh, _AP_W - 1); + allones = range1 == (ones >> (_AP_W - sh)); + allzeros = range1 == 0; + } else { + allones = false; + allzeros = Base::V == 0; + } + bool overflow = !allzeros && !neg_src; + bool underflow = !allones && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? _AP_ROOT_op_get_range(r.V, 0, _AP_W - 2) == 0 : true); + bool lD = false; + if (sh < _AP_W) lD = _AP_ROOT_op_get_bit(Base::V, _AP_W - sh - 1); + r.overflow_adjust(underflow, overflow, lD, neg_src); + } +#endif + return r; + } + + INLINE ap_fixed_base operator>>(unsigned int sh) const { + ap_fixed_base r; + r.V = Base::V >> sh; +// TODO check shift overflow? 
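    // NOTE (editorial): outside __SC_COMPATIBLE__ this is a plain
    // arithmetic shift; bits simply fall off the LSB end with no
    // quantization, e.g. for ap_fixed<8, 3> a = -0.75 (pattern 111.01000),
    // a >> 1 gives -0.375 (pattern 111.10100).  The block below restores
    // sc_fixed-style rounding only in SystemC-compatibility builds.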
+#ifdef __SC_COMPATIBLE__ + if (sh == 0) return r; + if (_AP_Q != AP_TRN) { + bool qb = false; + if (sh <= _AP_W) qb = _AP_ROOT_op_get_bit(Base::V, sh - 1); + bool rb = false; + if (sh > 1 && sh <= _AP_W) + rb = _AP_ROOT_op_get_range(const_cast(this)->Base::V, 0, + sh - 2) != 0; + else if (sh > _AP_W) + rb = Base::V != 0; + r.quantization_adjust(qb, rb, + _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } +#endif + return r; + } + + // left and right shift for int + INLINE ap_fixed_base operator<<(int sh) const { + ap_fixed_base r; + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator>>(ush); + } else { + return operator<<(ush); + } + } + + INLINE ap_fixed_base operator>>(int sh) const { + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator<<(ush); + } else { + return operator>>(ush); + } + } + + // left and right shift for ap_int. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { + // TODO the code seems not optimal. ap_fixed<8,8> << ap_int<2> needs only a + // small mux, but integer need a big one! + int sh = op2.to_int(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { + int sh = op2.to_int(); + return operator>>(sh); + } + + // left and right shift for ap_uint. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator>>(sh); + } + + // left and right shift for ap_fixed + template + INLINE ap_fixed_base operator<<( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator<<(op2.to_ap_int_base()); + } + + template + INLINE ap_fixed_base operator>>( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator>>(op2.to_ap_int_base()); + } + + // Shift assign. + // ------------------------------------------------------------------------- + + // left shift assign. + INLINE ap_fixed_base& operator<<=(const int sh) { + *this = operator<<(sh); + return *this; + } + + INLINE ap_fixed_base& operator<<=(const unsigned int sh) { + *this = operator<<(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + // right shift assign. + INLINE ap_fixed_base& operator>>=(const int sh) { + *this = operator>>(sh); + return *this; + } + + INLINE ap_fixed_base& operator>>=(const unsigned int sh) { + *this = operator>>(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator>>(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator>>(sh.to_int()); + return *this; + } + +// Comparisons. 
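// NOTE (editorial): the macro below first aligns both operands to the
// wider fractional format before comparing raw bits, so values compare by
// numeric value rather than by representation:
//
//   ap_fixed<8, 4>(1.5) == ap_fixed<8, 2>(1.5)   // true despite formats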
+// ------------------------------------------------------------------------- +#define OP_CMP_AF(Sym) \ + template \ + INLINE bool operator Sym(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, \ + _AP_O2, _AP_N2>& op2) const { \ + enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; \ + if (_AP_F == F2) \ + return Base::V Sym op2.V; \ + else if (_AP_F > F2) \ + return Base::V Sym ap_fixed_base(op2).V; \ + else \ + return ap_fixed_base(*this).V Sym op2.V; \ + return false; \ + } + + OP_CMP_AF(>) + OP_CMP_AF(<) + OP_CMP_AF(>=) + OP_CMP_AF(<=) + OP_CMP_AF(==) + OP_CMP_AF(!=) +// FIXME: Move compare with double out of struct ap_fixed_base defination +// and combine it with compare operator(double, ap_fixed_base) +#define DOUBLE_CMP_AF(Sym) \ + INLINE bool operator Sym(double d) const { return to_double() Sym d; } + + DOUBLE_CMP_AF(>) + DOUBLE_CMP_AF(<) + DOUBLE_CMP_AF(>=) + DOUBLE_CMP_AF(<=) + DOUBLE_CMP_AF(==) + DOUBLE_CMP_AF(!=) + + // Bit and Slice Select + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + unsigned index) { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + _AP_WARNING(index < 0, "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + INLINE bool operator[](unsigned index) const { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, index); + } + + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + unsigned index) { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + _AP_WARNING(index < 0, "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + INLINE bool bit(unsigned index) const { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + const ap_int_base<_AP_W2, true>& index) { + _AP_WARNING(index < _AP_I - _AP_W, + "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index.to_int() + _AP_W - _AP_I); + } + + INLINE bool get_bit(int index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index + _AP_W - _AP_I); + } +#if 0 + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + int index) { + _AP_WARNING(index < _AP_I - _AP_W, + "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index + _AP_W - _AP_I); + } +#endif + + template + 
INLINE bool get_bit(const ap_int_base<_AP_W2, true>& index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index.to_int() + _AP_W - _AP_I); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, + int Lo) { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + int Hi, int Lo) const { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + const_cast(this), Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE bool is_zero() const { return Base::V == 0; } + + INLINE bool is_neg() const { + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) return true; + return false; + } + + INLINE int wl() const { return _AP_W; } + + INLINE int iwl() const { return _AP_I; } + + INLINE ap_q_mode q_mode() const { return _AP_Q; } + + INLINE ap_o_mode o_mode() const { return _AP_O; } + + INLINE int n_bits() const { return _AP_N; } + + // print a string representation of this number in the given radix. + // Radix support is 2, 8, 10, or 16. + // The result will include a prefix indicating the radix, except for decimal, + // where no prefix is needed. The default is to output a signed representation + // of signed numbers, or an unsigned representation of unsigned numbers. For + // non-decimal formats, this can be changed by the 'sign' argument. 
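+  // Illustrative examples (added annotation, not from the original comment;
+  // values assume the default AP_TRN/AP_WRAP ap_fixed policies):
+  //   ap_fixed<8, 4>(2.75).to_string(10) is expected to yield "2.75", and
+  //   ap_fixed<8, 4>(2.75).to_string(16) something like "0x2.cp0" -- the
+  //   trailing "p0" exponent is appended below because C99 hexadecimal
+  //   constants require one.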
+#ifndef __SYNTHESIS__ + std::string to_string(unsigned char radix = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (radix == 2) sign = false; + + std::string str; + str.clear(); + char step = 0; + bool isNeg = sign && (Base::V < 0); + + // Extend to take care of the -MAX case. + ap_fixed_base<_AP_W + 1, _AP_I + 1> tmp(*this); + if (isNeg) { + tmp = -tmp; + str += '-'; + } + std::string prefix; + switch (radix) { + case 2: + prefix = "0b"; + step = 1; + break; + case 8: + prefix = "0o"; + step = 3; + break; + case 16: + prefix = "0x"; + step = 4; + break; + default: + break; + } + + if (_AP_I > 0) { + // Note we drop the quantization and rounding flags here. The + // integer part is always in range, and the fractional part we + // want to drop. Also, the number is always positive, because + // of the absolute value above. + ap_int_base int_part; + // [1] [ I ] d [ W - I ] + // | | | + // | W-I 0 + // W + int_part.V = _AP_ROOT_op_get_range( + tmp.V, _AP_W - _AP_I, _AP_W); + str += int_part.to_string(radix, false); + } else { + str += prefix; + str += '0'; + } + + ap_fixed_base frac_part = tmp; + + if (radix == 10) { + if (frac_part != 0) { + str += "."; + while (frac_part != 0) { + char digit = (frac_part * radix).to_char(); + str += static_cast(digit + '0'); + frac_part *= radix; + } + } + } else { + if (frac_part != 0) { + str += "."; + for (signed i = _AP_W - _AP_I - 1; i >= 0; i -= step) { + char digit = frac_part.range(i, AP_MAX(0, i - step + 1)).to_char(); + // If we have a partial bit pattern at the end, then we need + // to put it in the high-order bits of 'digit'. + int offset = AP_MIN(0, i - step + 1); + digit <<= -offset; + str += digit < 10 ? static_cast(digit + '0') + : static_cast(digit - 10 + 'a'); + } + if (radix == 16) + str += "p0"; // C99 Hex constants are required to have an exponent. + } + } + return str; + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(unsigned char radix = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_fixed_base. 
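+
+// Illustrative usage sketch (not part of the original header; assumes the
+// public ap_fixed wrapper from ap_fixed.h and the default AP_TRN/AP_WRAP
+// policies):
+//
+//   ap_fixed<8, 4> a = 2.25;              // stored as 0b0010.0100
+//   a = a << 1;                           // 4.5 -- the <8,4> format is kept
+//   a = a >> 3;                           // 0.5625 (= 0b0000.1001)
+//   bool lt = a < ap_fixed<16, 8>(0.75);  // true: OP_CMP_AF aligns the
+//                                         // fractional widths before comparing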
+ +template +INLINE void b_not( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + ret.V = ~op.V; +} + +template +INLINE void b_and( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V & op2.V; +} + +template +INLINE void b_or( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V | op2.V; +} + +template +INLINE void b_xor( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V ^ op2.V; +} + +template +INLINE void neg( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2, + _AP_N2> + t; + t.V = -op.V; + ret = t; +} + +template +INLINE void lshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F2 = _AP_W2 - _AP_I2, + _AP_I3 = AP_MAX(_AP_I, _AP_I2), + _AP_W3 = _AP_I3 + F2, + }; + // wide buffer + ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; + t.V = op.V; + t.V <<= i; // FIXME overflow? + // handle quantization and overflow + ret = t; +} + +template +INLINE void rshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + F3 = AP_MAX(F, F2), + _AP_W3 = _AP_I2 + F3, + sh = F - F2, + }; + // wide buffer + ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; + t.V = op.V; + if (sh >= 0) + t.V <<= (int) sh; + t.V >>= i; + // handle quantization and overflow + ret = t; +} + +//// FIXME +//// These partial specialization ctors allow code like +//// char c = 'a'; +//// ap_fixed_base<8, 8, true> x(c); +//// but what bout ap_fixed_base<9, 9, true> y(c) ? 
+// + +#ifndef __SYNTHESIS__ +INLINE std::string scientificFormat(std::string& input) { + if (input.length() == 0) return input; + + size_t decPosition = input.find('.'); + if (decPosition == std::string::npos) decPosition = input.length(); + + size_t firstNonZeroPos = 0; + for (; input[firstNonZeroPos] > '9' || input[firstNonZeroPos] < '1'; + firstNonZeroPos++) + ; + + int exp; + if (firstNonZeroPos > decPosition) + exp = decPosition - firstNonZeroPos; + else + exp = decPosition - firstNonZeroPos - 1; + std::string expString = ""; + if (exp == 0) + ; + else if (exp < 0) { + expString += "e-"; + exp = -exp; + } else + expString += "e+"; + + if (exp < 10 && exp > 0) { + expString += '0'; + expString += (char)('0' + exp); + } else if (exp != 0) { + std::string tmp; + + std::ostringstream oss; + oss << exp; + + tmp = oss.str(); + expString += tmp; + } + + int lastNonZeroPos = (int)(input.length() - 1); + for (; lastNonZeroPos >= 0; --lastNonZeroPos) + if (input[lastNonZeroPos] <= '9' && input[lastNonZeroPos] > '0') break; + + std::string ans = ""; + ans += input[firstNonZeroPos]; + if (firstNonZeroPos != (size_t)lastNonZeroPos) { + ans += '.'; + for (int i = firstNonZeroPos + 1; i <= lastNonZeroPos; i++) + if (input[i] != '.') ans += input[i]; + } + + ans += expString; + return ans; +} + +INLINE std::string reduceToPrecision(std::string& input, int precision) { + bool isZero = true; + size_t inputLen = input.length(); + for (size_t i = 0; i < inputLen && isZero; i++) + if (input[i] != '.' && input[i] != '0') isZero = false; + if (isZero) return "0"; + + // Find the first valid number, skip '-' + int FirstNonZeroPos = 0; + int LastNonZeroPos = (int)inputLen - 1; + int truncBitPosition = 0; + size_t decPosition = input.find('.'); + for (; input[FirstNonZeroPos] < '1' || input[FirstNonZeroPos] > '9'; + FirstNonZeroPos++) + ; + + for (; input[LastNonZeroPos] < '1' || input[LastNonZeroPos] > '9'; + LastNonZeroPos--) + ; + + if (decPosition == std::string::npos) decPosition = inputLen; + // Count the valid number, to decide whether we need to truncate + if ((int)decPosition > LastNonZeroPos) { + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision; + } else if ((int)decPosition < FirstNonZeroPos) { // This is pure decimal + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) { + if (FirstNonZeroPos - decPosition - 1 < 4) { + return input; + } else { + if (input[0] == '-') { + std::string tmp = input.substr(1, inputLen - 1); + return std::string("-") + scientificFormat(tmp); + } else + return scientificFormat(input); + } + } + truncBitPosition = FirstNonZeroPos + precision; + } else { + if (LastNonZeroPos - FirstNonZeroPos <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision + 1; + } + + // duplicate the input string, we want to add "0" before the valid numbers + // This is easy for quantization, since we may change 9999 to 10000 + std::string ans = ""; + std::string dupInput = "0"; + if (input[0] == '-') { + ans += '-'; + dupInput += input.substr(1, inputLen - 1); + } else { + dupInput += input.substr(0, inputLen); + ++truncBitPosition; + } + + // Add 'carry' after truncation, if necessary + bool carry = dupInput[truncBitPosition] > '4'; + for (int i = truncBitPosition - 1; i >= 0 && carry; i--) { + if (dupInput[i] == '.') continue; + if (dupInput[i] == '9') + dupInput[i] = '0'; + else { + ++dupInput[i]; + carry = false; + } + } + + // bits outside precision range should be set to 0 + if (dupInput[0] == '1') 
+    FirstNonZeroPos = 0;
+  else {
+    FirstNonZeroPos = 0;
+    while (dupInput[FirstNonZeroPos] < '1' || dupInput[FirstNonZeroPos] > '9')
+      ++FirstNonZeroPos;
+  }
+
+  unsigned it = FirstNonZeroPos;
+  int NValidNumber = 0;
+  while (it < dupInput.length()) {
+    if (dupInput[it] == '.') {
+      ++it;
+      continue;
+    }
+    ++NValidNumber;
+    if (NValidNumber > precision) dupInput[it] = '0';
+    ++it;
+  }
+
+  // Here we want to adjust the truncation position and the value
+  decPosition = dupInput.find('.');
+  if (decPosition == std::string::npos) // When this is integer
+    truncBitPosition = (int)dupInput.length();
+  else
+    for (truncBitPosition = (int)(dupInput.length() - 1); truncBitPosition >= 0;
+         --truncBitPosition) {
+      if (dupInput[truncBitPosition] == '.') break;
+      if (dupInput[truncBitPosition] != '0') {
+        truncBitPosition++;
+        break;
+      }
+    }
+
+  if (dupInput[0] == '1')
+    dupInput = dupInput.substr(0, truncBitPosition);
+  else
+    dupInput = dupInput.substr(1, truncBitPosition - 1);
+
+  decPosition = dupInput.find('.');
+  if (decPosition != std::string::npos) {
+    size_t it = 0;
+    for (it = decPosition + 1; dupInput[it] == '0'; it++)
+      ;
+    if (it - decPosition - 1 < 4) {
+      ans += dupInput;
+      return ans;
+    } else {
+      ans += scientificFormat(dupInput);
+      return ans;
+    }
+  } else if ((int)(dupInput.length()) <= precision) {
+    ans += dupInput;
+    return ans;
+  }
+
+  ans += scientificFormat(dupInput);
+  return ans;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void print(
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) {
+  if (_AP_I > 0) {
+    ap_int_base<_AP_I, _AP_S> p1;
+    p1.V = x.V >> (_AP_W - _AP_I);
+    print(p1.V); // print overload for .V should exist
+  } else {
+    printf("0");
+  }
+  printf(".");
+  if (_AP_I < _AP_W) {
+    ap_int_base<_AP_W - _AP_I, false> p2;
+    p2.V = _AP_ROOT_op_get_range(x.V, 0, _AP_W - _AP_I);
+    print(p2.V, false); // print overload for .V should exist
+  }
+}
+#endif // ifndef __SYNTHESIS__
+
+// XXX the following two functions have to exist in synthesis,
+// as some old HLS Video Library code uses the ostream overload,
+// although HLS will later delete I/O function calls.
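+// Illustrative behavior of the stream overloads below (assumed, not from the
+// original header): operator<< converts via to_string(10) and then applies
+// the stream's precision and width, so
+//   std::cout << std::setprecision(3) << ap_fixed<16, 8>(3.140625);
+// is expected to print "3.14".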
+ +/// Output streaming +//----------------------------------------------------------------------------- +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& out, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + // TODO support std::ios_base::fmtflags + unsigned width = out.width(); + unsigned precision = out.precision(); + char fill = out.fill(); + std::string str = x.to_string(10, _AP_S); + str = reduceToPrecision(str, precision); + if (width > str.length()) { + for (unsigned i = 0; i < width - str.length(); ++i) + out << fill; + } + out << str; + return out; +} +#endif // ifndef __SYNTHESIS__ + +/// Input streaming +// ----------------------------------------------------------------------------- +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>( + std::istream& in, + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + double d; + in >> d; + x = ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(d); + return in; +} +#endif +#endif // ifndef AP_AUTOCC + +/// Operators mixing Integers with ap_fixed_base +// ----------------------------------------------------------------------------- +#define AF_BIN_OP_WITH_INT_SF(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_BIN_OP_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } + +#define AF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_ASSIGN_OP_WITH_INT_SF(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define 
ALL_AF_OP_WITH_INT(C_TYPE, BITS, SIGN) \ + AF_BIN_OP_WITH_INT(+, C_TYPE, (BITS), (SIGN), plus) \ + AF_BIN_OP_WITH_INT(-, C_TYPE, (BITS), (SIGN), minus) \ + AF_BIN_OP_WITH_INT(*, C_TYPE, (BITS), (SIGN), mult) \ + AF_BIN_OP_WITH_INT(/, C_TYPE, (BITS), (SIGN), div) \ + AF_BIN_OP_WITH_INT(&, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(|, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(^, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT_SF(>>, C_TYPE, (BITS), (SIGN), lhs) \ + AF_BIN_OP_WITH_INT_SF(<<, C_TYPE, (BITS), (SIGN), lhs) \ + \ + AF_ASSIGN_OP_WITH_INT(+=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(-=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(*=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(/=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(&=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(|=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(^=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(>>=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(<<=, C_TYPE, (BITS), (SIGN)) \ + \ + AF_REL_OP_WITH_INT(>, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(>=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(==, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(!=, C_TYPE, (BITS), (SIGN)) + +ALL_AF_OP_WITH_INT(bool, 1, false) +ALL_AF_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_AF_OP_WITH_INT(signed char, 8, true) +ALL_AF_OP_WITH_INT(unsigned char, 8, false) +ALL_AF_OP_WITH_INT(short, _AP_SIZE_short, true) +ALL_AF_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_AF_OP_WITH_INT(int, _AP_SIZE_int, true) +ALL_AF_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_AF_OP_WITH_INT(long, _AP_SIZE_long, true) +ALL_AF_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_AF_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_AF_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ALL_AF_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT_SF +#undef AF_ASSIGN_OP_WITH_INT +#undef AF_ASSIGN_OP_WITH_INT_SF +#undef AF_REL_OP_WITH_INT + +/* + * ********************************************************************** + * TODO + * There is no operator defined with float/double/long double, so that + * code like + * ap_fixed<8,4> a = 1.5f; + * a += 0.5f; + * will fail in compilation. + * Operator with warning about conversion might be wanted. 
+ * ********************************************************************** + */ + +#define AF_BIN_OP_WITH_AP_INT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< \ + _AP_W, _AP_I, _AP_S>::RTYPE \ + operator BIN_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } \ + \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_AP_INT(ASSIGN_OP) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE ap_int_base<_AP_W2, _AP_S2>& operator ASSIGN_OP( \ + ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return i_op.operator ASSIGN_OP(op.to_ap_int_base()); \ + } + +AF_BIN_OP_WITH_AP_INT(+, plus) +AF_BIN_OP_WITH_AP_INT(-, minus) +AF_BIN_OP_WITH_AP_INT(*, mult) +AF_BIN_OP_WITH_AP_INT(/, div) +AF_BIN_OP_WITH_AP_INT(&, logic) +AF_BIN_OP_WITH_AP_INT(|, logic) +AF_BIN_OP_WITH_AP_INT(^, logic) + +#undef AF_BIN_OP_WITH_AP_INT + +AF_ASSIGN_OP_WITH_AP_INT(+=) +AF_ASSIGN_OP_WITH_AP_INT(-=) +AF_ASSIGN_OP_WITH_AP_INT(*=) +AF_ASSIGN_OP_WITH_AP_INT(/=) +AF_ASSIGN_OP_WITH_AP_INT(&=) +AF_ASSIGN_OP_WITH_AP_INT(|=) +AF_ASSIGN_OP_WITH_AP_INT(^=) + +#undef AF_ASSIGN_OP_WITH_AP_INT + +AF_REL_OP_WITH_AP_INT(==) +AF_REL_OP_WITH_AP_INT(!=) +AF_REL_OP_WITH_AP_INT(>) +AF_REL_OP_WITH_AP_INT(>=) +AF_REL_OP_WITH_AP_INT(<) +AF_REL_OP_WITH_AP_INT(<=) + +#undef AF_REL_OP_WITH_AP_INT + +// Relational Operators with double +template +INLINE bool operator==( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator==(op1); +} + +template +INLINE bool operator!=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator!=(op1); +} + +template +INLINE bool operator>( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<(op1); +} + +template +INLINE bool operator>=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<=(op1); +} + +template +INLINE bool operator<( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>(op1); +} + +template +INLINE bool operator<=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>=(op1); +} + +#endif // ifndef __cplusplus else + +#endif // ifndef __AP_FIXED_BASE_H__ else + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_ref.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_ref.h new file mode 100644 index 00000000..aefda0a6 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_ref.h @@ -0,0 +1,718 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_REF_H__ +#define __AP_FIXED_REF_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else +#ifndef __SYNTHESIS__ +#include +#endif +/// Proxy class, which allows bit selection to be used as both rvalue (for +/// reading) and lvalue (for writing) +template +struct af_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int d_index; + + public: + INLINE af_bit_ref( + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) { +#ifndef __SYNTHESIS__ + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.", + d_index); + _AP_WARNING(d_index >= _AP_W, "Index of bit vector (%d) out of range (%d).", + d_index, _AP_W); +#endif + } + + INLINE af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE af_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + /// convert operators. + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + /// @name assign operators + // @{ + INLINE af_bit_ref& operator=(bool val) { + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + return *this; + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. 
+ INLINE af_bit_ref& operator=(const af_bit_ref& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=(val != 0); + } + + template + INLINE af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); + } + // @} + + /// @name concatenate operators + // @{ + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + op); + } + + template + INLINE ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + /// @name comparison + // @{ + template + INLINE bool operator==( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() != op.get(); + } + // @} + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? 
"1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct af_bit_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/// Range (slice) reference. +template +struct af_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor + INLINE af_range_ref( + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// ctor from ap_fixed_base, higher and lower bound. + /** if h is less than l, the bits selected will be returned in reverse order. + */ + INLINE af_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + INLINE af_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + /// @name assign operators + // @{ + +#define ASSIGN_CTYPE_TO_AF_RANGE(DATA_TYPE) \ + INLINE af_range_ref& operator=(const DATA_TYPE val) { \ + ap_int_base<_AP_W, false> loc(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, loc.V); \ + return *this; \ + } + + ASSIGN_CTYPE_TO_AF_RANGE(bool) + ASSIGN_CTYPE_TO_AF_RANGE(char) + ASSIGN_CTYPE_TO_AF_RANGE(signed char) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned char) + ASSIGN_CTYPE_TO_AF_RANGE(short) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned short) + ASSIGN_CTYPE_TO_AF_RANGE(int) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned int) + ASSIGN_CTYPE_TO_AF_RANGE(long) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned long) + ASSIGN_CTYPE_TO_AF_RANGE(ap_slong) + ASSIGN_CTYPE_TO_AF_RANGE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_CTYPE_TO_AF_RANGE(half) +#endif + ASSIGN_CTYPE_TO_AF_RANGE(float) + ASSIGN_CTYPE_TO_AF_RANGE(double) +#undef ASSIGN_CTYPE_TO_AF_RANGE + + /// assgin using a string. XXX crucial for cosim. + INLINE af_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + // NOTE Base of other assgin operators. + template + INLINE af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + return *this; + } + + /// assign from range reference to ap_int_base. 
+  template <int _AP_W2, bool _AP_S2>
+  INLINE af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) {
+    const ap_int_base<_AP_W2, false> tmp(val);
+    return operator=(tmp);
+  }
+
+  /// assign from bit reference to ap_int_base.
+  template <int _AP_W2, bool _AP_S2>
+  INLINE af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) {
+    const ap_int_base<1, false> tmp((bool)val);
+    return operator=(tmp);
+  }
+
+  /// assign from ap_fixed_base.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE af_range_ref& operator=(
+      const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&
+          val) {
+    d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V);
+    return *this;
+  }
+
+  /// copy assign.
+  // XXX This has to be explicit, otherwise it will be deleted, as d_bv is
+  // of reference type.
+  INLINE af_range_ref& operator=(const af_range_ref& val) {
+    ap_int_base<_AP_W, false> tmp(val);
+    return operator=(tmp);
+  }
+
+  /// assign from range reference to ap_fixed_base.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE af_range_ref& operator=(
+      const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+    ap_int_base<_AP_W2, false> tmp(val);
+    return operator=(tmp);
+  }
+
+  /// assign from bit reference to ap_fixed_base.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE af_range_ref& operator=(
+      const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+    ap_int_base<1, false> tmp((bool)val);
+    return operator=(tmp);
+  }
+
+  /// assign from compound reference.
+  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+  INLINE af_range_ref& operator=(
+      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& val) {
+    const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val);
+    return operator=(tmp);
+  }
+  // @}
+
+  /// @name comparison operators with ap_range_ref.
+  // @{
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    ap_int_base<_AP_W, false> lop(*this);
+    ap_int_base<_AP_W2, false> rop(op2);
+    return lop == rop;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    return !(operator==(op2));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    ap_int_base<_AP_W, false> lop(*this);
+    ap_int_base<_AP_W2, false> rop(op2);
+    return lop < rop;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    ap_int_base<_AP_W, false> lop(*this);
+    ap_int_base<_AP_W2, false> rop(op2);
+    return lop > rop;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    return !(operator>(op2));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) {
+    return !(operator<(op2));
+  }
+  // @}
+
+  /// @name comparison operators with af_range_ref.
+ // @{ + template + INLINE bool operator==( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + INLINE bool operator!=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + INLINE bool operator>( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + INLINE bool operator<=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator>(op2)); + } + + template + INLINE bool operator>=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator<(op2)); + } + // @} + + /// @name concatenate operators. + /// @{ + /// concatenate with ap_int_base. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, op); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_concat_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(op)); + } + + /// concatenate with another af_range_ref. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &op) { + return ap_concat_ref< + _AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + /// concatenate with another af_bit_ref. 
+ template + INLINE + ap_concat_ref<_AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + _AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + INLINE operator ap_ulong() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_uint64(); + } + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE ap_int_base<_AP_W, false> to_ap_int_base() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + // used in ap_fixed_base::to_string() + INLINE char to_char() const { + return (char)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(rd); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char rd = 2) const { + return 0; + } +#endif +}; // struct af_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif +#endif // ifndef AP_AUTOCC + +#define AF_REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP bool(op); \ + } + +#define AF_REF_REL_OPS_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + AF_REF_REL_OP_WITH_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +AF_REF_REL_OPS_WITH_INT(bool, 1, false) +AF_REF_REL_OPS_WITH_INT(char, 8, CHAR_IS_SIGNED) +AF_REF_REL_OPS_WITH_INT(signed char, 8, true) +AF_REF_REL_OPS_WITH_INT(unsigned char, 8, false) +AF_REF_REL_OPS_WITH_INT(short, _AP_SIZE_short, true) +AF_REF_REL_OPS_WITH_INT(unsigned short, _AP_SIZE_short, false) +AF_REF_REL_OPS_WITH_INT(int, _AP_SIZE_int, true) +AF_REF_REL_OPS_WITH_INT(unsigned int, _AP_SIZE_int, false) +AF_REF_REL_OPS_WITH_INT(long, _AP_SIZE_long, true) +AF_REF_REL_OPS_WITH_INT(unsigned long, _AP_SIZE_long, false) +AF_REF_REL_OPS_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +AF_REF_REL_OPS_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef AF_REF_REL_OP_INT +#undef AF_REF_REL_OPS_WITH_INT + +#define AF_REF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S>& op2) { \ + return ap_int_base<_AP_W, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) {                                                   \
+    return op2 REL_OP ap_int_base<1, false>(op);                              \
+  }
+
+AF_REF_REL_OP_WITH_AP_INT(>)
+AF_REF_REL_OP_WITH_AP_INT(<)
+AF_REF_REL_OP_WITH_AP_INT(>=)
+AF_REF_REL_OP_WITH_AP_INT(<=)
+AF_REF_REL_OP_WITH_AP_INT(==)
+AF_REF_REL_OP_WITH_AP_INT(!=)
+
+#endif // ifndef __cplusplus
+
+#endif // ifndef __AP_FIXED_REF_H__
+
+// -*- cpp -*-
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_special.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_special.h
new file mode 100644
index 00000000..0f7a9f7e
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_fixed_special.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_FIXED_SPECIAL_H__
+#define __AP_FIXED_SPECIAL_H__
+
+#ifndef __AP_FIXED_H__
+#error "Only ap_fixed.h and ap_int.h can be included directly in user code."
+#endif
+
+#ifndef __SYNTHESIS__
+#include
+#include
+#endif
+// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of
+// include.
+// #include
+namespace std {
+template <typename _Tp> class complex;
+}
+
+/*
+  TODO: Modernize the code using C++11/C++14
+  1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html
+  2. move constructor
+*/
+
+namespace std {
+/*
+  Specialize std::complex so that it zero-initializes ap_fixed.
+
+  To reduce the area cost, ap_fixed is not zero initialized, just like the
+  basic types float or double. However, libstdc++ provides specializations for
+  float, double and long double that initialize the imaginary part to 0 when
+  it is not specified.
+
+  This has become a difficulty in switching legacy code from these C types to
+  ap_fixed. To ease the migration of legacy code, we have to implement
+  specialization of std::complex<> for our type.
+
+  As ap_fixed is a template, it is impossible to specialize only the methods
+  that cause default initialization of the value type in std::complex<>. An
+  explicit full specialization of the template class has to be done, covering
+  all the member functions and operators of std::complex<> as specified
+  in the standard, 26.2.4 and 26.2.5.
+*/
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+class complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> > {
+ public:
+  typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp;
+  typedef _Tp value_type;
+
+  // 26.2.4/1
+  // Constructor without argument
+  // Default initialize, so that in dataflow, the variable is only written once.
+  complex() : _M_real(_Tp()), _M_imag(_Tp()) {}
+  // Constructor with ap_fixed.
+  // Zero-initialize the imaginary part when it is not specified, so that
+  // `C(1) == C(1,0)`.
+  complex(const _Tp &__r, const _Tp &__i = _Tp(0))
+      : _M_real(__r), _M_imag(__i) {}
+
+  // Constructor with another complex number
+  template <typename _Up>
+  complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {}
+
+#if __cplusplus >= 201103L
+  const _Tp& real() const { return _M_real; }
+  const _Tp& imag() const { return _M_imag; }
+#else
+  _Tp& real() { return _M_real; }
+  const _Tp& real() const { return _M_real; }
+  _Tp& imag() { return _M_imag; }
+  const _Tp& imag() const { return _M_imag; }
+#endif
+
+  void real(_Tp __val) { _M_real = __val; }
+
+  void imag(_Tp __val) { _M_imag = __val; }
+
+  // Assign ap_fixed to this complex number.
+  // Zero-initialize the imaginary part, so that `C c; c = 1; c == C(1,0);`
+  complex<_Tp> &operator=(const _Tp __t) {
+    _M_real = __t;
+    _M_imag = _Tp(0);
+    return *this;
+  }
+
+  // 26.2.5/1
+  // Add ap_fixed to this complex number.
+  complex<_Tp> &operator+=(const _Tp &__t) {
+    _M_real += __t;
+    return *this;
+  }
+
+  // 26.2.5/3
+  // Subtract ap_fixed from this complex number.
+  complex<_Tp> &operator-=(const _Tp &__t) {
+    _M_real -= __t;
+    return *this;
+  }
+
+  // 26.2.5/5
+  // Multiply this complex number by ap_fixed.
+  complex<_Tp> &operator*=(const _Tp &__t) {
+    _M_real *= __t;
+    _M_imag *= __t;
+    return *this;
+  }
+
+  // 26.2.5/7
+  // Divide this complex number by ap_fixed.
+  complex<_Tp> &operator/=(const _Tp &__t) {
+    _M_real /= __t;
+    _M_imag /= __t;
+    return *this;
+  }
+
+  // Assign complex number to this complex number.
+  template <typename _Up>
+  complex<_Tp> &operator=(const complex<_Up> &__z) {
+    _M_real = __z.real();
+    _M_imag = __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/9
+  // Add complex number to this.
+  template <typename _Up>
+  complex<_Tp> &operator+=(const complex<_Up> &__z) {
+    _M_real += __z.real();
+    _M_imag += __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/11
+  // Subtract complex number from this.
+  template <typename _Up>
+  complex<_Tp> &operator-=(const complex<_Up> &__z) {
+    _M_real -= __z.real();
+    _M_imag -= __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/13
+  // Multiply this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator*=(const complex<_Up> &__z) {
+    const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag();
+    _M_imag = _M_real * __z.imag() + _M_imag * __z.real();
+    _M_real = __r;
+    return *this;
+  }
+
+  // 26.2.5/15
+  // Divide this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator/=(const complex<_Up> &__z) {
+    complex<_Tp> cj(__z.real(), -__z.imag());
+    complex<_Tp> a = (*this) * cj;
+    complex<_Tp> b = cj * __z;
+    _M_real = a.real() / b.real();
+    _M_imag = a.imag() / b.real();
+    return *this;
+  }
+
+ private:
+  _Tp _M_real;
+  _Tp _M_imag;
+
+}; // class complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> >
+
+/*
+  Non-member operations
+  These operations are not required by the standard in 26.2.6, but libstdc++
+  defines them for the float, double and long double specializations.
+*/
+// Compare complex number with ap_fixed.
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+inline bool operator==(
+    const complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> > &__x,
+    const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) {
+  return __x.real() == __y &&
+         __x.imag() == 0;
+}
+
+// Compare ap_fixed with complex number.
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+inline bool operator==(
+    const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x,
+    const complex<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> > &__y) {
+  return __x == __y.real() &&
+         0 == __y.imag();
+}
+
+// Compare complex number with ap_fixed.
+template +inline bool operator!=( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + +// Compare ap_fixed with complex number. +template +inline bool operator!=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} // namespace std + +#endif // ifndef __AP_FIXED_SPECIAL_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int.h new file mode 100644 index 00000000..db3044d4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int.h @@ -0,0 +1,330 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_H__ +#define __AP_INT_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Sign Arbitrary Precision Type. +template +struct ap_int : ap_int_base<_AP_W, true> { + typedef ap_int_base<_AP_W, true> Base; + // Constructor + INLINE ap_int() : Base() {} + + // Copy ctor + INLINE ap_int(const ap_int& op) { Base::V = op.V; } + + template + INLINE ap_int(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const 
ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_int(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_int(double val) : Base(val) {} + ap_int(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_int(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_int(const char* s) : Base(s) {} + + INLINE ap_int(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* ctor will be used when right is not of proper type. */ + + INLINE ap_int& operator=(const ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_int& operator=(const volatile ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_int<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_int<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_int. + +//--------------------------------------------------------------- + +/// Unsigned Arbitrary Precision Type. +template +struct ap_uint : ap_int_base<_AP_W, false> { + typedef ap_int_base<_AP_W, false> Base; + // Constructor + INLINE ap_uint() : Base() {} + + // Copy ctor + INLINE ap_uint(const ap_uint& op) { Base::V = op.V; } + + template + INLINE ap_uint(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_uint(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + 
CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_uint(double val) : Base(val) {} + ap_uint(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_uint(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_uint(const char* s) : Base(s) {} + + INLINE ap_uint(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* XXX ctor will be used when right is not of proper type. */ + + INLINE ap_uint& operator=(const ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_uint<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_uint<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_uint. + +#define ap_bigint ap_int +#define ap_biguint ap_uint + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_int<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_uint<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +#include + +#endif // ifndef __AP_INT_H__ else + +// FIXME user should include ap_fixed.h when using ap_fixed. +// to avoid circular inclusion, must check whether this is required by +// ap_fixed.h +#ifndef __AP_FIXED_H__ +#include +#endif + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_base.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_base.h new file mode 100644 index 00000000..091552a8 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_base.h @@ -0,0 +1,1885 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_BASE_H__ +#define __AP_INT_BASE_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" +#else + +#include +#ifndef __SYNTHESIS__ +#if _AP_ENABLE_HALF_ == 1 +#include +#endif +#include +#include +#endif + +/* ---------------------------------------------------------------- + * ap_int_base: AutoPilot integer/Arbitrary precision integer. + * ---------------------------------------------------------------- + */ + +/* helper trait. 
Selecting the smallest C type that can hold the value, + * return 64 bit C type if not possible. + */ +template +struct retval; + +// at least 64 bit +template +struct retval<_AP_N, true> { + typedef ap_slong Type; +}; + +template +struct retval<_AP_N, false> { + typedef ap_ulong Type; +}; + +// at least 8 bit +template <> +struct retval<1, true> { + typedef signed char Type; +}; + +template <> +struct retval<1, false> { + typedef unsigned char Type; +}; + +// at least 16 bit +template <> +struct retval<2, true> { + typedef short Type; +}; + +template <> +struct retval<2, false> { + typedef unsigned short Type; +}; + +// at least 32 bit +template <> +struct retval<3, true> { + typedef long Type; +}; + +template <> +struct retval<3, false> { + typedef unsigned long Type; +}; + +template <> +struct retval<4, true> { + typedef long Type; +}; + +template <> +struct retval<4, false> { + typedef unsigned long Type; +}; + +// trait for letting base class to return derived class. +// Notice that derived class template is incomplete, and we cannot use +// the member of the derived class. +template +struct _ap_int_factory; +template +struct _ap_int_factory<_AP_W2,true> { typedef ap_int<_AP_W2> type; }; +template +struct _ap_int_factory<_AP_W2,false> { typedef ap_uint<_AP_W2> type; }; + +template +struct ap_int_base : public _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + + /* ap_int_base<_AP_W, _AP_S, true> + * typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType; + * + * ap_int_base<_AP_W, _AP_S, false> + * typedef typename retval<8, _AP_S>::Type RetType; + */ + typedef typename retval::Type RetType; + + static const int width = _AP_W; + + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + + typedef ap_int_base mult_base; + typedef ap_int_base plus_base; + typedef ap_int_base minus_base; + typedef ap_int_base logic_base; + typedef ap_int_base div_base; + typedef ap_int_base mod_base; + typedef ap_int_base<_AP_W, _AP_S> arg1_base; + + typedef typename _ap_int_factory::type mult; + typedef typename _ap_int_factory::type plus; + typedef typename _ap_int_factory::type minus; + typedef typename _ap_int_factory::type logic; + typedef typename _ap_int_factory::type div; + typedef typename _ap_int_factory::type mod; + typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1; + typedef bool reduce; + }; + + /* Constructors. + * ---------------------------------------------------------------- + */ + /// default ctor + INLINE ap_int_base() { + /* + #ifdef __SC_COMPATIBLE__ + Base::V = 0; + #endif + */ + } + + /// copy ctor + template + INLINE ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + /// volatile copy ctor + template + INLINE ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + +// XXX C++11 feature. +// The explicit specifier specifies that a constructor or conversion function +// (since C++11) doesn't allow implicit conversions or copy-initialization. 
+// ap_int_base x = 1; +// ap_int_base foo() { return 1; } +// but allows +// ap_int_base x(1); +// ap_int_base y {1}; + +/// from all c types. +#define CTOR_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base(const Type op) { Base::V = op; } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT + +#if _AP_ENABLE_HALF_ == 1 + /// ctor from half. + // TODO optimize + INLINE ap_int_base(half op) { + ap_int_base<_AP_W, _AP_S> t((float)op); + Base::V = t.V; + } +#endif + + /// ctor from float. + INLINE ap_int_base(float op) { + const int BITS = FLOAT_MAN + FLOAT_EXP + 1; + ap_int_base reg; + reg.V = floatToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, FLOAT_MAN, BITS - 2); + exp = exp - FLOAT_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, FLOAT_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(FLOAT_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, FLOAT_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7ffffffful) == 0) { + Base::V = 0; + } else { + int sh_amt = FLOAT_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < FLOAT_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// ctor from double. + INLINE ap_int_base(double op) { + const int BITS = DOUBLE_MAN + DOUBLE_EXP + 1; + ap_int_base reg; + reg.V = doubleToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, DOUBLE_MAN, BITS - 2); + exp = exp - DOUBLE_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, DOUBLE_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(DOUBLE_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7fffffffffffffffull) == 0) { + Base::V = 0; + } else { + int sh_amt = DOUBLE_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < DOUBLE_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// from higer rank type. 
+ template + INLINE ap_int_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + } + + template + INLINE ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { + Base::V = (ref.get()).V; + } + + template + INLINE ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { + Base::V = ref.operator bool(); + } + + template + INLINE ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { + const ap_int_base::_AP_WR, + false> + tmp = ref.get(); + Base::V = tmp.V; + } + + /* radix has default value in set */ + +#ifndef __SYNTHESIS__ + INLINE ap_int_base(const char* s, signed char rd = 0) { + if (rd == 0) + rd = guess_radix(s); + unsigned int length = strlen(s); + Base::V.fromString(s, length, rd); + } +#else + // XXX __builtin_bit_from_string(...) requires const C string and radix. + INLINE ap_int_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } + INLINE ap_int_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } +#endif + + template + INLINE ap_int_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = (val.get()).V; + } + + template + INLINE ap_int_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = val.operator bool(); + } + + INLINE ap_int_base read() volatile { + /*AP_DEBUG(printf("call read %d\n", Base::V););*/ + ap_int_base ret; + ret.V = Base::V; + return ret; + } + + INLINE void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + /*AP_DEBUG(printf("call write %d\n", op2.V););*/ + Base::V = op2.V; + } + + /* Another form of "write".*/ + template + INLINE void operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=( + const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE ap_int_base& operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + +#define ASSIGN_OP_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base& operator=(Type op) { \ + Base::V = op; \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, _AP_SIZE_short, true) + ASSIGN_OP_FROM_INT(unsigned short, _AP_SIZE_short, false) + ASSIGN_OP_FROM_INT(int, _AP_SIZE_int, true) + ASSIGN_OP_FROM_INT(unsigned int, _AP_SIZE_int, false) + ASSIGN_OP_FROM_INT(long, _AP_SIZE_long, true) + ASSIGN_OP_FROM_INT(unsigned long, _AP_SIZE_long, false) + ASSIGN_OP_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + ASSIGN_OP_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) + 
+#undef ASSIGN_OP_FROM_INT + + template + INLINE ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (bool)op2; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (ap_int_base<_AP_W2, false>(op2)).V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { + Base::V = op2.get().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = (bool)op; + return *this; + } + + template + INLINE ap_int_base& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; + return *this; + } + + // FIXME: UG902 has clearly required user to use to_int() to convert to built-in + // types, but this implicit conversion is relied on in hls_cordic.h and hls_rsr.h. + // For example: + // int d_exp = fps_x.exp - fps_y.exp; + INLINE operator RetType() const { return (RetType)(Base::V); } + + /* Explicit conversions to C types. + * ---------------------------------------------------------------- + */ + INLINE bool to_bool() const { return (bool)(Base::V); } + INLINE char to_char() const { return (char)(Base::V); } + INLINE signed char to_schar() const { return (signed char)(Base::V); } + INLINE unsigned char to_uchar() const { return (unsigned char)(Base::V); } + INLINE short to_short() const { return (short)(Base::V); } + INLINE unsigned short to_ushort() const { return (unsigned short)(Base::V); } + INLINE int to_int() const { return (int)(Base::V); } + INLINE unsigned to_uint() const { return (unsigned)(Base::V); } + INLINE long to_long() const { return (long)(Base::V); } + INLINE unsigned long to_ulong() const { return (unsigned long)(Base::V); } + INLINE ap_slong to_int64() const { return (ap_slong)(Base::V); } + INLINE ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } + INLINE float to_float() const { return (float)(Base::V); } + INLINE double to_double() const { return (double)(Base::V); } + + // TODO decide if user-defined conversion should be provided. +#if 0 + INLINE operator char() const { return (char)(Base::V); } + INLINE operator signed char() const { return (signed char)(Base::V); } + INLINE operator unsigned char() const { return (unsigned char)(Base::V); } + INLINE operator short() const { return (short)(Base::V); } + INLINE operator unsigned short() const { return (unsigned short)(Base::V); } + INLINE operator int() const { return (int)(Base::V); } + INLINE operator unsigned int () const { return (unsigned)(Base::V); } + INLINE operator long () const { return (long)(Base::V); } + INLINE operator unsigned long () const { return (unsigned long)(Base::V); } + INLINE operator ap_slong () { return (ap_slong)(Base::V); } + INLINE operator ap_ulong () { return (ap_ulong)(Base::V); } +#endif + + /* Helper methods. + ---------------------------------------------------------------- + */ + /* we cannot call a non-volatile function on a volatile instance. + * but calling a volatile function is ok. + * XXX deleted non-volatile version. 
+ */ + INLINE int length() const volatile { return _AP_W; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool iszero() const { return Base::V == 0; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool is_zero() const { return Base::V == 0; } + + /* x < 0 */ + INLINE bool sign() const { + if (_AP_S && + _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) + return true; + else + return false; + } + + /* x[i] = 0 */ + INLINE void clear(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + } + + /* x[i] = !x[i]*/ + INLINE void invert(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + bool val = _AP_ROOT_op_get_bit(Base::V, i); + if (val) + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + else + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + INLINE bool test(int i) const { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + return _AP_ROOT_op_get_bit(Base::V, i); + } + + // Get self. For ap_concat_ref expansion. + INLINE ap_int_base& get() { return *this; } + + // Set the ith bit into 1 + INLINE void set(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + // Set the ith bit into v + INLINE void set(int i, bool v) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the left + INLINE ap_int_base& lrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << n; + typeof(Base::V) r_p = Base::V >> (_AP_W - n); + Base::V = l_p | r_p; +#else + Base::V.lrotate(n); +#endif + return *this; + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the right + INLINE ap_int_base& rrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << (_AP_W - n); + typeof(Base::V) r_p = Base::V >> n; + Base::V = l_p | r_p; +#else + Base::V.rrotate(n); +#endif + return *this; + } + + // Reverse the contents of ap_int_base instance. + // I.e. LSB becomes MSB and vise versa. + INLINE ap_int_base& reverse() { + Base::V = _AP_ROOT_op_get_range(Base::V, _AP_W - 1, 0); + return *this; + } + + // Set the ith bit into v + INLINE void set_bit(int i, bool v) { + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + return (bool)_AP_ROOT_op_get_bit(Base::V, i); + } + + // complements every bit + INLINE void b_not() { Base::V = ~Base::V; } + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + Base::V Sym op2.V; \ + return *this; \ + } + + /* Arithmetic assign. + * ---------------------------------------------------------------- + */ + OP_ASSIGN_AP(*=) + OP_ASSIGN_AP(+=) + OP_ASSIGN_AP(-=) + OP_ASSIGN_AP(/=) + OP_ASSIGN_AP(%=) +#undef OP_ASSIGN_AP + + /* Bitwise assign: and, or, xor. 
+ * ---------------------------------------------------------------- + */ +#define OP_ASSIGN_AP_CHK(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + _AP_WARNING((_AP_W != _AP_W2), \ + "Bitsize mismatch for ap_[u]int" #Sym "ap_[u]int."); \ + Base::V Sym op2.V; \ + return *this; \ + } + OP_ASSIGN_AP_CHK(&=) + OP_ASSIGN_AP_CHK(|=) + OP_ASSIGN_AP_CHK(^=) +#undef OP_ASSIGN_AP_CHK + + /* Prefix increment, decrement. + * ---------------------------------------------------------------- + */ + INLINE ap_int_base& operator++() { + operator+=((ap_int_base<1, false>)1); + return *this; + } + INLINE ap_int_base& operator--() { + operator-=((ap_int_base<1, false>)1); + return *this; + } + + /* Postfix increment, decrement + * ---------------------------------------------------------------- + */ + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { + ap_int_base t = *this; + operator+=((ap_int_base<1, false>)1); + return t; + } + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { + ap_int_base t = *this; + operator-=((ap_int_base<1, false>)1); + return t; + } + + /* Unary arithmetic. + * ---------------------------------------------------------------- + */ + INLINE typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } + + // TODO used to be W>64 only... need check. + INLINE typename RType<1, false>::minus operator-() const { + return ap_int_base<1, false>(0) - *this; + } + + /* Not (!) + * ---------------------------------------------------------------- + */ + INLINE bool operator!() const { return Base::V == 0; } + + /* Bitwise (arithmetic) unary: complement + ---------------------------------------------------------------- + */ + // XXX different from Mentor's ac_int! + INLINE typename RType<_AP_W,_AP_S>::arg1 operator~() const { + ap_int_base<_AP_W, _AP_S> r; + r.V = ~Base::V; + return r; + } + + /* Shift (result constrained by left operand). + * ---------------------------------------------------------------- + */ + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>(sh); + } else + return operator<<(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V << op2.to_uint(); + return r; + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<(sh); + } + return operator>>(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V >> op2.to_uint(); + return r; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. 
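+
+// Usage sketch (illustrative only; variable names and values are
+// hypothetical): the shift operators above keep the width of the left
+// operand, and a negative signed shift amount reverses the direction.
+//
+//   ap_uint<8> a = 0x0F;
+//   ap_int<4> n = -2;
+//   ap_uint<8> b = a << n;  // negative amount shifts right: b == 0x03
+//   a <<= 4;                // width stays 8, high bits drop: a == 0xF0
+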
+#if 0 + template + INLINE ap_int_base operator<<(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this << (op2.operator ap_int_base<_AP_W2, false>()); + } + + template + INLINE ap_int_base operator>>(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this >> (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Shift assign + * ---------------------------------------------------------------- + */ + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>=(sh); + } else + return operator<<=(sh); + } + + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { + Base::V <<= op2.to_uint(); + return *this; + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<=(sh); + } + return operator>>=(sh); + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { + Base::V >>= op2.to_uint(); + return *this; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. +#if 0 + template + INLINE ap_int_base& operator<<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this <<= (op2.operator ap_int_base<_AP_W2, false>()); + } + template + INLINE ap_int_base& operator>>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this >>= (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Equality and Relational. + * ---------------------------------------------------------------- + */ + template + INLINE bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V == op2.V; + } + template + INLINE bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return !(Base::V == op2.V); + } + template + INLINE bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V < op2.V; + } + template + INLINE bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V >= op2.V; + } + template + INLINE bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V > op2.V; + } + template + INLINE bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V <= op2.V; + } + + /* Bit and Part Select + * ---------------------------------------------------------------- + */ + INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. 
+ INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + +#if 0 + template + INLINE ap_int_base slice() const { + AP_ASSERT(Hi >= Lo && Hi < _AP_W && Lo < _AP_W, "Out of bounds in slice()"); + ap_int_base tmp ; + tmp.V = _AP_ROOT_op_get_range(Base::V, Lo, Hi); + return tmp; + } + + INLINE ap_bit_ref<_AP_W,_AP_S> operator [] ( unsigned int uindex) { + AP_ASSERT(uindex < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> bvh( this, uindex ); + return bvh; + } +#endif + + INLINE ap_bit_ref<_AP_W, _AP_S> operator[](int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + + template + INLINE ap_bit_ref<_AP_W, _AP_S> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool operator[](int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + template + INLINE bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); + return br.to_bool(); + } + + INLINE ap_bit_ref<_AP_W, _AP_S> bit(int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + template + INLINE ap_bit_ref<_AP_W, _AP_S> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + 
AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool bit(int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + + template + INLINE bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { + return bit(index.to_int()); + } + +#if 0 + template + INLINE bool operator[](_AP_T index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> br = operator[](index); + return br.to_bool(); + } +#endif + + // Count the number of zeros from the most significant bit + // to the first one bit. + INLINE int countLeadingZeros() { +#ifdef __SYNTHESIS__ + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1UL), x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctz(t.V); // count trailing zeros. + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctzll(t.V); // count trailing zeros. + } else { + enum { __N = (_AP_W + 63) / 64 }; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.V = _AP_ROOT_op_get_range(this->V, _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 0 : __builtin_clzll(t.V); // count leading zeros. + hitNonZero |= (t.V != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + enum { REST = (_AP_W - 1) % 64 }; + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, 0, REST); + t.V = _AP_ROOT_op_set_range(t.V, 63 - REST, 63, x.V); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return (Base::V).countLeadingZeros(); +#endif + } // countLeadingZeros + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(ap_int_base<_AP_W2, _AP_S2>& a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { + return 
ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + a2); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast&>( + a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); + } + + template + INLINE ap_int_base operator&( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this & a2.get(); + } + + template + INLINE ap_int_base operator|( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, 
_AP_T3>& a2) { + return *this | a2.get(); + } + + template + INLINE ap_int_base operator^( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this ^ a2.get(); + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + Base::V = val.V; + } + + /* Reduce operations. + * ---------------------------------------------------------------- + */ + // XXX non-const version deleted. + INLINE bool and_reduce() const { return _AP_ROOT_op_reduce(and, Base::V); } + INLINE bool nand_reduce() const { return _AP_ROOT_op_reduce(nand, Base::V); } + INLINE bool or_reduce() const { return _AP_ROOT_op_reduce(or, Base::V); } + INLINE bool nor_reduce() const { return !(_AP_ROOT_op_reduce(or, Base::V)); } + INLINE bool xor_reduce() const { return _AP_ROOT_op_reduce (xor, Base::V); } + INLINE bool xnor_reduce() const { + return !(_AP_ROOT_op_reduce (xor, Base::V)); + } + + /* Output as a string. + * ---------------------------------------------------------------- + */ +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (rd == 2) sign = false; + return (Base::V).to_string(rd, sign); + } +#else + INLINE char* to_string(signed char rd = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_int_base + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_int_base<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_int_base<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + const std::ios_base::fmtflags basefield = in.flags() & std::ios_base::basefield; + unsigned radix = (basefield == std::ios_base::dec) ? 0 : ( + (basefield == std::ios_base::oct) ? 8 : ( + (basefield == std::ios_base::hex) ? 16 : 0)); + op = ap_int_base<_AP_W, _AP_S>(str.c_str(), radix); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Operators with another ap_int_base. 
+ * ---------------------------------------------------------------- + */ +#define OP_BIN_AP(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base lhs(op); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base rhs(op2); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + +OP_BIN_AP(*, mult) +OP_BIN_AP(+, plus) +OP_BIN_AP(-, minus) +OP_BIN_AP(&, logic) +OP_BIN_AP(|, logic) +OP_BIN_AP(^, logic) + +#define OP_BIN_AP2(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = op.V Sym op2.V; \ + return ret; \ + } + +OP_BIN_AP2(/, div) +OP_BIN_AP2(%, mod) + +// shift operators are defined inside class. +// compound assignment operators are defined inside class. + +/* Operators with a pointer type. + * ---------------------------------------------------------------- + * char a[100]; + * char* ptr = a; + * ap_int<2> n = 3; + * char* ptr2 = ptr + n*2; + * avoid ambiguous errors. + */ +#define OP_BIN_WITH_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_WITH_PTR(+) +OP_BIN_WITH_PTR(-) + +/* Operators with a native floating point types. + * ---------------------------------------------------------------- + */ +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_WITH_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define ALL_OP_WITH_FLOAT(C_TYPE) \ + OP_BIN_WITH_FLOAT(*, C_TYPE) \ + OP_BIN_WITH_FLOAT(/, C_TYPE) \ + OP_BIN_WITH_FLOAT(+, C_TYPE) \ + OP_BIN_WITH_FLOAT(-, C_TYPE) + +#if _AP_ENABLE_HALF_ == 1 +ALL_OP_WITH_FLOAT(half) +#endif +ALL_OP_WITH_FLOAT(float) +ALL_OP_WITH_FLOAT(double) + +// TODO no shift? + +/* Operators with a native integral types. + * ---------------------------------------------------------------- + */ +// arithmetic and bitwise operators. 
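+
+// Usage sketch (illustrative; names and values are hypothetical): the
+// overloads below first wrap the native C operand in an ap_int_base of its
+// natural width, so result widths follow the RType promotion rules.
+//
+//   ap_int<8> a = 100;
+//   auto p = a * 3;    // ap_int<8> * int: 8 + 32 = 40-bit product, p == 300
+//   auto s = a + 100;  // max(8, 32) + 1 = 33-bit sum, s == 200
+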
+#define OP_BIN_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_int_base<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(i_op) BIN_OP(op); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op BIN_OP ap_int_base<_AP_W2, _AP_S2>(i_op); \ + } + +#define ALL_OP_BIN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_WITH_INT(*, C_TYPE, _AP_W2, _AP_S2, mult) \ + OP_BIN_WITH_INT(+, C_TYPE, _AP_W2, _AP_S2, plus) \ + OP_BIN_WITH_INT(-, C_TYPE, _AP_W2, _AP_S2, minus) \ + OP_BIN_WITH_INT(/, C_TYPE, _AP_W2, _AP_S2, div) \ + OP_BIN_WITH_INT(%, C_TYPE, _AP_W2, _AP_S2, mod) \ + OP_BIN_WITH_INT(&, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(|, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(^, C_TYPE, _AP_W2, _AP_S2, logic) + +ALL_OP_BIN_WITH_INT(bool, 1, false) +ALL_OP_BIN_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_BIN_WITH_INT(signed char, 8, true) +ALL_OP_BIN_WITH_INT(unsigned char, 8, false) +ALL_OP_BIN_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_BIN_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_BIN_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_BIN_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_BIN_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_BIN_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_BIN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_BIN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_BIN_WITH_INT +#undef ALL_OP_BIN_WITH_INT + +// shift operators. +#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); \ + else \ + r.V = op.V << op2; \ + return r; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); \
+    else \
+      r.V = op.V >> op2; \
+    return r; \
+  }
+
+ALL_OP_SHIFT_WITH_INT(char, 8, CHAR_IS_SIGNED)
+ALL_OP_SHIFT_WITH_INT(signed char, 8, true)
+ALL_OP_SHIFT_WITH_INT(short, _AP_SIZE_short, true)
+ALL_OP_SHIFT_WITH_INT(int, _AP_SIZE_int, true)
+ALL_OP_SHIFT_WITH_INT(long, _AP_SIZE_long, true)
+ALL_OP_SHIFT_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true)
+
+#undef ALL_OP_SHIFT_WITH_INT
+
+#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W, _AP_S>::arg1 operator<<( \
+      const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \
+    ap_int_base<_AP_W, _AP_S> r; \
+    r.V = op.V << op2; \
+    return r; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W, _AP_S>::arg1 operator>>( \
+      const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \
+    ap_int_base<_AP_W, _AP_S> r; \
+    r.V = op.V >> op2; \
+    return r; \
+  }
+ALL_OP_SHIFT_WITH_INT(bool, 1, false)
+ALL_OP_SHIFT_WITH_INT(unsigned char, 8, false)
+ALL_OP_SHIFT_WITH_INT(unsigned short, _AP_SIZE_short, false)
+ALL_OP_SHIFT_WITH_INT(unsigned int, _AP_SIZE_int, false)
+ALL_OP_SHIFT_WITH_INT(unsigned long, _AP_SIZE_long, false)
+ALL_OP_SHIFT_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+
+#undef ALL_OP_SHIFT_WITH_INT
+
+// compound assign operators.
+#define OP_ASSIGN_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE ap_int_base<_AP_W, _AP_S>& operator ASSIGN_OP( \
+      ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \
+    return op ASSIGN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \
+  }
+
+// TODO int a; ap_int<16> b; a += b;
+
+#define ALL_OP_ASSIGN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(+=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(-=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(*=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(/=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(%=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(&=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(|=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(^=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(>>=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_ASSIGN_WITH_INT(<<=, C_TYPE, _AP_W2, _AP_S2)
+
+ALL_OP_ASSIGN_WITH_INT(bool, 1, false)
+ALL_OP_ASSIGN_WITH_INT(char, 8, CHAR_IS_SIGNED)
+ALL_OP_ASSIGN_WITH_INT(signed char, 8, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned char, 8, false)
+ALL_OP_ASSIGN_WITH_INT(short, _AP_SIZE_short, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned short, _AP_SIZE_short, false)
+ALL_OP_ASSIGN_WITH_INT(int, _AP_SIZE_int, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned int, _AP_SIZE_int, false)
+ALL_OP_ASSIGN_WITH_INT(long, _AP_SIZE_long, true)
+ALL_OP_ASSIGN_WITH_INT(unsigned long, _AP_SIZE_long, false)
+ALL_OP_ASSIGN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true)
+ALL_OP_ASSIGN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+
+#undef OP_ASSIGN_WITH_INT
+#undef ALL_OP_ASSIGN_WITH_INT
+
+// equality and relational operators.
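+
+// Usage sketch (illustrative; names and values are hypothetical): the
+// relational overloads below promote the C operand to its natural width and
+// compare by value, independent of the stored bit pattern.
+//
+//   ap_int<4> x = -1;     // stored as 0xF
+//   bool eq = (x == -1);  // true: compared by value
+//   bool gt = (x > 0);    // false
+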
+#define OP_REL_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator REL_OP(C_TYPE i_op, \
+                              const ap_int_base<_AP_W, _AP_S>& op) { \
+    return ap_int_base<_AP_W2, _AP_S2>(i_op) REL_OP op; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator REL_OP(const ap_int_base<_AP_W, _AP_S>& op, \
+                              C_TYPE op2) { \
+    return op REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \
+  }
+
+#define ALL_OP_REL_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(>, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(<, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(>=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(<=, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(==, C_TYPE, _AP_W2, _AP_S2) \
+  OP_REL_WITH_INT(!=, C_TYPE, _AP_W2, _AP_S2)
+
+ALL_OP_REL_WITH_INT(bool, 1, false)
+ALL_OP_REL_WITH_INT(char, 8, CHAR_IS_SIGNED)
+ALL_OP_REL_WITH_INT(signed char, 8, true)
+ALL_OP_REL_WITH_INT(unsigned char, 8, false)
+ALL_OP_REL_WITH_INT(short, _AP_SIZE_short, true)
+ALL_OP_REL_WITH_INT(unsigned short, _AP_SIZE_short, false)
+ALL_OP_REL_WITH_INT(int, _AP_SIZE_int, true)
+ALL_OP_REL_WITH_INT(unsigned int, _AP_SIZE_int, false)
+ALL_OP_REL_WITH_INT(long, _AP_SIZE_long, true)
+ALL_OP_REL_WITH_INT(unsigned long, _AP_SIZE_long, false)
+ALL_OP_REL_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true)
+ALL_OP_REL_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+
+#undef OP_REL_WITH_INT
+#undef ALL_OP_REL_WITH_INT
+
+#define OP_REL_WITH_DOUBLE_OR_FLOAT(Sym) \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \
+                           double op2) { \
+    return op1.to_double() Sym op2; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(double op1, \
+                           const ap_int_base<_AP_W, _AP_S>& op2) { \
+    return op1 Sym op2.to_double(); \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \
+                           float op2) { \
+    return op1.to_double() Sym op2; \
+  } \
+  template <int _AP_W, bool _AP_S> \
+  INLINE bool operator Sym(float op1, \
+                           const ap_int_base<_AP_W, _AP_S>& op2) { \
+    return op1 Sym op2.to_double(); \
+  }
+ OP_REL_WITH_DOUBLE_OR_FLOAT(>)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(<)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(>=)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(<=)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(==)
+ OP_REL_WITH_DOUBLE_OR_FLOAT(!=)
+
+#undef OP_REL_WITH_DOUBLE_OR_FLOAT
+
+
+/* Operators with ap_range_ref.
+ * ------------------------------------------------------------
+ */
+// arithmetic, bitwise and shift operators.
+#define OP_BIN_WITH_RANGE(BIN_OP, RTYPE) \
+  template <int _AP_W1, bool _AP_S1, int _AP_W2, bool _AP_S2> \
+  INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \
+                                                              _AP_S2>::RTYPE \
+  operator BIN_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \
+                  const ap_int_base<_AP_W2, _AP_S2>& op2) { \
+    return ap_int_base<_AP_W1, false>(op1) BIN_OP op2; \
+  } \
+  template <int _AP_W1, bool _AP_S1, int _AP_W2, bool _AP_S2> \
+  INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \
+                                                              _AP_S2>::RTYPE \
+  operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \
+                  const ap_range_ref<_AP_W2, _AP_S2>& op2) { \
+    return op1 BIN_OP ap_int_base<_AP_W2, false>(op2); \
+  }
+
+OP_BIN_WITH_RANGE(+, plus)
+OP_BIN_WITH_RANGE(-, minus)
+OP_BIN_WITH_RANGE(*, mult)
+OP_BIN_WITH_RANGE(/, div)
+OP_BIN_WITH_RANGE(%, mod)
+OP_BIN_WITH_RANGE(&, logic)
+OP_BIN_WITH_RANGE(|, logic)
+OP_BIN_WITH_RANGE(^, logic)
+OP_BIN_WITH_RANGE(>>, arg1)
+OP_BIN_WITH_RANGE(<<, arg1)
+
+#undef OP_BIN_WITH_RANGE
+
+// compound assignment operators.
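+
+// Usage sketch (illustrative; names and values are hypothetical): the
+// overloads below allow a part-select on either side of a compound
+// assignment; the effect is a read-modify-write of the selected bits:
+//
+//   ap_uint<16> r = 0x1234;
+//   ap_uint<8> lo = r(7, 0);  // read the low byte
+//   lo += 1;                  // update it
+//   r(7, 0) = lo;             // write it back: r == 0x1235
+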
+#define OP_ASSIGN_WITH_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<_AP_W2, false>(op2); \ + } \ + template \ + INLINE ap_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_range_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<_AP_W1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_RANGE(+=) +OP_ASSIGN_WITH_RANGE(-=) +OP_ASSIGN_WITH_RANGE(*=) +OP_ASSIGN_WITH_RANGE(/=) +OP_ASSIGN_WITH_RANGE(%=) +OP_ASSIGN_WITH_RANGE(&=) +OP_ASSIGN_WITH_RANGE(|=) +OP_ASSIGN_WITH_RANGE(^=) +OP_ASSIGN_WITH_RANGE(>>=) +OP_ASSIGN_WITH_RANGE(<<=) + +#undef OP_ASSIGN_WITH_RANGE + +// equality and relational operators +#define OP_REL_WITH_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_int_base<_AP_W2, false>()); \ + } + +OP_REL_WITH_RANGE(==) +OP_REL_WITH_RANGE(!=) +OP_REL_WITH_RANGE(>) +OP_REL_WITH_RANGE(>=) +OP_REL_WITH_RANGE(<) +OP_REL_WITH_RANGE(<=) + +#undef OP_REL_WITH_RANGE + +/* Operators with ap_bit_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +#define OP_BIN_WITH_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 BIN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) BIN_OP op2; \ + } + +OP_BIN_WITH_BIT(+, plus) +OP_BIN_WITH_BIT(-, minus) +OP_BIN_WITH_BIT(*, mult) +OP_BIN_WITH_BIT(/, div) +OP_BIN_WITH_BIT(%, mod) +OP_BIN_WITH_BIT(&, logic) +OP_BIN_WITH_BIT(|, logic) +OP_BIN_WITH_BIT(^, logic) +OP_BIN_WITH_BIT(>>, arg1) +OP_BIN_WITH_BIT(<<, arg1) + +#undef OP_BIN_WITH_BIT + +// compound assignment operators. +#define OP_ASSIGN_WITH_BIT(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE ap_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_bit_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_BIT(+=) +OP_ASSIGN_WITH_BIT(-=) +OP_ASSIGN_WITH_BIT(*=) +OP_ASSIGN_WITH_BIT(/=) +OP_ASSIGN_WITH_BIT(%=) +OP_ASSIGN_WITH_BIT(&=) +OP_ASSIGN_WITH_BIT(|=) +OP_ASSIGN_WITH_BIT(^=) +OP_ASSIGN_WITH_BIT(>>=) +OP_ASSIGN_WITH_BIT(<<=) + +#undef OP_ASSIGN_WITH_BIT + +// equality and relational operators. 
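+
+// Usage sketch (illustrative; names and values are hypothetical): a
+// single-bit reference from operator[] can be compared against an ap_int
+// value through the overloads below.
+//
+//   ap_uint<8> f = 0x81;
+//   ap_uint<1> one = 1;
+//   bool msb_set = (f[7] == one);  // true
+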
+#define OP_REL_WITH_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 REL_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) REL_OP op2; \ + } + +OP_REL_WITH_BIT(==) +OP_REL_WITH_BIT(!=) +OP_REL_WITH_BIT(>) +OP_REL_WITH_BIT(>=) +OP_REL_WITH_BIT(<) +OP_REL_WITH_BIT(<=) + +#undef OP_REL_WITH_BIT + + +/* Operators with ap_concat_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +// bitwise operators are defined in struct. +// TODO specify whether to define arithmetic and bitwise operators. +#if 0 +#define OP_BIN_WITH_CONCAT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 BIN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator BIN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() BIN_OP op2; \ + } + +OP_BIN_WITH_CONCAT(+, plus) +OP_BIN_WITH_CONCAT(-, minus) +OP_BIN_WITH_CONCAT(*, mult) +OP_BIN_WITH_CONCAT(/, div) +OP_BIN_WITH_CONCAT(%, mod) +OP_BIN_WITH_CONCAT(&, logic) +OP_BIN_WITH_CONCAT(|, logic) +OP_BIN_WITH_CONCAT(^, logic) +OP_BIN_WITH_CONCAT(>>, arg1) +OP_BIN_WITH_CONCAT(<<, arg1) + +#undef OP_BIN_WITH_CONCAT + +// compound assignment operators. +#define OP_ASSIGN_WITH_CONCAT(ASSIGN_OP) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator ASSIGN_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 ASSIGN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator ASSIGN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + ap_int_base<_AP_W1 + _AP_W2, false> tmp = op1.get(); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_CONCAT(+=) +OP_ASSIGN_WITH_CONCAT(-=) +OP_ASSIGN_WITH_CONCAT(*=) +OP_ASSIGN_WITH_CONCAT(/=) +OP_ASSIGN_WITH_CONCAT(%=) +OP_ASSIGN_WITH_CONCAT(&=) +OP_ASSIGN_WITH_CONCAT(|=) +OP_ASSIGN_WITH_CONCAT(^=) +OP_ASSIGN_WITH_CONCAT(>>=) +OP_ASSIGN_WITH_CONCAT(<<=) + +#undef OP_ASSIGN_WITH_CONCAT +#endif + +// equality and relational operators. 
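+
+// Usage sketch (illustrative; names and values are hypothetical): a
+// concatenation built with operator, compares directly against an ap_int
+// value through the overloads below.
+//
+//   ap_uint<4> hi = 0xA, lo = 0x5;
+//   bool match = ((hi, lo) == ap_uint<8>(0xA5));  // true
+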
+#define OP_REL_WITH_CONCAT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 REL_OP op2.get(); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() REL_OP op2; \ + } + +OP_REL_WITH_CONCAT(==) +OP_REL_WITH_CONCAT(!=) +OP_REL_WITH_CONCAT(>) +OP_REL_WITH_CONCAT(>=) +OP_REL_WITH_CONCAT(<) +OP_REL_WITH_CONCAT(<=) + +#undef OP_REL_WITH_CONCAT + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_BASE_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_ref.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_ref.h new file mode 100644 index 00000000..421f09fd --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_ref.h @@ -0,0 +1,1346 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_REF_H__ +#define __AP_INT_REF_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else + +#ifndef __SYNTHESIS__ +#include +#endif + +/* Concatination reference. + ---------------------------------------------------------------- +*/ +template +struct ap_concat_ref { + enum { + _AP_WR = _AP_W1 + _AP_W2, + }; + + _AP_T1& mbv1; + _AP_T2& mbv2; + + INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) + : mbv1(ref.mbv1), mbv2(ref.mbv2) {} + + INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} + + template + INLINE ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> Part1; + Part1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(Part1); + ap_int_base<_AP_W2, false> Part2; + Part2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(Part2); + return *this; + } + + // assign op from hls supported C integral types. 
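+  // (Hedged sketch of the assignment behaviour implemented below; names are
+  //  user-side assumptions: given `ap_uint<4> a, b;`, the statement
+  //  `(a, b) = 0xAB;` writes the high nibble 0xA into a and the low nibble
+  //  0xB into b, split according to each side's length().)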
+ // FIXME disabled to support legacy code directly assign from sc_signal + //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_concat_ref&>::type + //operator=(T val) { + // ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + // return operator=(tmpVal); + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_concat_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); \ + return operator=(tmpVal); \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + template + INLINE ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((const ap_int_base<_AP_W3, false>)(val)); + } + + template + INLINE ap_concat_ref& operator=( + const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& + val) { + return operator=(val.to_ap_int_base()); + } + + template + INLINE ap_concat_ref& operator=( + const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + INLINE operator ap_int_base<_AP_WR, false>() const { return get(); } + + INLINE operator ap_ulong() const { return get().to_uint64(); } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> > + operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + 
ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + // FIXME op's life does not seem long enough + ap_int_base<_AP_W3, _AP_S3> op(a2); + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > + operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > + operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast< + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> + &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE ap_int_base operator&( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() & a2; + } + + template + INLINE ap_int_base operator|( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() | a2; + } + + template + INLINE ap_int_base operator^( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() ^ a2; + } + +#if 0 + template + INLINE ap_int_base slice() { + ap_int_base<_AP_WR, false> bv = get(); + return bv.slice(); + } +#endif + + INLINE ap_int_base<_AP_WR, false> get() const { + ap_int_base<_AP_WR, false> tmpVal(0); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W2, false> v2(mbv2); + ap_int_base<_AP_W1, false> v1(mbv1); + tmpVal.V = _AP_ROOT_op_set_range(tmpVal.V, 0, W_ref2 - 1, v2.V); + tmpVal.V = + _AP_ROOT_op_set_range(tmpVal.V, W_ref2, W_ref1 + W_ref2 - 1, v1.V); + return tmpVal; + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> tmpVal1; + tmpVal1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(tmpVal1); + ap_int_base<_AP_W2, false> tmpVal2; + tmpVal2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(tmpVal2); + } + + INLINE int length() const { return mbv1.length() + mbv2.length(); } +}; // struct ap_concat_ref + +/* Range (slice) reference. + ---------------------------------------------------------------- +*/ +template +struct ap_range_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_range_ref. 
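+  // Hedged usage sketch: this proxy is what ap_int's range()/operator()
+  // return, and writes go through to the referenced word (names below are
+  // user-side assumptions):
+  //
+  //   ap_uint<8> x = 0xF0;
+  //   x.range(3, 0) = 0x5;            // write through the proxy, x == 0xF5
+  //   ap_uint<4> n = x.range(7, 4);   // read converts to ap_int_base
+  //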
+ typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + INLINE ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + INLINE ap_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) {} + + INLINE ap_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE operator ap_ulong() const { return to_uint64(); } + + /// @name assign operators + // @{ + + // FIXME disabled to work-around lagacy code assigning from sc_signal, + // which dependes on implicit type conversion. + // + // /// assign from hls supported C integral types. + // template + // INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_range_ref&>::type + // operator=(T val) { + // ap_int_base<_AP_W, false> tmp(val); + // d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + // return *this; + // } +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_range_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W, false> tmp(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + /// assign using string. XXX crucial for cosim. + INLINE ap_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + ap_int_base<_AP_W, false> tmp(val); + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// copy assign operator + // XXX Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_range_ref& operator=(const ap_range_ref& val) { + return operator=((const ap_int_base<_AP_W, false>)val); + } + + /// assign from range reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + return operator=(val.to_ap_int_base()); + } + + /// assign from range reference to ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_fixed_base. 
+ template + INLINE ap_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from compound reference. + template + INLINE ap_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); + } + // @} + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > + operator,(ap_int_base<_AP_W, _AP_S>& a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, + ap_int_base<_AP_W, _AP_S> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop == hop; + } + + template + INLINE bool 
operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop < hop; + } + + template + INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop <= hop; + } + + template + INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<=(op2)); + } + + template + INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= op2.V; + return *this; + }; + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE bool and_reduce() const { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret &= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool or_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret |= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool xor_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? 
h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret ^= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } +#ifndef __SYNTHESIS__ + std::string to_string(signed char radix = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(radix); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char radix = 2) const { + return 0; + } +#endif +}; // struct ap_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_range_ref<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_range_ref<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + op = ap_int_base<_AP_W, _AP_S>(str.c_str()); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Bit reference. + ---------------------------------------------------------------- +*/ +template +struct ap_bit_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_bit_ref. + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int d_index; + + public: + // copy ctor + INLINE ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + INLINE ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE ap_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + INLINE bool to_bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + // assign op from hls supported C integral types. + // FIXME disabled to support sc_signal. + // NOTE this used to be unsigned long long. 
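+  // (Hedged sketch, names assumed: the ASSIGN_WITH_CTYPE overloads below let
+  //  user code set one bit straight from a C value, e.g.
+  //      ap_uint<8> x = 0;
+  //      x[7] = 1;   // ap_bit_ref& operator=(int), sets the MSB
+  //  while the generic template version stays disabled as noted above.)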
+ //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_bit_ref&>::type + //operator=(T val) { + // d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + // return *this; + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) + +#undef ASSIGN_WITH_CTYPE + +#define ASSIGN_WITH_CTYPE_FP(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + bool tmp_val = val; \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index,tmp_val); \ + return *this; \ + } + +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE_FP(half) +#endif + ASSIGN_WITH_CTYPE_FP(float) + ASSIGN_WITH_CTYPE_FP(double) + +#undef ASSIGN_WITH_CTYPE_FP + + + template + INLINE ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(val.V != 0)); + } + + template + INLINE ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_int_base<_AP_W2, false>)val); + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_bit_ref& operator=(const ap_bit_ref& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, 
const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() != op.get(); + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE bool get() { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + operator=(val); + } + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? "1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct ap_bit_ref + +/* ap_range_ref with int. + * ------------------------------------------------------------ + */ +// equality and relational operators. 
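+// Hedged sketch of the comparisons generated below (user-side names):
+//
+//   ap_uint<8> x = 0x42;
+//   bool hi_ok  = (x.range(7, 4) == 4);   // range proxy vs. int
+//   bool bit_ok = (x[1] == true);         // bit proxy vs. bool
+//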
+#define REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP bool(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W + _AP_W1, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_1(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>=, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_1(bool, 1, false) +REF_REL_WITH_INT_1(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_1(signed char, 8, true) +REF_REL_WITH_INT_1(unsigned char, 8, false) +REF_REL_WITH_INT_1(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_1(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_1(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_1(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_1(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_1(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_1(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_1(ap_ulong, _AP_SIZE_ap_slong, false) + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_2(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(==, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(!=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_2(bool, 1, false) +REF_REL_WITH_INT_2(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_2(signed char, 8, true) +REF_REL_WITH_INT_2(unsigned char, 8, false) +REF_REL_WITH_INT_2(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_2(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_2(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_2(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_2(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_2(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_2(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_2(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_REL_OP_WITH_INT +#undef REF_REL_WITH_INT_1 +#undef REF_REL_WITH_INT_2 + +#define REF_BIN_OP_WITH_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, false>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + BIN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W2, _AP_S2>::template RType<_AP_W, \ + false>::RTYPE \ + operator BIN_OP(C_TYPE op2, const ap_range_ref<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + BIN_OP ap_int_base<_AP_W, false>(op); \ + } + +// arithmetic operators. 
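+// Hedged sketch: results follow ap_int_base's RType promotion rules, so
+// mixed arithmetic keeps full precision (names assumed):
+//
+//   ap_uint<8> x = 200;
+//   ap_int<10> s = x.range(7, 0) + 100;   // wide enough to hold 300
+//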
+#define REF_BIN_OP_WITH_INT_ARITH(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_ARITH(bool, 1, false) +REF_BIN_OP_WITH_INT_ARITH(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_ARITH(signed char, 8, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_ARITH(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_ARITH(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_ARITH(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_ARITH(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_ARITH(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_ARITH + +// bitwise and shift operators +#define REF_BIN_OP_WITH_INT_BITS(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_BITS(bool, 1, false) +REF_BIN_OP_WITH_INT_BITS(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_BITS(signed char, 8, true) +REF_BIN_OP_WITH_INT_BITS(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_BITS(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_BITS(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_BITS(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_BITS(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_BITS(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_BITS(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_BITS(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_BITS(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_BITS + +/* ap_range_ref with ap_range_ref + * ------------------------------------------------------------ + */ +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& lhs, \ + const ap_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return (lhs.operator ap_int_base<_AP_W, false>())BIN_OP( \ + rhs.operator ap_int_base<_AP_W2, false>()); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) + +/* ap_concat_ref with ap_concat_ref. + * ------------------------------------------------------------ + */ + +//************************************************************************ +// Implement +// ap_int_base = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). 
+// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. +//************************************************************************ + +#undef SYN_CONCAT_REF_BIN_OP + +#define SYN_CONCAT_REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< \ + _AP_RW1 + _AP_RW2, false>::RTYPE \ + operator BIN_OP( \ + const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, \ + const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { \ + return lhs.get() BIN_OP rhs.get(); \ + } + +SYN_CONCAT_REF_BIN_OP(+, plus) +SYN_CONCAT_REF_BIN_OP(-, minus) +SYN_CONCAT_REF_BIN_OP(*, mult) +SYN_CONCAT_REF_BIN_OP(/, div) +SYN_CONCAT_REF_BIN_OP(%, mod) +SYN_CONCAT_REF_BIN_OP(&, logic) +SYN_CONCAT_REF_BIN_OP(|, logic) +SYN_CONCAT_REF_BIN_OP(^, logic) +SYN_CONCAT_REF_BIN_OP(>>, arg1) +SYN_CONCAT_REF_BIN_OP(<<, arg1) + +#undef SYN_CONCAT_REF_BIN_OP + +#define CONCAT_OP_WITH_INT(C_TYPE, _AP_WI, _AP_SI) \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_int_base<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_int_base<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + if (_AP_S) { \ + ret <<= _AP_WI; \ + ret >>= _AP_WI; \ + } \ + ret |= val << _AP_W; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_range_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_range_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + const ap_bit_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, false> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + C_TYPE op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + 1, false> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op2); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W + _AP_W2; \ + val >>= _AP_W + _AP_W2; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + C_TYPE op1, const 
ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op1); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } + +CONCAT_OP_WITH_INT(bool, 1, false) +CONCAT_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +CONCAT_OP_WITH_INT(signed char, 8, true) +CONCAT_OP_WITH_INT(unsigned char, 8, false) +CONCAT_OP_WITH_INT(short, _AP_SIZE_short, true) +CONCAT_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +CONCAT_OP_WITH_INT(int, _AP_SIZE_int, true) +CONCAT_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +CONCAT_OP_WITH_INT(long, _AP_SIZE_long, true) +CONCAT_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +CONCAT_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +CONCAT_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef CONCAT_OP_WITH_INT + +#define CONCAT_SHIFT_WITH_INT(C_TYPE, OP) \ + template \ + INLINE ap_uint<_AP_W + _AP_W1> operator OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, C_TYPE rhs) { \ + return ap_uint<_AP_W + _AP_W1>(lhs).get() OP int(rhs); \ + } + +// FIXME int(rhs) may loose precision. + +CONCAT_SHIFT_WITH_INT(int, <<) +CONCAT_SHIFT_WITH_INT(unsigned int, <<) +CONCAT_SHIFT_WITH_INT(long, <<) +CONCAT_SHIFT_WITH_INT(unsigned long, <<) +CONCAT_SHIFT_WITH_INT(ap_slong, <<) +CONCAT_SHIFT_WITH_INT(ap_ulong, <<) + +CONCAT_SHIFT_WITH_INT(int, >>) +CONCAT_SHIFT_WITH_INT(unsigned int, >>) +CONCAT_SHIFT_WITH_INT(long, >>) +CONCAT_SHIFT_WITH_INT(unsigned long, >>) +CONCAT_SHIFT_WITH_INT(ap_slong, >>) +CONCAT_SHIFT_WITH_INT(ap_ulong, >>) + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_REF_H__ + +// -*- cpp -*- diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_special.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_special.h new file mode 100644 index 00000000..3afc6192 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_int_special.h @@ -0,0 +1,223 @@ +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_INT_SPECIAL_H__
+#define __AP_INT_SPECIAL_H__
+
+#ifndef __AP_INT_H__
+#error "Only ap_fixed.h and ap_int.h can be included directly in user code."
+#endif
+
+#ifndef __SYNTHESIS__
+#include
+#include
+#endif
+// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of
+// include.
+// #include <complex>
+namespace std {
+template <typename _Tp> class complex;
+}
+
+/*
+  TODO: Modernize the code using C++11/C++14
+  1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html
+  2. move constructor
+*/
+
+namespace std {
+/*
+   Specialize std::complex to zero-initialize ap_int.
+
+   To reduce the area cost, ap_int is not zero initialized, just like the
+   basic types float or double. However, libstdc++ provides specializations
+   for float, double and long double, initializing the imaginary part to 0
+   when not specified.
+
+   This has become a difficulty in switching legacy code from these C types to
+   ap_int. To ease the transition of legacy code, we have to implement a
+   specialization of std::complex<> for our type.
+
+   As ap_int is a template, it is impossible to specialize only the methods
+   that cause default initialization of the value type in std::complex<>. An
+   explicit full specialization of the template class has to be done, covering
+   all the member functions and operators of std::complex<> as specified
+   in standard sections 26.2.4 and 26.2.5.
+*/
+template <int _AP_W>
+class complex<ap_int<_AP_W> > {
+ public:
+  typedef ap_int<_AP_W> _Tp;
+  typedef _Tp value_type;
+
+  // 26.2.4/1
+  // Constructor without argument
+  // Default initialize, so that in dataflow, the variable is only written once.
+  complex() : _M_real(_Tp()), _M_imag(_Tp()) {}
+  // Constructor with ap_int.
+  // Zero initialize the imaginary part when not specified, so that
+  // `C(1) == C(1,0)`
+  complex(const _Tp &__r, const _Tp &__i = _Tp(0))
+      : _M_real(__r), _M_imag(__i) {}
+
+  // Constructor with another complex number
+  template <typename _Up>
+  complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {}
+
+#if __cplusplus >= 201103L
+  const _Tp& real() const { return _M_real; }
+  const _Tp& imag() const { return _M_imag; }
+#else
+  _Tp& real() { return _M_real; }
+  const _Tp& real() const { return _M_real; }
+  _Tp& imag() { return _M_imag; }
+  const _Tp& imag() const { return _M_imag; }
+#endif
+
+  void real(_Tp __val) { _M_real = __val; }
+
+  void imag(_Tp __val) { _M_imag = __val; }
+
+  // Assign this complex number with ap_int.
+  // Zero initialize the imaginary part, so that `C c; c = 1; c == C(1,0);`
+  complex<_Tp> &operator=(const _Tp __t) {
+    _M_real = __t;
+    _M_imag = _Tp(0);
+    return *this;
+  }
+
+  // 26.2.5/1
+  // Add ap_int to this complex number.
+  complex<_Tp> &operator+=(const _Tp &__t) {
+    _M_real += __t;
+    return *this;
+  }
+
+  // 26.2.5/3
+  // Subtract ap_int from this complex number.
+  complex<_Tp> &operator-=(const _Tp &__t) {
+    _M_real -= __t;
+    return *this;
+  }
+
+  // 26.2.5/5
+  // Multiply this complex number by ap_int.
+  complex<_Tp> &operator*=(const _Tp &__t) {
+    _M_real *= __t;
+    _M_imag *= __t;
+    return *this;
+  }
+
+  // 26.2.5/7
+  // Divide this complex number by ap_int.
+  complex<_Tp> &operator/=(const _Tp &__t) {
+    _M_real /= __t;
+    _M_imag /= __t;
+    return *this;
+  }
+
+  // Assign complex number to this complex number.
+  template <typename _Up>
+  complex<_Tp> &operator=(const complex<_Up> &__z) {
+    _M_real = __z.real();
+    _M_imag = __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/9
+  // Add complex number to this.
+  template <typename _Up>
+  complex<_Tp> &operator+=(const complex<_Up> &__z) {
+    _M_real += __z.real();
+    _M_imag += __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/11
+  // Subtract complex number from this.
+  template <typename _Up>
+  complex<_Tp> &operator-=(const complex<_Up> &__z) {
+    _M_real -= __z.real();
+    _M_imag -= __z.imag();
+    return *this;
+  }
+
+  // 26.2.5/13
+  // Multiply this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator*=(const complex<_Up> &__z) {
+    const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag();
+    _M_imag = _M_real * __z.imag() + _M_imag * __z.real();
+    _M_real = __r;
+    return *this;
+  }
+
+  // 26.2.5/15
+  // Divide this by complex number.
+  template <typename _Up>
+  complex<_Tp> &operator/=(const complex<_Up> &__z) {
+    complex<_Tp> cj (__z.real(), -__z.imag());
+    complex<_Tp> a = (*this) * cj;
+    complex<_Tp> b = cj * __z;
+    _M_real = a.real() / b.real();
+    _M_imag = a.imag() / b.real();
+    return *this;
+  }
+
+ private:
+  _Tp _M_real;
+  _Tp _M_imag;
+
+}; // class complex<ap_int<_AP_W> >
+
+
+/*
+   Non-member operations
+   These operations are not required by the standard in 26.2.6, but libstdc++
+   defines them for the float, double and long double specializations.
+*/
+// Compare complex number with ap_int.
+template <int _AP_W>
+inline bool operator==(const complex<ap_int<_AP_W> > &__x, const ap_int<_AP_W> &__y) {
+  return __x.real() == __y &&
+         __x.imag() == 0;
+}
+
+// Compare ap_int with complex number.
+template <int _AP_W>
+inline bool operator==(const ap_int<_AP_W> &__x, const complex<ap_int<_AP_W> > &__y) {
+  return __x == __y.real() &&
+         0 == __y.imag();
+}
+
+// Compare complex number with ap_int.
+template <int _AP_W>
+inline bool operator!=(const complex<ap_int<_AP_W> > &__x, const ap_int<_AP_W> &__y) {
+  return __x.real() != __y ||
+         __x.imag() != 0;
+}
+
+// Compare ap_int with complex number.
+template <int _AP_W>
+inline bool operator!=(const ap_int<_AP_W> &__x, const complex<ap_int<_AP_W> > &__y) {
+  return __x != __y.real() ||
+         0 != __y.imag();
+}
+
+} // namespace std
+
+#endif // ifndef __AP_INT_SPECIAL_H__
+
+// -*- cpp -*-
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_shift_reg.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_shift_reg.h
new file mode 100644
index 00000000..94dba51e
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/ap_shift_reg.h
@@ -0,0 +1,138 @@
+/*
+#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved.
+#-
+#- This file contains confidential and proprietary information
+#- of Xilinx, Inc. and is protected under U.S. and
+#- international copyright and other intellectual property
+#- laws.
+#-
+#- DISCLAIMER
+#- This disclaimer is not a license and does not grant any
+#- rights to the materials distributed herewith.
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __SIM_AP_SHIFT_REG_H__ +#define __SIM_AP_SHIFT_REG_H__ + + +/* + * This file contains a C++ model of shift register. + * It defines C level simulation model. 
+ */
+#ifndef __cplusplus
+#error C++ is required to include this header file
+#else
+
+#include <assert.h>
+
+//////////////////////////////////////////////
+// C level simulation model for ap_shift_reg
+//////////////////////////////////////////////
+template <typename __SHIFT_T__, unsigned int __SHIFT_DEPTH__ = 32>
+class ap_shift_reg
+{
+  public:
+    /// Constructors
+    ap_shift_reg() { }
+    ap_shift_reg(const char* name) { }
+    /// Destructor
+    virtual ~ap_shift_reg() { }
+
+  private:
+    /// Make copy constructor and assignment operator private
+    ap_shift_reg(const ap_shift_reg< __SHIFT_T__, __SHIFT_DEPTH__ >& shreg)
+    {
+        for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i)
+            Array[i] = shreg.Array[i];
+    }
+
+    ap_shift_reg& operator = (const ap_shift_reg< __SHIFT_T__,
+                                                  __SHIFT_DEPTH__ >& shreg)
+    {
+        for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i)
+            Array[i] = shreg.Array[i];
+        return *this;
+    }
+
+  public:
+    // Shift the queue, push to back and read from a given address.
+    __SHIFT_T__ shift(__SHIFT_T__ DataIn,
+                      unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true)
+    {
+        assert(Addr < __SHIFT_DEPTH__ &&
+               "Out-of-bound shift is found in ap_shift_reg.");
+        __SHIFT_T__ ret = Array[Addr];
+        if (Enable) {
+            for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i)
+                Array[i] = Array[i-1];
+            Array[0] = DataIn;
+        }
+        return ret;
+    }
+
+    // Read from a given address.
+    __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const
+    {
+        assert(Addr < __SHIFT_DEPTH__ &&
+               "Out-of-bound read is found in ap_shift_reg.");
+        return Array[Addr];
+    }
+
+  protected:
+    __SHIFT_T__ Array[__SHIFT_DEPTH__];
+};
+
+#endif //__cplusplus
+
+#endif //__SIM_AP_SHIFT_REG_H__
+
+
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/etc/ap_private.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/etc/ap_private.h
new file mode 100644
index 00000000..0c29a0ac
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/etc/ap_private.h
@@ -0,0 +1,7199 @@
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_PRIVATE_H__
+#define __AP_PRIVATE_H__
+
+// common macros and type declarations are now defined in ap_common.h, and
+// ap_private becomes part of it.
+#ifndef __AP_COMMON_H__
+#error "etc/ap_private.h cannot be included directly."
+#endif
+
+// forward declarations
+//template
+//class ap_private; // moved to ap_common.h
+template <int _AP_W, bool _AP_S>
+struct _private_range_ref;
+template <int _AP_W, bool _AP_S>
+struct _private_bit_ref;
+
+// TODO clean up this part.
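+// Hedged note: the helpers below (Hi_32/Lo_32, CountLeadingZeros_32/_64,
+// CountTrailingZeros_64, CountPopulation_64) back ap_private's multi-word
+// arithmetic and radix conversion. Expected semantics, for example:
+//
+//   CountLeadingZeros_32(0x00F000FF) == 8
+//   CountTrailingZeros_64(0x8ULL)    == 3
+//   CountPopulation_64(0xFFULL)      == 8
+//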
+#ifndef LLVM_SUPPORT_MATHEXTRAS_H +#define LLVM_SUPPORT_MATHEXTRAS_H + +#ifdef _MSC_VER +#if _MSC_VER <= 1500 +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif +#else +#include +#endif + +#ifndef INLINE +#define INLINE inline +// Enable to debug ap_int/ap_fixed +// #define INLINE __attribute__((weak)) +#endif + +// NOTE: The following support functions use the _32/_64 extensions instead of +// type overloading so that signed and unsigned integers can be used without +// ambiguity. +namespace AESL_std { +template +DataType INLINE min(DataType a, DataType b) { + return (a >= b) ? b : a; +} + +template +DataType INLINE max(DataType a, DataType b) { + return (a >= b) ? a : b; +} +} // namespace AESL_std + +// TODO clean up included headers. +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ap_private_ops { +/// Hi_32 - This function returns the high 32 bits of a 64 bit value. +static INLINE uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); +} + +/// Lo_32 - This function returns the low 32 bits of a 64 bit value. +static INLINE uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); +} + +template +INLINE bool isNegative(const ap_private<_AP_W, false>& a) { + return false; +} + +template +INLINE bool isNegative(const ap_private<_AP_W, true>& a) { + enum { + APINT_BITS_PER_WORD = 64, + _AP_N = (_AP_W + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD + }; + static const uint64_t sign_mask = 1ULL << ((_AP_W - 1) % APINT_BITS_PER_WORD); + return (sign_mask & a.get_pVal(_AP_N - 1)) != 0; +} + +/// CountLeadingZeros_32 - this function performs the platform optimal form of +/// counting the number of zeros from the most significant bit to the first one +/// bit. Ex. CountLeadingZeros_32(0x00F000FF) == 8. +/// Returns 32 if the word is zero. +static INLINE unsigned CountLeadingZeros_32(uint32_t Value) { + unsigned Count; // result +#if __GNUC__ >= 4 +// PowerPC is defined for __builtin_clz(0) +#if !defined(__ppc__) && !defined(__ppc64__) + if (Value == 0) return 32; +#endif + Count = __builtin_clz(Value); +#else + if (Value == 0) return 32; + Count = 0; + // bisecton method for count leading zeros + for (unsigned Shift = 32 >> 1; Shift; Shift >>= 1) { + uint32_t Tmp = (Value) >> (Shift); + if (Tmp) { + Value = Tmp; + } else { + Count |= Shift; + } + } +#endif + return Count; +} + +/// CountLeadingZeros_64 - This function performs the platform optimal form +/// of counting the number of zeros from the most significant bit to the first +/// one bit (64 bit edition.) +/// Returns 64 if the word is zero. 
+static INLINE unsigned CountLeadingZeros_64(uint64_t Value) {
+  unsigned Count; // result
+#if __GNUC__ >= 4
+// PowerPC is defined for __builtin_clzll(0)
+#if !defined(__ppc__) && !defined(__ppc64__)
+  if (!Value) return 64;
+#endif
+  Count = __builtin_clzll(Value);
+#else
+  if (sizeof(long) == sizeof(int64_t)) {
+    if (!Value) return 64;
+    Count = 0;
+    // bisection method for counting leading zeros
+    for (unsigned Shift = 64 >> 1; Shift; Shift >>= 1) {
+      uint64_t Tmp = (Value) >> (Shift);
+      if (Tmp) {
+        Value = Tmp;
+      } else {
+        Count |= Shift;
+      }
+    }
+  } else {
+    // get hi portion
+    uint32_t Hi = Hi_32(Value);
+
+    // if some bits in hi portion
+    if (Hi) {
+      // leading zeros in hi portion plus all bits in lo portion
+      Count = CountLeadingZeros_32(Hi);
+    } else {
+      // get lo portion
+      uint32_t Lo = Lo_32(Value);
+      // same as 32 bit value
+      Count = CountLeadingZeros_32(Lo) + 32;
+    }
+  }
+#endif
+  return Count;
+}
+
+/// CountTrailingZeros_64 - This function performs the platform optimal form
+/// of counting the number of zeros from the least significant bit to the first
+/// one bit (64 bit edition.)
+/// Returns 64 if the word is zero.
+static INLINE unsigned CountTrailingZeros_64(uint64_t Value) {
+#if __GNUC__ >= 4
+  return (Value != 0) ? __builtin_ctzll(Value) : 64;
+#else
+  static const unsigned Mod67Position[] = {
+      64, 0,  1,  39, 2,  15, 40, 23, 3,  12, 16, 59, 41, 19, 24, 54, 4,
+      64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5,  32,
+      65, 38, 14, 22, 11, 58, 18, 53, 63, 9,  61, 27, 29, 50, 43, 46, 31,
+      37, 21, 57, 52, 8,  26, 49, 45, 36, 56, 7,  48, 35, 6,  34, 33, 0};
+  return Mod67Position[(uint64_t)(-(int64_t)Value & (int64_t)Value) % 67];
+#endif
+}
+
+/// CountPopulation_64 - this function counts the number of set bits in a value
+/// (64 bit edition.)
+static INLINE unsigned CountPopulation_64(uint64_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value - (((Value) >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + (((v) >> 2) & 0x3333333333333333ULL); + v = (v + ((v) >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif +} + +static INLINE uint32_t countLeadingOnes_64(uint64_t __V, uint32_t skip) { + uint32_t Count = 0; + if (skip) (__V) <<= (skip); + while (__V && (__V & (1ULL << 63))) { + Count++; + (__V) <<= 1; + } + return Count; +} + +static INLINE std::string oct2Bin(char oct) { + switch (oct) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "000"; + } + case '1': { + return "001"; + } + case '2': { + return "010"; + } + case '3': { + return "011"; + } + case '4': { + return "100"; + } + case '5': { + return "101"; + } + case '6': { + return "110"; + } + case '7': { + return "111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE std::string hex2Bin(char hex) { + switch (hex) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "0000"; + } + case '1': { + return "0001"; + } + case '2': { + return "0010"; + } + case '3': { + return "0011"; + } + case '4': { + return "0100"; + } + case '5': { + return "0101"; + } + case '6': { + return "0110"; + } + case '7': { + return "0111"; + } + case '8': { + return "1000"; + } + case '9': { + return "1001"; + } + case 'A': + case 'a': { + return "1010"; + } + case 'B': + case 'b': { + return "1011"; + } + case 'C': + case 'c': { + return "1100"; + } + case 'D': + case 'd': { + return "1101"; + } + case 'E': + case 'e': { + return "1110"; + } + case 'F': + case 'f': { + return "1111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE uint32_t decode_digit(char cdigit, int radix) { + uint32_t digit = 0; + if (radix == 16) { +#define isxdigit(c) \ + (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); + if (isdigit(cdigit)) + digit = cdigit - '0'; + else if (cdigit >= 'a') + digit = cdigit - 'a' + 10; + else if (cdigit >= 'A') + digit = cdigit - 'A' + 10; + else + assert(0 && "huh? we shouldn't get here"); + } else if (isdigit(cdigit)) { + digit = cdigit - '0'; + } else { + assert(0 && "Invalid character in digit string"); + } +#undef isxdigit +#undef isdigit + return digit; +} + +// Determine the radix of "val". +static INLINE std::string parseString(const std::string& input, unsigned char& radix) { + size_t len = input.length(); + if (len == 0) { + if (radix == 0) radix = 10; + return input; + } + + size_t startPos = 0; + // Trim whitespace + while (input[startPos] == ' ' && startPos < len) startPos++; + while (input[len - 1] == ' ' && startPos < len) len--; + + std::string val = input.substr(startPos, len - startPos); + // std::cout << "val = " << val << "\n"; + len = val.length(); + startPos = 0; + + // If the length of the string is less than 2, then radix + // is decimal and there is no exponent. 
+ if (len < 2) { + if (radix == 0) radix = 10; + return val; + } + + bool isNegative = false; + std::string ans; + + // First check to see if we start with a sign indicator + if (val[0] == '-') { + ans = "-"; + ++startPos; + isNegative = true; + } else if (val[0] == '+') + ++startPos; + + if (len - startPos < 2) { + if (radix == 0) radix = 10; + return val; + } + + if (val.substr(startPos, 2) == "0x" || val.substr(startPos, 2) == "0X") { + // If we start with "0x", then the radix is hex. + radix = 16; + startPos += 2; + } else if (val.substr(startPos, 2) == "0b" || + val.substr(startPos, 2) == "0B") { + // If we start with "0b", then the radix is binary. + radix = 2; + startPos += 2; + } else if (val.substr(startPos, 2) == "0o" || + val.substr(startPos, 2) == "0O") { + // If we start with "0o", then the radix is octal. + radix = 8; + startPos += 2; + } else if (radix == 0) { + radix = 10; + } + + int exp = 0; + if (radix == 10) { + // If radix is decimal, then see if there is an + // exponent indicator. + size_t expPos = val.find('e'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('E'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + ans += val.substr(startPos, expPos - startPos); + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } else { + // Check for a binary exponent indicator. + size_t expPos = val.find('p'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('P'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + assert(startPos <= expPos); + // Convert to binary as we go. + for (size_t i = startPos; i < expPos; ++i) { + if (radix == 16) { + ans += hex2Bin(val[i]); + } else if (radix == 8) { + ans += oct2Bin(val[i]); + } else { // radix == 2 + ans += val[i]; + } + } + // End in binary + radix = 2; + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } + if (exp == 0) return ans; + + size_t decPos = ans.find('.'); + if (decPos == std::string::npos) decPos = ans.length(); + if ((int)decPos + exp >= (int)ans.length()) { + int i = decPos; + for (; i < (int)ans.length() - 1; ++i) ans[i] = ans[i + 1]; + for (; i < (int)ans.length(); ++i) ans[i] = '0'; + for (; i < (int)decPos + exp; ++i) ans += '0'; + return ans; + } else if ((int)decPos + exp < (int)isNegative) { + std::string dupAns = "0."; + if (ans[0] == '-') dupAns = "-0."; + for (int i = 0; i < isNegative - (int)decPos - exp; ++i) dupAns += '0'; + for (size_t i = isNegative; i < ans.length(); ++i) + if (ans[i] != '.') dupAns += ans[i]; + return dupAns; + } + + if (exp > 0) + for (size_t i = decPos; i < decPos + exp; ++i) ans[i] = ans[i + 1]; + else { + if (decPos == ans.length()) ans += ' '; + for (int i = decPos; i > (int)decPos + exp; --i) ans[i] = ans[i - 1]; + } + ans[decPos + exp] = '.'; + return ans; +} + +/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from +/// the multi-digit integer array, x[], propagating the borrowed 1 value until +/// no further borrowing is neeeded or it runs out of "digits" in x. 
The result +/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. +/// In other words, if y > x then this function returns 1, otherwise 0. +/// @returns the borrow out of the subtraction +static INLINE bool sub_1(uint64_t x[], uint32_t len, uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + uint64_t __X = x[i]; + x[i] -= y; + if (y > __X) + y = 1; // We have to "borrow 1" from next "digit" + else { + y = 0; // No need to borrow + break; // Remaining digits are unchanged so exit early + } + } + return (y != 0); +} + +/// add_1 - This function adds a single "digit" integer, y, to the multiple +/// "digit" integer array, x[]. x[] is modified to reflect the addition and +/// 1 is returned if there is a carry out, otherwise 0 is returned. +/// @returns the carry of the addition. +static INLINE bool add_1(uint64_t dest[], uint64_t x[], uint32_t len, + uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + dest[i] = y + x[i]; + if (dest[i] < y) + y = 1; // Carry one to next digit. + else { + y = 0; // No need to carry so exit early + break; + } + } + return (y != 0); +} + +/// add - This function adds the integer array x to the integer array Y and +/// places the result in dest. +/// @returns the carry out from the addition +/// @brief General addition of 64-bit integer arrays +static INLINE bool add(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool carry = false; + uint32_t len = AESL_std::min(xlen, ylen); + uint32_t i; + for (i = 0; i < len && i < destlen; ++i) { + uint64_t limit = + AESL_std::min(x[i], y[i]); // must come first in case dest == x + dest[i] = x[i] + y[i] + carry; + carry = dest[i] < limit || (carry && dest[i] == limit); + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t limit = AESL_std::min(x[i], yext); + dest[i] = x[i] + yext + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t limit = AESL_std::min(xext, y[i]); + dest[i] = xext + y[i] + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } + return carry; +} + +/// @returns returns the borrow out. +/// @brief Generalized subtraction of 64-bit integer arrays. +static INLINE bool sub(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool borrow = false; + uint32_t i; + uint32_t len = AESL_std::min(xlen, ylen); + for (i = 0; i < len && i < destlen; ++i) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = y[i] > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - y[i]; + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = yext > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - yext; + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t x_tmp = borrow ? 
xext - 1 : xext; + borrow = y[i] > x_tmp || (borrow && xext == 0); + dest[i] = x_tmp - y[i]; + } + } + return borrow; +} + +/// Subtracts the RHS ap_private from this ap_private +/// @returns this, after subtraction +/// @brief Subtraction assignment operator. + +/// Multiplies an integer array, x by a a uint64_t integer and places the result +/// into dest. +/// @returns the carry out of the multiplication. +/// @brief Multiply a multi-digit ap_private by a single digit (64-bit) integer. +static INLINE uint64_t mul_1(uint64_t dest[], const uint64_t x[], uint32_t len, + uint64_t y) { + // Split y into high 32-bit part (hy) and low 32-bit part (ly) + uint64_t ly = y & 0xffffffffULL, hy = (y) >> 32; + uint64_t carry = 0; + static const uint64_t two_power_32 = 1ULL << 32; + // For each digit of x. + for (uint32_t i = 0; i < len; ++i) { + // Split x into high and low words + uint64_t lx = x[i] & 0xffffffffULL; + uint64_t hx = (x[i]) >> 32; + // hasCarry - A flag to indicate if there is a carry to the next digit. + // hasCarry == 0, no carry + // hasCarry == 1, has carry + // hasCarry == 2, no carry and the calculation result == 0. + uint8_t hasCarry = 0; + dest[i] = carry + lx * ly; + // Determine if the add above introduces carry. + hasCarry = (dest[i] < carry) ? 1 : 0; + carry = hx * ly + ((dest[i]) >> 32) + (hasCarry ? two_power_32 : 0); + // The upper limit of carry can be (2^32 - 1)(2^32 - 1) + + // (2^32 - 1) + 2^32 = 2^64. + hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); + + carry += (lx * hy) & 0xffffffffULL; + dest[i] = ((carry) << 32) | (dest[i] & 0xffffffffULL); + carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? two_power_32 : 0) + + ((carry) >> 32) + ((lx * hy) >> 32) + hx * hy; + } + return carry; +} + +/// Multiplies integer array x by integer array y and stores the result into +/// the integer array dest. Note that dest's size must be >= xlen + ylen in +/// order to +/// do a full precision computation. If it is not, then only the low-order words +/// are returned. +/// @brief Generalized multiplicate of integer arrays. +static INLINE void mul(uint64_t dest[], const uint64_t x[], uint32_t xlen, + const uint64_t y[], uint32_t ylen, uint32_t destlen) { + assert(xlen > 0); + assert(ylen > 0); + assert(destlen >= xlen + ylen); + if (xlen < destlen) dest[xlen] = mul_1(dest, x, xlen, y[0]); + for (uint32_t i = 1; i < ylen; ++i) { + uint64_t ly = y[i] & 0xffffffffULL, hy = (y[i]) >> 32; + uint64_t carry = 0, lx = 0, hx = 0; + for (uint32_t j = 0; j < xlen; ++j) { + lx = x[j] & 0xffffffffULL; + hx = (x[j]) >> 32; + // hasCarry - A flag to indicate if has carry. + // hasCarry == 0, no carry + // hasCarry == 1, has carry + // hasCarry == 2, no carry and the calculation result == 0. + uint8_t hasCarry = 0; + uint64_t resul = carry + lx * ly; + hasCarry = (resul < carry) ? 1 : 0; + carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + ((resul) >> 32); + hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); + carry += (lx * hy) & 0xffffffffULL; + resul = ((carry) << 32) | (resul & 0xffffffffULL); + if (i + j < destlen) dest[i + j] += resul; + carry = + (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) + + ((carry) >> 32) + (dest[i + j] < resul ? 1 : 0) + ((lx * hy) >> 32) + + hx * hy; + } + if (i + xlen < destlen) dest[i + xlen] = carry; + } +} + +/// Implementation of Knuth's Algorithm D (Division of nonnegative integers) +/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The +/// variables here have the same names as in the algorithm. 
Comments explain +/// the algorithm and any deviation from it. +static INLINE void KnuthDiv(uint32_t* u, uint32_t* v, uint32_t* q, uint32_t* r, + uint32_t m, uint32_t n) { + assert(u && "Must provide dividend"); + assert(v && "Must provide divisor"); + assert(q && "Must provide quotient"); + assert(u != v && u != q && v != q && "Must us different memory"); + assert(n > 1 && "n must be > 1"); + + // Knuth uses the value b as the base of the number system. In our case b + // is 2^31 so we just set it to -1u. + uint64_t b = uint64_t(1) << 32; + + // DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n'); + // DEBUG(cerr << "KnuthDiv: original:"); + // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << + // u[i]); + // DEBUG(cerr << " by"); + // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << + // v[i-1]); + // DEBUG(cerr << '\n'); + // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of + // u and v by d. Note that we have taken Knuth's advice here to use a power + // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of + // 2 allows us to shift instead of multiply and it is easy to determine the + // shift amount from the leading zeros. We are basically normalizing the u + // and v so that its high bits are shifted to the top of v's range without + // overflow. Note that this can require an extra word in u so that u must + // be of length m+n+1. + uint32_t shift = CountLeadingZeros_32(v[n - 1]); + uint32_t v_carry = 0; + uint32_t u_carry = 0; + if (shift) { + for (uint32_t i = 0; i < m + n; ++i) { + uint32_t u_tmp = (u[i]) >> (32 - shift); + u[i] = ((u[i]) << (shift)) | u_carry; + u_carry = u_tmp; + } + for (uint32_t i = 0; i < n; ++i) { + uint32_t v_tmp = (v[i]) >> (32 - shift); + v[i] = ((v[i]) << (shift)) | v_carry; + v_carry = v_tmp; + } + } + u[m + n] = u_carry; + // DEBUG(cerr << "KnuthDiv: normal:"); + // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << + // u[i]); + // DEBUG(cerr << " by"); + // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << + // v[i-1]); + // DEBUG(cerr << '\n'); + + // D2. [Initialize j.] Set j to m. This is the loop counter over the places. + int j = m; + do { + // DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n'); + // D3. [Calculate q'.]. + // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') + // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') + // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease + // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test + // on v[n-2] determines at high speed most of the cases in which the trial + // value qp is one too large, and it eliminates all cases where qp is two + // too large. + uint64_t dividend = ((uint64_t(u[j + n]) << 32) + u[j + n - 1]); + // DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n'); + uint64_t qp = dividend / v[n - 1]; + uint64_t rp = dividend % v[n - 1]; + if (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2]) { + qp--; + rp += v[n - 1]; + if (rp < b && (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2])) qp--; + } + // DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n'); + + // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with + // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation + // consists of a simple multiplication by a one-place number, combined with + // a subtraction. 
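    // Illustrative sketch (not in the original source): a base-10 analogue of
    // the D3 estimate. Dividing u = 178 by v = 29 (b = 10, v[1] = 2, v[0] = 9,
    // u[2] = 1, u[1] = 7, u[0] = 8, n = 2, j = 0):
    //   qp = (1*10 + 7) / 2 = 8,  rp = 17 % 2 = 1
    //   test:   qp*v[0] = 72 > b*rp + u[0] = 18  ->  qp = 7, rp = 3
    //   retest: 7*9 = 63 > 10*3 + 8 = 38         ->  qp = 6
    // which is the true digit (178 = 29*6 + 4): the v[n-2] test repairs an
    // estimate that started out two too large.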
+ bool isNeg = false; + for (uint32_t i = 0; i < n; ++i) { + uint64_t u_tmp = uint64_t(u[j + i]) | ((uint64_t(u[j + i + 1])) << 32); + uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]); + bool borrow = subtrahend > u_tmp; + /*DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp + << ", subtrahend == " << subtrahend + << ", borrow = " << borrow << '\n');*/ + + uint64_t result = u_tmp - subtrahend; + uint32_t k = j + i; + u[k++] = (uint32_t)(result & (b - 1)); // subtract low word + u[k++] = (uint32_t)((result) >> 32); // subtract high word + while (borrow && k <= m + n) { // deal with borrow to the left + borrow = u[k] == 0; + u[k]--; + k++; + } + isNeg |= borrow; + /*DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " << + u[j+i+1] << '\n');*/ + } + /*DEBUG(cerr << "KnuthDiv: after subtraction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + // The digits (u[j+n]...u[j]) should be kept positive; if the result of + // this step is actually negative, (u[j+n]...u[j]) should be left as the + // true value plus b**(n+1), namely as the b's complement of + // the true value, and a "borrow" to the left should be remembered. + // + if (isNeg) { + bool carry = true; // true because b's complement is "complement + 1" + for (uint32_t i = 0; i <= m + n; ++i) { + u[i] = ~u[i] + carry; // b's complement + carry = carry && u[i] == 0; + } + } + /*DEBUG(cerr << "KnuthDiv: after complement:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + + // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was + // negative, go to step D6; otherwise go on to step D7. + q[j] = (uint32_t)qp; + if (isNeg) { + // D6. [Add back]. The probability that this step is necessary is very + // small, on the order of only 2/b. Make sure that test data accounts for + // this possibility. Decrease q[j] by 1 + q[j]--; + // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]). + // A carry will occur to the left of u[j+n], and it should be ignored + // since it cancels with the borrow that occurred in D4. + bool carry = false; + for (uint32_t i = 0; i < n; i++) { + uint32_t limit = AESL_std::min(u[j + i], v[i]); + u[j + i] += v[i] + carry; + carry = u[j + i] < limit || (carry && u[j + i] == limit); + } + u[j + n] += carry; + } + /*DEBUG(cerr << "KnuthDiv: after correction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]); + DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');*/ + + // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3. + } while (--j >= 0); + + /*DEBUG(cerr << "KnuthDiv: quotient:"); + DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]); + DEBUG(cerr << '\n');*/ + + // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired + // remainder may be obtained by dividing u[...] by d. If r is non-null we + // compute the remainder (urem uses this). + if (r) { + // The value d is expressed by the "shift" value above since we avoided + // multiplication by d by using a shift left. So, all we have to do is + // shift right here. 
In order to make the remainder exact, we undo the normalization shift here.
+    if (shift) {
+      uint32_t carry = 0;
+      // DEBUG(cerr << "KnuthDiv: remainder:");
+      for (int i = n - 1; i >= 0; i--) {
+        r[i] = ((u[i]) >> (shift)) | carry;
+        carry = (u[i]) << (32 - shift);
+        // DEBUG(cerr << " " << r[i]);
+      }
+    } else {
+      for (int i = n - 1; i >= 0; i--) {
+        r[i] = u[i];
+        // DEBUG(cerr << " " << r[i]);
+      }
+    }
+    // DEBUG(cerr << '\n');
+  }
+  // DEBUG(cerr << std::setbase(10) << '\n');
+}
+
+template <int _AP_W, bool _AP_S>
+void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords,
+            const ap_private<_AP_W, _AP_S>& RHS, uint32_t rhsWords,
+            ap_private<_AP_W, _AP_S>* Quotient,
+            ap_private<_AP_W, _AP_S>* Remainder) {
+  assert(lhsWords >= rhsWords && "Fractional result");
+  enum { APINT_BITS_PER_WORD = 64 };
+  // First, compose the values into an array of 32-bit words instead of
+  // 64-bit words. This is a necessity of both the "short division" algorithm
+  // and the Knuth "classical algorithm", which requires there to be native
+  // operations for +, -, and * on an m bit value with an m*2 bit result. We
+  // can't use 64-bit operands here because we don't have native results of
+  // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
+  // work on big-endian machines.
+  uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8);
+  uint32_t n = rhsWords * 2;
+  uint32_t m = (lhsWords * 2) - n;
+
+  // Allocate space for the temporary values we need either on the stack, if
+  // it will fit, or on the heap if it won't.
+  uint32_t SPACE[128];
+  uint32_t* __U = 0;
+  uint32_t* __V = 0;
+  uint32_t* __Q = 0;
+  uint32_t* __R = 0;
+  if ((Remainder ? 4 : 3) * n + 2 * m + 1 <= 128) {
+    __U = &SPACE[0];
+    __V = &SPACE[m + n + 1];
+    __Q = &SPACE[(m + n + 1) + n];
+    if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)];
+  } else {
+    __U = new uint32_t[m + n + 1];
+    __V = new uint32_t[n];
+    __Q = new uint32_t[m + n];
+    if (Remainder) __R = new uint32_t[n];
+  }
+
+  // Initialize the dividend
+  memset(__U, 0, (m + n + 1) * sizeof(uint32_t));
+  for (unsigned i = 0; i < lhsWords; ++i) {
+    uint64_t tmp = LHS.get_pVal(i);
+    __U[i * 2] = (uint32_t)(tmp & mask);
+    __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
+  }
+  __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm.
+
+  // Initialize the divisor
+  memset(__V, 0, (n) * sizeof(uint32_t));
+  for (unsigned i = 0; i < rhsWords; ++i) {
+    uint64_t tmp = RHS.get_pVal(i);
+    __V[i * 2] = (uint32_t)(tmp & mask);
+    __V[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
+  }
+
+  // initialize the quotient and remainder
+  memset(__Q, 0, (m + n) * sizeof(uint32_t));
+  if (Remainder) memset(__R, 0, n * sizeof(uint32_t));
+
+  // Now, adjust m and n for the Knuth division. n is the number of words in
+  // the divisor. m is the number of words by which the dividend exceeds the
+  // divisor (i.e. m+n is the length of the dividend). These sizes must not
+  // contain any zero words or the Knuth algorithm fails.
+  for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) {
+    n--;
+    m++;
+  }
+  for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--;
+
+  // If we're left with only a single word for the divisor, Knuth doesn't work
+  // so we implement the short division algorithm here. This is much simpler
+  // and faster because we are certain that we can divide a 64-bit quantity
+  // by a 32-bit quantity at hardware speed and short division is simply a
+  // series of such operations. This is just like doing short division but we
+  // are using base 2^32 instead of base 10.
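  // Illustrative sketch (not in the original source): base-2^32 short division
  // as implemented below, dividing the two-word value 2^32 + 5
  // (__U[1] = 1, __U[0] = 5) by divisor = 3:
  //   i = 1: partial_dividend = 1                   -> __Q[1] = 0, remainder = 1
  //   i = 0: partial_dividend = (1 << 32) | 5 = 4294967301
  //                                                 -> __Q[0] = 0x55555557, remainder = 0
  // exactly like decimal long division, one 32-bit "digit" at a time.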
+ assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = (uint32_t)partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = (uint32_t)(partial_dividend / divisor); + remainder = (uint32_t)(partial_dividend - (__Q[i] * divisor)); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != RHS.BitWidth) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in R. Reconstitute the remainder into Remainder's low + // order words. + if (rhsWords == 1) { + uint64_t tmp = + uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); + Remainder->set_VAL(tmp); + } else { + assert(!Remainder->isSingleWord() && + "Remainder ap_private not large enough"); + for (unsigned i = 0; i < rhsWords; ++i) + Remainder->set_pVal( + i, uint64_t(__R[i * 2]) | + ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Remainder->clearUnusedBits(); + } + + // Clean up the memory we allocated. + if (__U != &SPACE[0]) { + delete[] __U; + delete[] __V; + delete[] __Q; + delete[] __R; + } +} + +template +void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, + uint64_t RHS, ap_private<_AP_W, _AP_S>* Quotient, + ap_private<_AP_W, _AP_S>* Remainder) { + uint32_t rhsWords = 1; + assert(lhsWords >= rhsWords && "Fractional result"); + enum { APINT_BITS_PER_WORD = 64 }; + // First, compose the values into an array of 32-bit words instead of + // 64-bit words. This is a necessity of both the "short division" algorithm + // and the the Knuth "classical algorithm" which requires there to be native + // operations for +, -, and * on an m bit value with an m*2 bit result. We + // can't use 64-bit operands here because we don't have native results of + // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't + // work on large-endian machines. + uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); + uint32_t n = 2; + uint32_t m = (lhsWords * 2) - n; + + // Allocate space for the temporary values we need either on the stack, if + // it will fit, or on the heap if it won't. + uint32_t SPACE[128]; + uint32_t* __U = 0; + uint32_t* __V = 0; + uint32_t* __Q = 0; + uint32_t* __R = 0; + if ((Remainder ? 
4 : 3) * n + 2 * m + 1 <= 128) { + __U = &SPACE[0]; + __V = &SPACE[m + n + 1]; + __Q = &SPACE[(m + n + 1) + n]; + if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; + } else { + __U = new uint32_t[m + n + 1]; + __V = new uint32_t[n]; + __Q = new uint32_t[m + n]; + if (Remainder) __R = new uint32_t[n]; + } + + // Initialize the dividend + memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); + for (unsigned i = 0; i < lhsWords; ++i) { + uint64_t tmp = LHS.get_pVal(i); + __U[i * 2] = tmp & mask; + __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); + } + __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. + + // Initialize the divisor + memset(__V, 0, (n) * sizeof(uint32_t)); + __V[0] = RHS & mask; + __V[1] = (RHS) >> (sizeof(uint32_t) * 8); + + // initialize the quotient and remainder + memset(__Q, 0, (m + n) * sizeof(uint32_t)); + if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); + + // Now, adjust m and n for the Knuth division. n is the number of words in + // the divisor. m is the number of words by which the dividend exceeds the + // divisor (i.e. m+n is the length of the dividend). These sizes must not + // contain any zero words or the Knuth algorithm fails. + for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { + n--; + m++; + } + for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; + + // If we're left with only a single word for the divisor, Knuth doesn't work + // so we implement the short division algorithm here. This is much simpler + // and faster because we are certain that we can divide a 64-bit quantity + // by a 32-bit quantity at hardware speed and short division is simply a + // series of such operations. This is just like doing short division but we + // are using base 2^32 instead of base 10. + assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = partial_dividend / divisor; + remainder = partial_dividend - (__Q[i] * divisor); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != 64 /* RHS.BitWidth */) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in __R. 
Reconstitute the remainder into Remainder's low
+    // order words.
+    if (rhsWords == 1) {
+      uint64_t tmp =
+          uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2));
+      Remainder->set_VAL(tmp);
+    } else {
+      assert(!Remainder->isSingleWord() &&
+             "Remainder ap_private not large enough");
+      for (unsigned i = 0; i < rhsWords; ++i)
+        Remainder->set_pVal(
+            i, uint64_t(__R[i * 2]) |
+                   ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2)));
+    }
+    Remainder->clearUnusedBits();
+  }
+
+  // Clean up the memory we allocated.
+  if (__U != &SPACE[0]) {
+    delete[] __U;
+    delete[] __V;
+    delete[] __Q;
+    delete[] __R;
+  }
+}
+
+/// @brief Logical right-shift function.
+template <int _AP_W, bool _AP_S, bool _AP_C>
+INLINE ap_private<_AP_W, _AP_S, _AP_C> lshr(
+    const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) {
+  return LHS.lshr(shiftAmt);
+}
+
+/// Left-shift the ap_private by shiftAmt.
+/// @brief Left-shift function.
+template <int _AP_W, bool _AP_S, bool _AP_C>
+INLINE ap_private<_AP_W, _AP_S, _AP_C> shl(
+    const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) {
+  return LHS.shl(shiftAmt);
+}
+
+} // namespace ap_private_ops
+
+#endif // LLVM_SUPPORT_MATHEXTRAS_H
+
+/// This enumeration just provides for internal constants used in this
+/// translation unit.
+enum {
+  MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified
+  ///< Note that this must remain synchronized with IntegerType::MIN_INT_BITS
+  MAX_INT_BITS = (1 << 23) - 1 ///< Maximum number of bits that can be specified
+  ///< Note that this must remain synchronized with IntegerType::MAX_INT_BITS
+};
+
+//===----------------------------------------------------------------------===//
+// ap_private Class
+//===----------------------------------------------------------------------===//
+
+/// ap_private - This class represents arbitrary precision constant integral
+/// values. It is a functional replacement for common case unsigned integer
+/// types like "unsigned", "unsigned long" or "uint64_t", but also allows
+/// non-byte-width integer sizes and large integer value types such as 3-bit,
+/// 15-bit, or more than 64 bits of precision. ap_private provides a variety
+/// of arithmetic operators and methods to manipulate integer values of any
+/// bit-width. It supports both the typical integer arithmetic and comparison
+/// operations as well as bitwise manipulation.
+///
+/// The class has several invariants worth noting:
+///   * All bit, byte, and word positions are zero-based.
+///   * Once the bit width is set, it doesn't change except by the Truncate,
+///     SignExtend, or ZeroExtend operations.
+///   * All binary operators must be on ap_private instances of the same bit
+///     width. Attempting to use these operators on instances with different
+///     bit widths will yield an assertion.
+///   * The value is stored canonically as an unsigned value. For operations
+///     where it makes a difference, there are both signed and unsigned variants
+///     of the operation. For example, sdiv and udiv. However, because the bit
+///     widths must be the same, operations such as Mul and Add produce the same
+///     results regardless of whether the values are interpreted as signed or
+///     not.
+///   * In general, the class tries to follow the style of computation that LLVM
+///     uses in its IR. This simplifies its use for LLVM.
+///
+/// @brief Class for arbitrary precision integers.
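// Illustrative sketch (not in the original source): the "stored canonically
// as unsigned" invariant above means one bit pattern serves both signednesses,
// and only operations like division need distinct variants:
//
//   ap_private<4, false> u = 12;          // bit pattern 0b1100
//   ap_private<4, true>  s = -4;          // the same bit pattern, signed view
//   u.udiv(ap_private<4, false>(2));      // 12 / 2 = 6   (0b0110)
//   s.sdiv(ap_private<4, true>(2));       // -4 / 2 = -2  (0b1110)
//
// whereas Add and Mul at equal widths yield the same bits either way.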
+
+#if defined(_MSC_VER)
+#if _MSC_VER < 1400 && !defined(for)
+#define for if (0); else for
+#endif
+typedef unsigned __int64 ap_ulong;
+typedef signed __int64 ap_slong;
+#else
+typedef unsigned long long ap_ulong;
+typedef signed long long ap_slong;
+#endif
+template <int _AP_N8, bool _AP_S>
+struct valtype;
+
+template <int _AP_N8>
+struct valtype<_AP_N8, false> {
+  typedef uint64_t Type;
+};
+
+template <int _AP_N8>
+struct valtype<_AP_N8, true> {
+  typedef int64_t Type;
+};
+
+template <>
+struct valtype<1, false> {
+  typedef unsigned char Type;
+};
+template <>
+struct valtype<2, false> {
+  typedef unsigned short Type;
+};
+template <>
+struct valtype<3, false> {
+  typedef unsigned int Type;
+};
+template <>
+struct valtype<4, false> {
+  typedef unsigned int Type;
+};
+template <>
+struct valtype<1, true> {
+  typedef signed char Type;
+};
+template <>
+struct valtype<2, true> {
+  typedef short Type;
+};
+template <>
+struct valtype<3, true> {
+  typedef int Type;
+};
+template <>
+struct valtype<4, true> {
+  typedef int Type;
+};
+
+template <bool C>
+struct ap_private_enable_if {};
+template <>
+struct ap_private_enable_if<true> {
+  static const bool isValid = true;
+};
+
+// When bitwidth < 64
+template <int _AP_W, bool _AP_S>
+class ap_private<_AP_W, _AP_S, true> {
+  // SFINAE pattern. Only consider this class when _AP_W <= 64
+  const static bool valid = ap_private_enable_if<_AP_W <= 64>::isValid;
+
+#ifdef _MSC_VER
+#pragma warning(disable : 4521 4522)
+#endif
+ public:
+  typedef typename valtype<(_AP_W + 7) / 8, _AP_S>::Type ValType;
+  typedef ap_private<_AP_W, _AP_S> Type;
+  template <int _AP_W2, bool _AP_S2>
+  struct RType {
+    enum {
+      mult_w = _AP_W + _AP_W2,
+      mult_s = _AP_S || _AP_S2,
+      plus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      plus_s = _AP_S || _AP_S2,
+      minus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      minus_s = true,
+      div_w = _AP_W + _AP_S2,
+      div_s = _AP_S || _AP_S2,
+      mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)),
+      mod_s = _AP_S,
+      logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)),
+      logic_s = _AP_S || _AP_S2
+    };
+    typedef ap_private<mult_w, mult_s> mult;
+    typedef ap_private<plus_w, plus_s> plus;
+    typedef ap_private<minus_w, minus_s> minus;
+    typedef ap_private<logic_w, logic_s> logic;
+    typedef ap_private<div_w, div_s> div;
+    typedef ap_private<mod_w, mod_s> mod;
+    typedef ap_private<_AP_W, _AP_S> arg1;
+    typedef bool reduce;
+  };
+  enum { APINT_BITS_PER_WORD = sizeof(uint64_t) * 8 };
+  enum {
+    excess_bits = (_AP_W % APINT_BITS_PER_WORD)
+                      ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD)
+                      : 0
+  };
+  static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits));
+  static const uint64_t not_mask = ~mask;
+  static const uint64_t sign_bit_mask = 1ULL << (APINT_BITS_PER_WORD - 1);
+  template <int _AP_W1>
+  struct sign_ext_mask {
+    static const uint64_t mask = ~0ULL << _AP_W1;
+  };
+  static const int width = _AP_W;
+
+  enum {
+    BitWidth = _AP_W,
+    _AP_N = 1,
+  };
+  ValType VAL; ///< Used to store the <= 64-bit integer value.
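  // Illustrative sketch (not in the original source): for _AP_W = 20 the
  // constants above give excess_bits = 44, mask = 0xFFFFF (the low 20 bits)
  // and not_mask = ~0xFFFFF. clearUnusedBits() keeps VAL canonical in those
  // 20 bits: assigning 0x80000 to an ap_private<20, true> sign-extends, so it
  // reads back as -524288, while an ap_private<20, false> keeps 524288.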
+#ifdef AP_CANARY + ValType CANARY; + void check_canary() { assert(CANARY == (ValType)0xDEADBEEFDEADBEEF); } + void set_canary() { CANARY = (ValType)0xDEADBEEFDEADBEEF; } +#else + void check_canary() {} + void set_canary() {} +#endif + + INLINE ValType& get_VAL(void) { return VAL; } + INLINE ValType get_VAL(void) const { return VAL; } + INLINE ValType get_VAL(void) const volatile { return VAL; } + INLINE void set_VAL(uint64_t value) { VAL = (ValType)value; } + INLINE ValType& get_pVal(int i) { return VAL; } + INLINE ValType get_pVal(int i) const { return VAL; } + INLINE const uint64_t* get_pVal() const { + assert(0 && "invalid usage"); + return 0; + } + INLINE ValType get_pVal(int i) const volatile { return VAL; } + INLINE uint64_t* get_pVal() const volatile { + assert(0 && "invalid usage"); + return 0; + } + INLINE void set_pVal(int i, uint64_t value) { VAL = (ValType)value; } + + INLINE uint32_t getBitWidth() const { return BitWidth; } + + template + ap_private<_AP_W, _AP_S>& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + ap_private<_AP_W, _AP_S>& operator=( + const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); // TODO check here about ap_private + clearUnusedBits(); + return *this; + } + + void operator=(const ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + void operator=(const volatile ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const volatile ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#define ASSIGN_OP_FROM_INT(C_TYPE) \ + INLINE ap_private& operator=(const C_TYPE v) { \ + set_canary(); \ + this->VAL = (ValType)v; \ + clearUnusedBits(); \ + check_canary(); \ + return *this; \ + } + +ASSIGN_OP_FROM_INT(bool) +ASSIGN_OP_FROM_INT(char) +ASSIGN_OP_FROM_INT(signed char) +ASSIGN_OP_FROM_INT(unsigned char) +ASSIGN_OP_FROM_INT(short) +ASSIGN_OP_FROM_INT(unsigned short) +ASSIGN_OP_FROM_INT(int) +ASSIGN_OP_FROM_INT(unsigned int) +ASSIGN_OP_FROM_INT(long) +ASSIGN_OP_FROM_INT(unsigned long) +ASSIGN_OP_FROM_INT(ap_slong) +ASSIGN_OP_FROM_INT(ap_ulong) +#if 0 +ASSIGN_OP_FROM_INT(half) +ASSIGN_OP_FROM_INT(float) +ASSIGN_OP_FROM_INT(double) +#endif +#undef ASSIGN_OP_FROM_INT + + // XXX This is a must to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct-initialization, as ctor is explicit. 
+ operator=(tmp); + return *this; + } + + private: + explicit INLINE ap_private(uint64_t* val) : VAL(val[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + INLINE bool isSingleWord() const { return true; } + + public: + INLINE void fromString(const char* strStart, uint32_t slen, uint8_t radix) { + bool isNeg = strStart[0] == '-'; + if (isNeg) { + strStart++; + slen--; + } + + if (strStart[0] == '0' && (strStart[1] == 'b' || strStart[1] == 'B')) { + //if(radix == 0) radix = 2; + _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", strStart, 2, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'o' || strStart[1] == 'O')) { + //if (radix == 0) radix = 8; + _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", strStart, 8, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'x' || strStart[1] == 'X')) { + //if (radix == 0) radix = 16; + _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", strStart, 16, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'd' || strStart[1] == 'D')) { + //if (radix == 0) radix = 10; + _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", strStart, 10, radix); + strStart += 2; + slen -=2; + } else if (radix == 0) { + //radix = 2; // XXX default value + } + + // Check our assumptions here + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + assert(strStart && "String is null?"); + + // Clear bits. + uint64_t tmpVAL = VAL = 0; + + switch (radix) { + case 2: + // sscanf(strStart,"%b",&VAL); + // tmpVAL = *strStart =='1' ? ~0ULL : 0; + for (; *strStart; ++strStart) { + assert((*strStart == '0' || *strStart == '1') && + ("Wrong binary number")); + tmpVAL <<= 1; + tmpVAL |= (*strStart - '0'); + } + break; + case 8: +#ifdef _MSC_VER + sscanf_s(strStart, "%llo", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lo", &tmpVAL); +#else + sscanf(strStart, "%llo", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + case 10: +#ifdef _MSC_VER + sscanf_s(strStart, "%llu", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lu", &tmpVAL); +#else + sscanf(strStart, "%llu", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + case 16: +#ifdef _MSC_VER + sscanf_s(strStart, "%llx", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lx", &tmpVAL); +#else + sscanf(strStart, "%llx", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + default: + assert(true && "Unknown radix"); + // error + } + VAL = isNeg ? 
(ValType)(-tmpVAL) : (ValType)(tmpVAL); + + clearUnusedBits(); + } + + private: + INLINE ap_private(const std::string& val, uint8_t radix = 2) : VAL(0) { + assert(!val.empty() && "String empty?"); + set_canary(); + fromString(val.c_str(), val.size(), radix); + check_canary(); + } + + INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) + : VAL(0) { + set_canary(); + fromString(strStart, slen, radix); + check_canary(); + } + + INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) + : VAL(bigVal[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + public: + INLINE ap_private() { + set_canary(); + clearUnusedBits(); + check_canary(); + } + +#define CTOR(TYPE) \ + INLINE ap_private(TYPE v) : VAL((ValType)v) { \ + set_canary(); \ + clearUnusedBits(); \ + check_canary(); \ + } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if 0 + CTOR(half) + CTOR(float) + CTOR(double) +#endif +#undef CTOR + + template + INLINE ap_private(const ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + template + INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + explicit INLINE ap_private(const char* val) { + set_canary(); + unsigned char radix = 10; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ap_private(const char* val, signed char rd) { + set_canary(); + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ~ap_private() { check_canary(); } + + INLINE bool isNegative() const { + static const uint64_t sign_mask = 1ULL << (_AP_W - 1); + return _AP_S && (sign_mask & VAL); + } + + INLINE bool isPositive() const { return !isNegative(); } + + INLINE bool isStrictlyPositive() const { return !isNegative() && VAL != 0; } + + INLINE bool isAllOnesValue() const { return (mask & VAL) == mask; } + + INLINE bool operator==(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL == RHS.get_VAL(); + } + INLINE bool operator==(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL == (uint64_t)RHS.get_VAL(); + } + + INLINE bool operator==(uint64_t Val) const { return ((uint64_t)VAL == Val); } + INLINE bool operator!=(uint64_t Val) const { return ((uint64_t)VAL != Val); } + INLINE bool operator!=(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL != RHS.get_VAL(); + } + INLINE bool operator!=(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL != (uint64_t)RHS.get_VAL(); + } + + /// postfix increment. + const ap_private operator++(int) { + ap_private orig(*this); + VAL++; + clearUnusedBits(); + return orig; + } + + /// prefix increment. 
+ const ap_private operator++() { + ++VAL; + clearUnusedBits(); + return *this; + } + + /// postfix decrement. + const ap_private operator--(int) { + ap_private orig(*this); + --VAL; + clearUnusedBits(); + return orig; + } + + /// prefix decrement. + const ap_private operator--() { + --VAL; + clearUnusedBits(); + return *this; + } + + /// one's complement. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// two's complement. + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// logic negation. + INLINE bool operator!() const { return !VAL; } + + INLINE std::string toString(uint8_t radix, bool wantSigned) const; + INLINE std::string toStringUnsigned(uint8_t radix = 10) const { + return toString(radix, false); + } + INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + INLINE void clear() { VAL = 0; } + INLINE ap_private& clear(uint32_t bitPosition) { + VAL &= ~(1ULL << (bitPosition)); + clearUnusedBits(); + return *this; + } + + INLINE ap_private ashr(uint32_t shiftAmt) const { + if (_AP_S) + return ap_private((shiftAmt == BitWidth) ? 0 + : ((int64_t)VAL) >> (shiftAmt)); + else + return ap_private((shiftAmt == BitWidth) ? 0 + : ((uint64_t)VAL) >> (shiftAmt)); + } + + INLINE ap_private lshr(uint32_t shiftAmt) const { + return ap_private((shiftAmt == BitWidth) + ? ap_private(0) + : ap_private((VAL & mask) >> (shiftAmt))); + } + + INLINE ap_private shl(uint32_t shiftAmt) const +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + if (shiftAmt > BitWidth) { + if (!isNegative()) + return ap_private(0); + else + return ap_private(-1); + } + if (shiftAmt == BitWidth) + return ap_private(0); + else + return ap_private((VAL) << (shiftAmt)); + // return ap_private((shiftAmt == BitWidth) ? 
ap_private(0ULL) : + // ap_private(VAL << shiftAmt)); + } + + INLINE int64_t getSExtValue() const { return VAL; } + + // XXX XXX this function is used in CBE + INLINE uint64_t getZExtValue() const { return VAL & mask; } + + template + INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ref.get(); + check_canary(); + } + + template + INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ((uint64_t)(bool)ref); + check_canary(); + } + +// template +// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { +// set_canary(); +// *this = ref.get(); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = ((val.operator ap_private<_AP_W2, false>())); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = (uint64_t)(bool)val; +// check_canary(); +// } + + INLINE void write(const ap_private<_AP_W, _AP_S>& op2) volatile { + *this = (op2); + } + + // Explicit conversions to C interger types + //----------------------------------------------------------- + INLINE operator ValType() const { return get_VAL(); } + + INLINE int to_uchar() const { return (unsigned char)get_VAL(); } + + INLINE int to_char() const { return (signed char)get_VAL(); } + + INLINE int to_ushort() const { return (unsigned short)get_VAL(); } + + INLINE int to_short() const { return (short)get_VAL(); } + + INLINE int to_int() const { + // ap_private<64 /* _AP_W */, _AP_S> res(V); + return (int)get_VAL(); + } + + INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } + + INLINE long to_long() const { return (long)get_VAL(); } + + INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } + + INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } + + INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } + + INLINE double to_double() const { + if (isNegative()) + return roundToDouble(true); + else + return roundToDouble(false); + } + + INLINE unsigned length() const { return _AP_W; } + + INLINE bool isMinValue() const { return VAL == 0; } + template + INLINE ap_private& operator&=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) & RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator|=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) | RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator^=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) ^ RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) * RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) + RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) - RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator&( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + 
typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) & + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret & RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator^( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) ^ + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret ^ RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator|( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) | + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret | RHS; + } + } + + INLINE ap_private And(const ap_private& RHS) const { + return ap_private(VAL & RHS.get_VAL()); + } + + INLINE ap_private Or(const ap_private& RHS) const { + return ap_private(VAL | RHS.get_VAL()); + } + + INLINE ap_private Xor(const ap_private& RHS) const { + return ap_private(VAL ^ RHS.get_VAL()); + } +#if 1 + template + INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::mult_w <= 64) { + typename RType<_AP_W1, _AP_S1>::mult Result(((uint64_t)VAL) * + RHS.get_VAL()); + return Result; + } else { + typename RType<_AP_W1, _AP_S1>::mult Result(*this); + Result *= RHS; + return Result; + } + } +#endif + INLINE ap_private Mul(const ap_private& RHS) const { + return ap_private(VAL * RHS.get_VAL()); + } + + INLINE ap_private Add(const ap_private& RHS) const { + return ap_private(VAL + RHS.get_VAL()); + } + + INLINE ap_private Sub(const ap_private& RHS) const { + return ap_private(VAL - RHS.get_VAL()); + } + + INLINE ap_private& operator&=(uint64_t RHS) { + VAL &= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator|=(uint64_t RHS) { + VAL |= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator^=(uint64_t RHS) { + VAL ^= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator*=(uint64_t RHS) { + VAL *= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator+=(uint64_t RHS) { + VAL += (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator-=(uint64_t RHS) { + VAL -= (ValType)RHS; + clearUnusedBits(); + return *this; + } + + INLINE bool isMinSignedValue() const { + static const uint64_t min_mask = ~(~0ULL << (_AP_W - 1)); + return BitWidth == 1 ? VAL == 1 + : (ap_private_ops::isNegative<_AP_W>(*this) && + ((min_mask & VAL) == 0)); + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::plus_w <= 64) + return typename RType<_AP_W1, _AP_S1>::plus( + RType<_AP_W1, _AP_S1>::plus_s + ? 
+              : uint64_t(((uint64_t)VAL) + RHS.get_VAL()));
+    typename RType<_AP_W1, _AP_S1>::plus Result = RHS;
+    Result += VAL;
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    if (RType<_AP_W1, _AP_S1>::minus_w <= 64)
+      return typename RType<_AP_W1, _AP_S1>::minus(
+          int64_t(((uint64_t)VAL) - RHS.get_VAL()));
+    typename RType<_AP_W1, _AP_S1>::minus Result = *this;
+    Result -= RHS;
+    return Result;
+  }
+
+  INLINE uint32_t countPopulation() const {
+    return ap_private_ops::CountPopulation_64(VAL);
+  }
+  INLINE uint32_t countLeadingZeros() const {
+    int remainder = BitWidth % 64;
+    int excessBits = (64 - remainder) % 64;
+    uint32_t Count = ap_private_ops::CountLeadingZeros_64(VAL);
+    if (Count) Count -= excessBits;
+    return AESL_std::min(Count, (uint32_t)_AP_W);
+  }
+
+  /// HiBits - This function returns the high "numBits" bits of this ap_private.
+  INLINE ap_private<_AP_W, _AP_S> getHiBits(uint32_t numBits) const {
+    ap_private<_AP_W, _AP_S> ret(*this);
+    ret = (ret) >> (BitWidth - numBits);
+    return ret;
+  }
+
+  /// LoBits - This function returns the low "numBits" bits of this ap_private.
+  INLINE ap_private<_AP_W, _AP_S> getLoBits(uint32_t numBits) const {
+    ap_private<_AP_W, _AP_S> ret(((uint64_t)VAL) << (BitWidth - numBits));
+    ret = (ret) >> (BitWidth - numBits);
+    return ret;
+    // return ap_private(numBits, (VAL << (BitWidth - numBits)) >>
+    //                   (BitWidth - numBits));
+  }
+
+  INLINE ap_private<_AP_W, _AP_S>& set(uint32_t bitPosition) {
+    VAL |= (1ULL << (bitPosition));
+    clearUnusedBits();
+    return *this;  // clearUnusedBits();
+  }
+
+  INLINE void set() {
+    VAL = (ValType)~0ULL;
+    clearUnusedBits();
+  }
+
+  template <int _AP_W3>
+  INLINE void set(const ap_private<_AP_W3, false>& val) {
+    operator=(ap_private<_AP_W3, _AP_S>(val));
+  }
+
+  INLINE void set(const ap_private& val) { operator=(val); }
+
+  INLINE void clearUnusedBits(void) volatile
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 };
+    VAL = (ValType)(
+        _AP_S
+            ? ((((int64_t)VAL) << (excess_bits)) >> (excess_bits))
+            : (excess_bits ? (((uint64_t)VAL) << (excess_bits)) >> (excess_bits)
+                           : (uint64_t)VAL));
+  }
+
+  INLINE void clearUnusedBitsToZero(void) {
+    enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 };
+    static uint64_t mask = ~0ULL >> (excess_bits);
+    VAL &= mask;
+  }
+
+  INLINE ap_private udiv(const ap_private& RHS) const {
+    return ap_private((uint64_t)VAL / RHS.get_VAL());
+  }
+
+  /// Signed divide this ap_private by ap_private RHS.
+  /// @brief Signed division function for ap_private.
+  INLINE ap_private sdiv(const ap_private& RHS) const {
+    if (isNegative())
+      if (RHS.isNegative())
+        return ((uint64_t)(0 - (*this))) / (uint64_t)(0 - RHS);
+      else
+        return 0 - ((uint64_t)(0 - (*this)) / (uint64_t)(RHS));
+    else if (RHS.isNegative())
+      return 0 - (this->udiv((ap_private)(0 - RHS)));
+    return this->udiv(RHS);
+  }
+
+  template <bool _AP_S2>
+  INLINE ap_private urem(const ap_private<_AP_W, _AP_S2>& RHS) const {
+    assert(RHS.get_VAL() != 0 && "Divide by 0");
+    return ap_private(((uint64_t)VAL) % ((uint64_t)RHS.get_VAL()));
+  }
+
+  /// Signed remainder operation on ap_private.
+  /// @brief Function for signed remainder operation.
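+  /// e.g. (-7) srem 3 == -1 and 7 srem (-3) == 1: the remainder takes the
+  /// sign of the dividend, matching C/C++ truncated division (illustrative
+  /// note; both cases follow directly from the sign handling below).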
+  template <bool _AP_S2>
+  INLINE ap_private srem(const ap_private<_AP_W, _AP_S2>& RHS) const {
+    if (isNegative()) {
+      ap_private lhs = 0 - (*this);
+      if (RHS.isNegative()) {
+        ap_private rhs = 0 - RHS;
+        return 0 - (lhs.urem(rhs));
+      } else
+        return 0 - (lhs.urem(RHS));
+    } else if (RHS.isNegative()) {
+      ap_private rhs = 0 - RHS;
+      return this->urem(rhs);
+    }
+    return this->urem(RHS);
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool eq(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return (*this) == RHS;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ne(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !((*this) == RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered unsigned.
+  /// @brief Unsigned less than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ult(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    if (_AP_W1 <= 64) {
+      uint64_t lhsZext = ((uint64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
+      uint64_t rhsZext =
+          ((uint64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
+      return lhsZext < rhsZext;
+    } else
+      return RHS.ugt(*this);  // strict: a < b holds exactly when b > a
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered signed.
+  /// @brief Signed less than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool slt(const ap_private<_AP_W1, _AP_S1>& RHS) const
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    if (_AP_W1 <= 64) {
+      int64_t lhsSext = ((int64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
+      int64_t rhsSext =
+          ((int64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
+      return lhsSext < rhsSext;
+    } else
+      return RHS.sgt(*this);  // strict: a < b holds exactly when b > a
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered unsigned.
+  /// @brief Unsigned less or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ule(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return ult(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered signed.
+  /// @brief Signed less or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sle(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return slt(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered unsigned.
+  /// @brief Unsigned greater than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ugt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !ult(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered signed.
+  /// @brief Signed greater than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sgt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !slt(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered unsigned.
+  /// @brief Unsigned greater or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool uge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !ult(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered signed.
+  /// @brief Signed greater or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !slt(RHS);
+  }
+
+  INLINE ap_private abs() const {
+    if (isNegative()) return -(*this);
+    return *this;
+  }
+
+  INLINE ap_private<_AP_W, false> get() const {
+    ap_private<_AP_W, false> ret(*this);
+    return ret;
+  }
+
+  INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen,
+                                       uint8_t radix) {
+    return _AP_W;
+  }
+
+  INLINE uint32_t getActiveBits() const {
+    uint32_t bits = _AP_W - countLeadingZeros();
+    return bits ? bits : 1;
+  }
+
+  INLINE double roundToDouble(bool isSigned = false) const {
+    return isSigned ? double((int64_t)VAL) : double((uint64_t)VAL);
+  }
+
+  /* Reverse the contents of the ap_private instance, i.e. LSB becomes MSB
+   * and vice versa. */
+  INLINE ap_private& reverse() {
+    for (int i = 0; i < _AP_W / 2; ++i) {
+      bool tmp = operator[](i);
+      if (operator[](_AP_W - 1 - i))
+        set(i);
+      else
+        clear(i);
+      if (tmp)
+        set(_AP_W - 1 - i);
+      else
+        clear(_AP_W - 1 - i);
+    }
+    clearUnusedBits();
+    return *this;
+  }
+
+  /* Return true if the value of the ap_private instance is zero. */
+  INLINE bool iszero() const { return isMinValue(); }
+
+  INLINE bool to_bool() const { return !iszero(); }
+
+  /* x < 0 */
+  INLINE bool sign() const {
+    if (isNegative()) return true;
+    return false;
+  }
+
+  /* x[i] = !x[i] */
+  INLINE void invert(int i) {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    flip(i);
+  }
+
+  /* x[i] */
+  INLINE bool test(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return operator[](i);
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the left
+  INLINE void lrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(shl(n) | lshr(_AP_W - n));
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the right
+  INLINE void rrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(lshr(n) | shl(_AP_W - n));
+  }
+
+  // Set the ith bit into v
+  INLINE void set(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set_bit(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Get the value of ith bit
+  INLINE bool get_bit(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return (((1ULL << i) & VAL) != 0);
+  }
+
+  /// Toggle all bits.
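+  /// e.g. for a 4-bit value, flip() turns 0b0101 into 0b1010 (illustrative
+  /// example; the mask keeps only the low _AP_W bits).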
+  INLINE ap_private& flip() {
+    VAL = (ValType)((~0ULL ^ VAL) & mask);
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// Toggles a given bit to its opposite value.
+  INLINE ap_private& flip(uint32_t bitPosition) {
+    assert(bitPosition < BitWidth && "Out of the bit-width range!");
+    set_bit(bitPosition, !get_bit(bitPosition));
+    return *this;
+  }
+
+  // complements every bit
+  INLINE void b_not() { flip(); }
+
+// Binary Arithmetic
+//-----------------------------------------------------------
+#define OP_BIN_AP(Sym, Rty, Fun)                              \
+  template <int _AP_W2, bool _AP_S2>                          \
+  INLINE typename RType<_AP_W2, _AP_S2>::Rty operator Sym(    \
+      const ap_private<_AP_W2, _AP_S2>& op) const {           \
+    typename RType<_AP_W2, _AP_S2>::Rty lhs(*this);           \
+    typename RType<_AP_W2, _AP_S2>::Rty rhs(op);              \
+    return lhs.Fun(rhs);                                      \
+  }
+
+/// Bitwise and, or, xor
+// OP_BIN_AP(&, logic, And)
+// OP_BIN_AP(|, logic, Or)
+// OP_BIN_AP(^, logic, Xor)
+#undef OP_BIN_AP
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    return typename RType<_AP_W2, _AP_S2>::div(
+        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    typename RType<_AP_W2, _AP_S2>::mod res =
+        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs)
+                                                  : lhs.urem(rhs));
+    return res;
+  }
+
+#define OP_ASSIGN_AP_2(Sym)                                    \
+  template <int _AP_W2, bool _AP_S2>                           \
+  INLINE ap_private<_AP_W, _AP_S>& operator Sym##=(            \
+      const ap_private<_AP_W2, _AP_S2>& op) {                  \
+    *this = operator Sym(op);                                  \
+    return *this;                                              \
+  }
+
+  OP_ASSIGN_AP_2(/)
+  OP_ASSIGN_AP_2(%)
+#undef OP_ASSIGN_AP_2
+
+/// Bitwise assign: and, or, xor
+//-------------------------------------------------------------
+// OP_ASSIGN_AP(&)
+// OP_ASSIGN_AP(^)
+// OP_ASSIGN_AP(|)
+
+#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED)                \
+  INLINE ap_private operator<<(const TYPE op) const {    \
+    if (op >= _AP_W) return ap_private(0);               \
+    if (SIGNED && op < 0) return *this >> (0 - op);      \
+    return shl(op);                                      \
+  }
+
+  // OP_LEFT_SHIFT_CTYPE(bool, false)
+  OP_LEFT_SHIFT_CTYPE(char, CHAR_IS_SIGNED)
+  OP_LEFT_SHIFT_CTYPE(signed char, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned char, false)
+  OP_LEFT_SHIFT_CTYPE(short, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned short, false)
+  OP_LEFT_SHIFT_CTYPE(int, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned int, false)
+  OP_LEFT_SHIFT_CTYPE(long, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned long, false)
+  OP_LEFT_SHIFT_CTYPE(long long, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned long long, false)
+#if 0
+  OP_LEFT_SHIFT_CTYPE(half, false)
+  OP_LEFT_SHIFT_CTYPE(float, false)
+  OP_LEFT_SHIFT_CTYPE(double, false)
+#endif
+
+#undef OP_LEFT_SHIFT_CTYPE
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const {
+    if (_AP_S2 == false) {
+      uint32_t sh = op2.to_uint();
+      return *this << sh;
+    } else {
+      int sh = op2.to_int();
+      return *this << sh;
+    }
+  }
+
+#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED)               \
+  INLINE ap_private operator>>(const TYPE op) const {    \
+    if (op >= _AP_W) {                                   \
+      if (isNegative())                                  \
+        return ap_private(-1);                           \
+      else                                               \
+        return ap_private(0);                            \
+    }                                                    \
+    if ((SIGNED) && op < 0) return *this << (0 - op);    \
+    if (_AP_S)                                           \
+      return ashr(op);                                   \
+    else                                                 \
+      return lshr(op);                                   \
+  }
+
+  // OP_RIGHT_SHIFT_CTYPE(bool, false)
+  OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED)
+  OP_RIGHT_SHIFT_CTYPE(signed char, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned char, false)
+  OP_RIGHT_SHIFT_CTYPE(short, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned short, false)
+  OP_RIGHT_SHIFT_CTYPE(int, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned int, false)
+  OP_RIGHT_SHIFT_CTYPE(long, true)
+  OP_RIGHT_SHIFT_CTYPE(unsigned long, false)
+  OP_RIGHT_SHIFT_CTYPE(unsigned long long, false)
+  OP_RIGHT_SHIFT_CTYPE(long long, true)
+#if 0
+  OP_RIGHT_SHIFT_CTYPE(half, false)
+  OP_RIGHT_SHIFT_CTYPE(float, false)
+  OP_RIGHT_SHIFT_CTYPE(double, false)
+#endif
+
+#undef OP_RIGHT_SHIFT_CTYPE
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const {
+    if (_AP_S2 == false) {
+      uint32_t sh = op2.to_uint();
+      return *this >> sh;
+    } else {
+      int sh = op2.to_int();
+      return *this >> sh;
+    }
+  }
+
+  /// Shift assign
+  //-----------------------------------------------------------------
+
+  // INLINE const ap_private& operator<<=(uint32_t shiftAmt) {
+  //   VAL <<= shiftAmt;
+  //   clearUnusedBits();
+  //   return *this;
+  // }
+
+#define OP_ASSIGN_AP(Sym)                                                    \
+  template <int _AP_W2, bool _AP_S2>                                         \
+  INLINE ap_private& operator Sym##=(int op) {                               \
+    *this = operator Sym(op);                                                \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }                                                                          \
+  INLINE ap_private& operator Sym##=(unsigned int op) {                      \
+    *this = operator Sym(op);                                                \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }                                                                          \
+  template <int _AP_W2, bool _AP_S2>                                         \
+  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
+    *this = operator Sym(op);                                                \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }
+
+  OP_ASSIGN_AP(>>)
+  OP_ASSIGN_AP(<<)
+#undef OP_ASSIGN_AP
+
+  /// Comparisons
+  //-----------------------------------------------------------------
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool operator==(const ap_private<_AP_W1, _AP_S1>& op) const {
+    enum { _AP_MAX_W = AP_MAX(AP_MAX(_AP_W, _AP_W1), 32) };
+    ap_private<_AP_MAX_W, false> lhs(*this);
+    ap_private<_AP_MAX_W, false> rhs(op);
+    if (_AP_MAX_W <= 64) {
+      return (uint64_t)lhs.get_VAL() == (uint64_t)rhs.get_VAL();
+    } else
+      return lhs == rhs;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const {
+    return !(*this == op);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const {
+    enum {
+      _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2))
+    };
+    ap_private<_AP_MAX_W, _AP_S> lhs(*this);
+    ap_private<_AP_MAX_W, _AP_S2> rhs(op);
+    // this follows the gcc rule for comparison
+    // between different bitwidths and signedness
+    if (_AP_S == _AP_S2)
+      return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs);
+    else if (_AP_W < 32 && _AP_W2 < 32)
+      // different signedness, but both bitwidths are less than 32
+      return lhs.sgt(rhs);
+    else
+      // different signedness, and the bigger bitwidth
+      // is greater than or equal to 32
+      if (_AP_S)
+        if (_AP_W2 >= _AP_W)
+          return lhs.ugt(rhs);
+        else
+          return lhs.sgt(rhs);
+      else if (_AP_W >= _AP_W2)
+        return lhs.ugt(rhs);
+      else
+        return lhs.sgt(rhs);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const {
+    return !(*this > op);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const {
+    enum {
+      _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2))
+    };
+    ap_private<_AP_MAX_W, _AP_S> lhs(*this);
+    ap_private<_AP_MAX_W, _AP_S2> rhs(op);
+    if (_AP_S == _AP_S2)
+      return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs);
+    else if (_AP_W < 32 && _AP_W2 < 32)
+      return lhs.slt(rhs);
+    else if (_AP_S)
+      if (_AP_W2 >= _AP_W)
+        return lhs.ult(rhs);
+      else
+        return lhs.slt(rhs);
+    else if (_AP_W >= _AP_W2)
+      return lhs.ult(rhs);
+    else
+      return lhs.slt(rhs);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const {
+    return !(*this < op);
+  }
+
+  /// Bit and Part Select
+  //--------------------------------------------------------------
+  // FIXME now _private_range_ref refs to _AP_ROOT_TYPE(struct ssdm_int).
+  INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) {
+    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
+  }
+
+  INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const {
+    return _private_range_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>*>(this), Hi, Lo);
+  }
+
+  INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const {
+    return _private_range_ref<_AP_W, _AP_S>(
+        (const_cast<ap_private<_AP_W, _AP_S>*>(this)), Hi, Lo);
+  }
+
+  INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) {
+    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
+  }
+
+  INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE _private_bit_ref<_AP_W, _AP_S> operator[](
+      const ap_private<_AP_W2, _AP_S2>& index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int());
+  }
+
+  INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](
+      const ap_private<_AP_W2, _AP_S2>& index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index.to_int());
+  }
+
+  INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE _private_bit_ref<_AP_W, _AP_S> bit(
+      const ap_private<_AP_W2, _AP_S2>& index) {
+    return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int());
+  }
+
+  INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE const _private_bit_ref<_AP_W, _AP_S> bit(
+      const ap_private<_AP_W2, _AP_S2>& index) const {
+    return _private_bit_ref<_AP_W, _AP_S>(
+        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index.to_int());
+  }
+
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      ap_private<_AP_W2, _AP_S2> >
+// concat(const ap_private<_AP_W2, _AP_S2>& a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      ap_private<_AP_W2, _AP_S2> >
+// concat(ap_private<_AP_W2, _AP_S2>& a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(const ap_private<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       *this, const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(ap_private<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this), a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
+// operator,(ap_private<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
+//                        ap_private<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      _private_range_ref<_AP_W2, _AP_S2> >
+// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        _private_range_ref<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                      _private_range_ref<_AP_W2, _AP_S2> >
+// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
+//                        _private_range_ref<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                      _private_bit_ref<_AP_W2, _AP_S2> >
+// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                        _private_bit_ref<_AP_W2, _AP_S2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2));
+// }
+//
+// template <int _AP_W2, bool _AP_S2>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                      _private_bit_ref<_AP_W2, _AP_S2> >
+// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
+//                        _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                      ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >
+// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                        ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>&>(a2));
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                      ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >
+// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) {
+//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
+//                        ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this,
+//                                                                       a2);
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_concat_ref<
+//     _AP_W, ap_private, _AP_W2,
+//     af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
+//               &a2) const {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, _AP_W2,
+//       af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<
+//           af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2));
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_concat_ref<
+//     _AP_W, ap_private, _AP_W2,
+//     af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, _AP_W2,
+//       af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this,
+//                                                                      a2);
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE
+// ap_concat_ref<_AP_W, ap_private, 1,
+//               af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
+//               &a2) const {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, 1,
+//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
+//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//       const_cast<af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(
+//           a2));
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE
+// ap_concat_ref<_AP_W, ap_private, 1,
+//               af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+// operator,(
+//     af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, 1,
+//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator&(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this & a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator|(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this | a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator^(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this ^ a2.get();
+// }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return (VAL & mask) == mask; }
+
+  INLINE bool nand_reduce() const { return (VAL & mask) != mask; }
+
+  INLINE bool or_reduce() const { return (bool)VAL; }
+
+  INLINE bool nor_reduce() const { return VAL == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+
+  INLINE std::string to_string(uint8_t radix = 2, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+};  // End of class ap_private <_AP_W, _AP_S, true>
+
+template <int _AP_W, bool _AP_S>
+std::string ap_private<_AP_W, _AP_S, true>::toString(uint8_t radix,
+                                                     bool wantSigned) const {
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+         "Radix should be 2, 8, 10, or 16!");
+  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
+                                 "8", "9", "a", "b", "c", "d", "e", "f"};
+  std::string result;
+  if (radix != 10) {
+    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+    // because the number of bits per digit (1, 3 and 4 respectively) divides
+    // equally. We just shift until the value is zero.
+
+    // First, check for a zero value and just short circuit the logic below.
+    if (*this == (uint64_t)(0)) {
+      // Always generate a radix indicator because fixed-point
+      // formats require it.
+      switch (radix) {
+        case 2:
+          result = "0b0";
+          break;
+        case 8:
+          result = "0o0";
+          break;
+        case 16:
+          result = "0x0";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+    } else {
+      ap_private<_AP_W, false, true> tmp(*this);
+      size_t insert_at = 0;
+      bool leading_zero = true;
+      if (wantSigned && isNegative()) {
+        // They want to print the signed version and it is a negative value
+        // Flip the bits and add one to turn it into the equivalent positive
+        // value and put a '-' in the result.
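+        // e.g. with 4 bits: 0b1011 (-5) -> flip -> 0b0100 -> +1 -> 0b0101,
+        // i.e. the magnitude 5 (illustrative example of the two's-complement
+        // negation performed by the next two statements).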
+        tmp.flip();
+        tmp++;
+        result = "-";
+        insert_at = 1;
+        leading_zero = false;
+      }
+      switch (radix) {
+        case 2:
+          result += "0b";
+          break;
+        case 8:
+          result += "0o";
+          break;
+        case 16:
+          result += "0x";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+      insert_at += 2;
+
+      // Just shift tmp right for each digit width until it becomes zero
+      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
+      uint64_t mask = radix - 1;
+      ap_private<_AP_W, false, true> zero(0);
+      unsigned bits = 0;
+      bool msb = false;
+      while (tmp.ne(zero)) {
+        unsigned digit = (unsigned)(tmp.get_VAL() & mask);
+        result.insert(insert_at, digits[digit]);
+        tmp = tmp.lshr(shift);
+        bits++;
+        msb = (digit >> (shift - 1)) == 1;
+      }
+      bits *= shift;
+      if (bits < _AP_W && leading_zero && msb)
+        result.insert(insert_at, digits[0]);
+    }
+    return result;
+  }
+
+  ap_private<_AP_W, false, true> tmp(*this);
+  ap_private<6, false, true> divisor(radix);
+  ap_private<_AP_W, _AP_S, true> zero(0);
+  size_t insert_at = 0;
+  if (wantSigned && isNegative()) {
+    // They want to print the signed version and it is a negative value
+    // Flip the bits and add one to turn it into the equivalent positive
+    // value and put a '-' in the result.
+    tmp.flip();
+    tmp++;
+    result = "-";
+    insert_at = 1;
+  }
+  if (tmp == ap_private<_AP_W, false, true>(0ULL))
+    result = "0";
+  else
+    while (tmp.ne(zero)) {
+      ap_private<_AP_W, false, true> APdigit = tmp % divisor;
+      ap_private<_AP_W, false, true> tmp2 = tmp / divisor;
+      uint32_t digit = (uint32_t)(APdigit.getZExtValue());
+      assert(digit < radix && "divide failed");
+      result.insert(insert_at, digits[digit]);
+      tmp = tmp2;
+    }
+  return result;
+
+}  // End of ap_private<_AP_W, _AP_S, true>::toString()
+
+// bitwidth > 64
+template <int _AP_W, bool _AP_S>
+class ap_private<_AP_W, _AP_S, false> {
+  // SFINAE pattern. Only consider this class when _AP_W > 64
+  const static bool valid = ap_private_enable_if<(_AP_W > 64)>::isValid;
+
+#ifdef _MSC_VER
+#pragma warning(disable : 4521 4522)
+#endif
+ public:
+  enum { BitWidth = _AP_W, _AP_N = (_AP_W + 63) / 64 };
+  static const int width = _AP_W;
+
+ private:
+  /// This constructor is used only internally for speed of construction of
+  /// temporaries. It is unsafe for general use so it is not public.
+
+  /* Constructors */
+  /// Note that numWords can be smaller or larger than the corresponding bit
+  /// width but any extraneous bits will be dropped.
+  /// @param numWords the number of words in bigVal
+  /// @param bigVal a sequence of words to form the initial value of the
+  /// ap_private
+  /// @brief Construct an ap_private, initialized as bigVal[].
+  INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) {
+    set_canary();
+    assert(bigVal && "Null pointer detected!");
+    {
+      // Get memory, cleared to 0
+      memset(pVal, 0, _AP_N * sizeof(uint64_t));
+
+      // Calculate the number of words to copy
+      uint32_t words = AESL_std::min<uint32_t>(numWords, _AP_N);
+      // Copy the words from bigVal to pVal
+      memcpy(pVal, bigVal, words * APINT_WORD_SIZE);
+      if (words >= _AP_W) clearUnusedBits();
+      // Make sure unused high bits are cleared
+    }
+    check_canary();
+  }
+
+  /// This constructor interprets Val as a string in the given radix. The
+  /// interpretation stops when the first character that is not suitable for the
+  /// radix is encountered. Acceptable radix values are 2, 8, 10 and 16. It is
+  /// an error for the value implied by the string to require more bits than
+  /// numBits.
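+  /// (e.g. the string "255" in radix 10 already needs 8 bits.)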
+  /// @param val the string to be interpreted
+  /// @param radix the radix of Val to use for the interpretation
+  /// @brief Construct an ap_private from a string representation.
+  INLINE ap_private(const std::string& val, uint8_t radix = 2) {
+    set_canary();
+    assert(!val.empty() && "The input string is empty.");
+    const char* c_str = val.c_str();
+    fromString(c_str, val.size(), radix);
+    check_canary();
+  }
+
+  /// This constructor interprets the slen characters starting at StrStart as
+  /// a string in the given radix. The interpretation stops when the first
+  /// character that is not suitable for the radix is encountered. Acceptable
+  /// radix values are 2, 8, 10 and 16. It is an error for the value implied by
+  /// the string to require more bits than numBits.
+  /// @param strStart the start of the string to be interpreted
+  /// @param slen the maximum number of characters to interpret
+  /// @param radix the radix to use for the conversion
+  /// @brief Construct an ap_private from a string representation.
+  /// This method does not consider whether it is negative or not.
+  INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) {
+    set_canary();
+    fromString(strStart, slen, radix);
+    check_canary();
+  }
+
+  INLINE void report() {
+    _AP_ERROR(_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024,
+              "ap_%sint<%d>: Bitwidth exceeds the "
+              "default max value %d. Please use macro "
+              "AP_INT_MAX_W to set a larger max value.",
+              _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024);
+  }
+  /// This union is used to store the integer value. When the
+  /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal.
+
+  /// This enum is used to hold the constants we needed for ap_private.
+  // uint64_t VAL; ///< Used to store the <= 64 bits integer value.
+  uint64_t pVal[_AP_N];  ///< Used to store the >64 bits integer value.
+#ifdef AP_CANARY
+  uint64_t CANARY;
+  INLINE void check_canary() { assert(CANARY == (uint64_t)0xDEADBEEFDEADBEEF); }
+  INLINE void set_canary() { CANARY = (uint64_t)0xDEADBEEFDEADBEEF; }
+#else
+  INLINE void check_canary() {}
+  INLINE void set_canary() {}
+#endif
+
+ public:
+  typedef typename valtype<8, _AP_S>::Type ValType;
+  typedef ap_private<_AP_W, _AP_S> Type;
+  // FIXME remove friend type?
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  friend struct ap_fixed_base;
+  /// return type of variety of operations
+  //----------------------------------------------------------
+  template <int _AP_W2, bool _AP_S2>
+  struct RType {
+    enum {
+      mult_w = _AP_W + _AP_W2,
+      mult_s = _AP_S || _AP_S2,
+      plus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      plus_s = _AP_S || _AP_S2,
+      minus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      minus_s = true,
+      div_w = _AP_W + _AP_S2,
+      div_s = _AP_S || _AP_S2,
+      mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)),
+      mod_s = _AP_S,
+      logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)),
+      logic_s = _AP_S || _AP_S2
+    };
+    typedef ap_private<mult_w, mult_s> mult;
+    typedef ap_private<plus_w, plus_s> plus;
+    typedef ap_private<minus_w, minus_s> minus;
+    typedef ap_private<logic_w, logic_s> logic;
+    typedef ap_private<div_w, div_s> div;
+    typedef ap_private<mod_w, mod_s> mod;
+    typedef ap_private<_AP_W, _AP_S> arg1;
+    typedef bool reduce;
+  };
+
+  INLINE uint64_t& get_VAL(void) { return pVal[0]; }
+  INLINE uint64_t get_VAL(void) const { return pVal[0]; }
+  INLINE uint64_t get_VAL(void) const volatile { return pVal[0]; }
+  INLINE void set_VAL(uint64_t value) { pVal[0] = value; }
+  INLINE uint64_t& get_pVal(int index) { return pVal[index]; }
+  INLINE uint64_t* get_pVal() { return pVal; }
+  INLINE const uint64_t* get_pVal() const { return pVal; }
+  INLINE uint64_t get_pVal(int index) const { return pVal[index]; }
+  INLINE uint64_t* get_pVal() const volatile { return pVal; }
+  INLINE uint64_t get_pVal(int index) const volatile { return pVal[index]; }
+  INLINE void set_pVal(int i, uint64_t value) { pVal[i] = value; }
+
+  /// This enum is used to hold the constants we needed for ap_private.
+  enum {
+    APINT_BITS_PER_WORD = sizeof(uint64_t) * 8,  ///< Bits in a word
+    APINT_WORD_SIZE = sizeof(uint64_t)           ///< Byte size of a word
+  };
+
+  enum {
+    excess_bits = (_AP_W % APINT_BITS_PER_WORD)
+                      ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD)
+                      : 0
+  };
+  static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits));
+
+ public:
+  // NOTE changed to explicit to be consistent with ap_private
+  explicit INLINE ap_private(const char* val) {
+    set_canary();
+    unsigned char radix = 10;
+    std::string str = ap_private_ops::parseString(val, radix);  // determine radix.
+    std::string::size_type pos = str.find('.');
+    if (pos != std::string::npos) str = str.substr(0, pos);  // keep the integer part only
+    ap_private ap_private_val(str, radix);
+    operator=(ap_private_val);
+    report();
+    check_canary();
+  }
+
+  INLINE ap_private(const char* val, unsigned char rd) {
+    set_canary();
+    unsigned char radix = rd;
+    std::string str = ap_private_ops::parseString(val, radix);  // determine radix.
+    std::string::size_type pos = str.find('.');
+    if (pos != std::string::npos) str = str.substr(0, pos);  // keep the integer part only
+    ap_private ap_private_val(str, radix);
+    operator=(ap_private_val);
+    report();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ref.get();
+    report();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ((uint64_t)(bool)ref);
+    report();
+    check_canary();
+  }
+
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) {
+//   set_canary();
+//   *this = ref.get();
+//   report();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = ((val.operator ap_private<_AP_W2, false>()));
+//   report();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = (uint64_t)(bool)val;
+//   report();
+//   check_canary();
+// }
+
+  /// Simply makes *this a copy of that.
+  /// @brief Copy Constructor.
+  INLINE ap_private(const ap_private& that) {
+    set_canary();
+    memcpy(pVal, that.get_pVal(), _AP_N * APINT_WORD_SIZE);
+    clearUnusedBits();
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, false>& that) {
+    set_canary();
+    operator=(that);
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, false>& that) {
+    set_canary();
+    operator=(const_cast<const ap_private<_AP_W1, _AP_S1, false>&>(that));
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, true>& that) {
+    set_canary();
+    static const uint64_t that_sign_ext_mask =
+        (_AP_W1 == APINT_BITS_PER_WORD)
+            ? 0
+            : ~0ULL >> (_AP_W1 % APINT_BITS_PER_WORD)
+                          << (_AP_W1 % APINT_BITS_PER_WORD);
+    if (that.isNegative()) {
+      pVal[0] = that.get_VAL() | that_sign_ext_mask;
+      memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1));
+    } else {
+      pVal[0] = that.get_VAL();
+      memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1));
+    }
+    clearUnusedBits();
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, true>& that) {
+    set_canary();
+    operator=(const_cast<const ap_private<_AP_W1, _AP_S1, true>&>(that));
+    check_canary();
+  }
+
+  /// @brief Destructor.
+  // virtual ~ap_private() {}
+  INLINE ~ap_private() { check_canary(); }
+
+  /// @name Constructors
+  /// @{
+
+  /// Default constructor that creates an uninitialized ap_private. This is
+  /// useful for object deserialization (pair this with the static method Read).
+  INLINE ap_private() {
+    set_canary();
+    clearUnusedBits();
+    check_canary();
+  }
+
+  INLINE ap_private(uint64_t* val, uint32_t bits = _AP_W) { assert(0); }
+  INLINE ap_private(const uint64_t* const val, uint32_t bits) { assert(0); }
+
+/// If isSigned is true then val is treated as if it were a signed value
+/// (i.e. as an int64_t) and the appropriate sign extension to the bit width
+/// will be done. Otherwise, no sign extension occurs (high order bits beyond
+/// the range of val are zero filled).
+/// @param numBits the bit width of the constructed ap_private
+/// @param val the initial value of the ap_private
+/// @param isSigned how to treat signedness of val
+/// @brief Create a new ap_private of numBits width, initialized as val.
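+/// e.g. (illustrative) ap_private<128, true> x(-2) stores
+/// pVal[0] = 0xFFFFFFFFFFFFFFFE and fills pVal[1] with ~0ULL through the
+/// sign-extension branch of the macro below; an unsigned initializer leaves
+/// the upper words zero.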
+#define CTOR(TYPE, SIGNED)                                     \
+  INLINE ap_private(TYPE val, bool isSigned = SIGNED) {        \
+    set_canary();                                              \
+    pVal[0] = (ValType)val;                                    \
+    if (isSigned && int64_t(pVal[0]) < 0) {                    \
+      memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1));    \
+    } else {                                                   \
+      memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1));     \
+    }                                                          \
+    clearUnusedBits();                                         \
+    check_canary();                                            \
+  }
+
+  CTOR(bool, false)
+  CTOR(char, CHAR_IS_SIGNED)
+  CTOR(signed char, true)
+  CTOR(unsigned char, false)
+  CTOR(short, true)
+  CTOR(unsigned short, false)
+  CTOR(int, true)
+  CTOR(unsigned int, false)
+  CTOR(long, true)
+  CTOR(unsigned long, false)
+  CTOR(ap_slong, true)
+  CTOR(ap_ulong, false)
+#if 0
+  CTOR(half, false)
+  CTOR(float, false)
+  CTOR(double, false)
+#endif
+#undef CTOR
+
+  /// @returns true if the number of bits <= 64, false otherwise.
+  /// @brief Determine if this ap_private just has one word to store value.
+  INLINE bool isSingleWord() const { return false; }
+
+  /// @returns the word position for the specified bit position.
+  /// @brief Determine which word a bit is in.
+  static INLINE uint32_t whichWord(uint32_t bitPosition) {
+    // return bitPosition / APINT_BITS_PER_WORD;
+    return (bitPosition) >> 6;
+  }
+
+  /// @returns the bit position in a word for the specified bit position
+  /// in the ap_private.
+  /// @brief Determine which bit in a word a bit is in.
+  static INLINE uint32_t whichBit(uint32_t bitPosition) {
+    // return bitPosition % APINT_BITS_PER_WORD;
+    return bitPosition & 0x3f;
+  }
+
+  /// bit at a specific bit position. This is used to mask the bit in the
+  /// corresponding word.
+  /// @returns a uint64_t with only bit at "whichBit(bitPosition)" set
+  /// @brief Get a single bit mask.
+  static INLINE uint64_t maskBit(uint32_t bitPosition) {
+    return 1ULL << (whichBit(bitPosition));
+  }
+
+  /// @returns the corresponding word for the specified bit position.
+  /// @brief Get the word corresponding to a bit position
+  INLINE uint64_t getWord(uint32_t bitPosition) const {
+    return pVal[whichWord(bitPosition)];
+  }
+
+  /// This method is used internally to clear the top "N" bits in the high order
+  /// word that are not used by the ap_private. This is needed after the most
+  /// significant word is assigned a value to ensure that those bits are
+  /// zero'd out.
+  /// @brief Clear unused high order bits
+  INLINE void clearUnusedBits(void) volatile
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    pVal[_AP_N - 1] =
+        _AP_S ? ((((int64_t)pVal[_AP_N - 1]) << (excess_bits)) >> excess_bits)
+              : (excess_bits
+                     ? ((pVal[_AP_N - 1]) << (excess_bits)) >> (excess_bits)
+                     : pVal[_AP_N - 1]);
+  }
+
+  INLINE void clearUnusedBitsToZero(void) { pVal[_AP_N - 1] &= mask; }
+
+  INLINE void clearUnusedBitsToOne(void) { pVal[_AP_N - 1] |= mask; }
+
+  /// This is used by the constructors that take string arguments.
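+  /// e.g. fromString("ff", 2, 16) yields 255; a "0x"/"0b"/"0o"/"0d" prefix is
+  /// consumed, and _AP_WARNING fires when the prefix disagrees with the radix
+  /// argument (illustrative note on the prefix handling below).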
+  /// @brief Convert a char array into an ap_private
+  INLINE void fromString(const char* str, uint32_t slen, uint8_t radix) {
+    enum { numbits = _AP_W };
+    bool isNeg = str[0] == '-';
+    if (isNeg) {
+      str++;
+      slen--;
+    }
+
+    if (str[0] == '0' && (str[1] == 'b' || str[1] == 'B')) {
+      // if (radix == 0) radix = 2;
+      _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.",
+                  str, 2, radix);
+      str += 2;
+      slen -= 2;
+    } else if (str[0] == '0' && (str[1] == 'o' || str[1] == 'O')) {
+      // if (radix == 0) radix = 8;
+      _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.",
+                  str, 8, radix);
+      str += 2;
+      slen -= 2;
+    } else if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) {
+      // if (radix == 0) radix = 16;
+      _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.",
+                  str, 16, radix);
+      str += 2;
+      slen -= 2;
+    } else if (str[0] == '0' && (str[1] == 'd' || str[1] == 'D')) {
+      // if (radix == 0) radix = 10;
+      _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.",
+                  str, 10, radix);
+      str += 2;
+      slen -= 2;
+    } else if (radix == 0) {
+      // radix = 2; // XXX default value
+    }
+
+    // Check our assumptions here
+    assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+           "Radix should be 2, 8, 10, or 16!");
+    assert(str && "String is null?");
+
+    // skip any leading zero
+    while (*str == '0' && *(str + 1) != '\0') {
+      str++;
+      slen--;
+    }
+    assert((slen <= numbits || radix != 2) && "Insufficient bit width");
+    assert(((slen - 1) * 3 <= numbits || radix != 8) &&
+           "Insufficient bit width");
+    assert(((slen - 1) * 4 <= numbits || radix != 16) &&
+           "Insufficient bit width");
+    assert((((slen - 1) * 64) / 22 <= numbits || radix != 10) &&
+           "Insufficient bit width");
+
+    // clear bits
+    memset(pVal, 0, _AP_N * sizeof(uint64_t));
+
+    // Figure out if we can shift instead of multiply
+    uint32_t shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
+
+    // Set up an ap_private for the digit to add outside the loop so we don't
+    // constantly construct/destruct it.
+    uint64_t bigVal[_AP_N];
+    memset(bigVal, 0, _AP_N * sizeof(uint64_t));
+    ap_private<_AP_W, _AP_S> apdigit(getBitWidth(), bigVal);
+    ap_private<_AP_W, _AP_S> apradix(radix);
+
+    // Enter digit traversal loop
+    for (unsigned i = 0; i < slen; i++) {
+      // Get a digit
+      uint32_t digit = 0;
+      char cdigit = str[i];
+      if (radix == 16) {
+#define isxdigit(c)                                                \
+  (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') ||     \
+   ((c) >= 'A' && (c) <= 'F'))
+#define isdigit(c) ((c) >= '0' && (c) <= '9')
+        if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string");
+        if (isdigit(cdigit))
+          digit = cdigit - '0';
+        else if (cdigit >= 'a')
+          digit = cdigit - 'a' + 10;
+        else if (cdigit >= 'A')
+          digit = cdigit - 'A' + 10;
+        else
+          assert(0 && "huh? we shouldn't get here");
+      } else if (isdigit(cdigit)) {
+        digit = cdigit - '0';
+      } else if (cdigit != '\0') {
+        assert(0 && "Invalid character in digit string");
+      }
+#undef isxdigit
+#undef isdigit
+      // Shift or multiply the value by the radix
+      if (shift)
+        *this <<= shift;
+      else
+        *this *= apradix;
+
+      // Add in the digit we just interpreted
+      apdigit.set_VAL(digit);
+      *this += apdigit;
+    }
+    // If it's negative, put it in two's complement form
+    if (isNeg) {
+      (*this)--;
+      this->flip();
+    }
+    clearUnusedBits();
+  }
+
+  INLINE ap_private read() volatile { return *this; }
+
+  INLINE void write(const ap_private& op2) volatile { *this = (op2); }
+
+  INLINE operator ValType() const { return get_VAL(); }
+
+  INLINE int to_uchar() const { return (unsigned char)get_VAL(); }
+
+  INLINE int to_char() const { return (signed char)get_VAL(); }
+
+  INLINE int to_ushort() const { return (unsigned short)get_VAL(); }
+
+  INLINE int to_short() const { return (short)get_VAL(); }
+
+  INLINE int to_int() const { return (int)get_VAL(); }
+
+  INLINE unsigned to_uint() const { return (unsigned)get_VAL(); }
+
+  INLINE long to_long() const { return (long)get_VAL(); }
+
+  INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); }
+
+  INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); }
+
+  INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); }
+
+  INLINE double to_double() const {
+    if (isNegative())
+      return roundToDouble(true);
+    else
+      return roundToDouble(false);
+  }
+
+  INLINE unsigned length() const { return _AP_W; }
+
+  /* Reverse the contents of the ap_private instance, i.e. LSB becomes MSB
+   * and vice versa. */
+  INLINE ap_private& reverse() {
+    for (int i = 0; i < _AP_W / 2; ++i) {
+      bool tmp = operator[](i);
+      if (operator[](_AP_W - 1 - i))
+        set(i);
+      else
+        clear(i);
+      if (tmp)
+        set(_AP_W - 1 - i);
+      else
+        clear(_AP_W - 1 - i);
+    }
+    clearUnusedBits();
+    return *this;
+  }
+
+  /* Return true if the value of the ap_private instance is zero. */
+  INLINE bool iszero() const { return isMinValue(); }
+
+  INLINE bool to_bool() const { return !iszero(); }
+
+  /* x < 0 */
+  INLINE bool sign() const {
+    if (isNegative()) return true;
+    return false;
+  }
+
+  /* x[i] = !x[i] */
+  INLINE void invert(int i) {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    flip(i);
+  }
+
+  /* x[i] */
+  INLINE bool test(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return operator[](i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set_bit(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // FIXME different argument for different action?
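+  // set(uint32_t) raises one bit in the word that holds it; e.g. set(70)
+  // on a multiword value sets bit 6 of pVal[1], since whichWord(70) == 1
+  // and whichBit(70) == 6 (illustrative note, not vendor text).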
+  INLINE ap_private& set(uint32_t bitPosition) {
+    pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
+    clearUnusedBits();
+    return *this;
+  }
+
+  INLINE void set() {
+    for (int i = 0; i < _AP_N; ++i) pVal[i] = ~0ULL;
+    clearUnusedBits();
+  }
+
+  // Get the value of ith bit
+  INLINE bool get(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return ((maskBit(i) & (pVal[whichWord(i)])) != 0);
+  }
+
+  // Get the value of ith bit
+  INLINE bool get_bit(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return ((maskBit(i) & (pVal[whichWord(i)])) != 0);
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the left
+  INLINE void lrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(shl(n) | lshr(_AP_W - n));
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the right
+  INLINE void rrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(lshr(n) | shl(_AP_W - n));
+  }
+
+  /// Set the given bit to 0 whose position is given as "bitPosition".
+  /// @brief Set a given bit to 0.
+  INLINE ap_private& clear(uint32_t bitPosition) {
+    pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition);
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// @brief Set every bit to 0.
+  INLINE void clear() { memset(pVal, 0, _AP_N * APINT_WORD_SIZE); }
+
+  /// @brief Toggle every bit to its opposite value.
+  ap_private& flip() {
+    for (int i = 0; i < _AP_N; ++i) pVal[i] ^= ~0ULL;
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// @brief Toggles a given bit to its opposite value.
+  INLINE ap_private& flip(uint32_t bitPosition) {
+    assert(bitPosition < BitWidth && "Out of the bit-width range!");
+    set_bit(bitPosition, !get_bit(bitPosition));
+    return *this;
+  }
+
+  // complements every bit
+  INLINE void b_not() { flip(); }
+
+  INLINE ap_private getLoBits(uint32_t numBits) const {
+    return ap_private_ops::lshr(ap_private_ops::shl(*this, _AP_W - numBits),
+                                _AP_W - numBits);
+  }
+
+  INLINE ap_private getHiBits(uint32_t numBits) const {
+    return ap_private_ops::lshr(*this, _AP_W - numBits);
+  }
+
+  // Binary Arithmetic
+  //-----------------------------------------------------------
+
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator&(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this & a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator|(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this | a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator^(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this ^ a2.get();
+// }
+
+/// Arithmetic assign
+//-------------------------------------------------------------
+
+#define OP_BIN_LOGIC_ASSIGN_AP(Sym)                                          \
+  template <int _AP_W1, bool _AP_S1>                                         \
+  INLINE ap_private& operator Sym(const ap_private<_AP_W1, _AP_S1>& RHS) {   \
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;                    \
+    uint32_t numWords = AESL_std::min((int)_AP_N, _AP_N1);                   \
+    uint32_t i;                                                              \
+    if (_AP_W != _AP_W1)                                                     \
+      fprintf(stderr,                                                        \
+              "Warning! Bitsize mismatch for ap_[u]int " #Sym " ap_[u]int.\n"); \
+    for (i = 0; i < numWords; ++i) pVal[i] Sym RHS.get_pVal(i);              \
+    if (_AP_N1 < _AP_N) {                                                    \
+      uint64_t ext = RHS.isNegative() ? ~0ULL : 0;                           \
+      for (; i < _AP_N; i++) pVal[i] Sym ext;                                \
+    }                                                                        \
+    clearUnusedBits();                                                       \
+    return *this;                                                            \
+  }
+
+  OP_BIN_LOGIC_ASSIGN_AP(&=);
+  OP_BIN_LOGIC_ASSIGN_AP(|=);
+  OP_BIN_LOGIC_ASSIGN_AP(^=);
+#undef OP_BIN_LOGIC_ASSIGN_AP
+
+  /// Adds the RHS ap_private to this ap_private.
+  /// @returns this, after addition of RHS.
+  /// @brief Addition assignment operator.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    uint64_t RHSpVal[_AP_N1];
+    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
+    ap_private_ops::add(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
+                        _AP_S1);
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    uint64_t RHSpVal[_AP_N1];
+    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
+    ap_private_ops::sub(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
+                        _AP_S1);
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    // Get some bit facts about LHS and check for zero
+    uint32_t lhsBits = getActiveBits();
+    uint32_t lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
+    if (!lhsWords) {
+      // 0 * X ===> 0
+      return *this;
+    }
+
+    ap_private dupRHS = RHS;
+    // Get some bit facts about RHS and check for zero
+    uint32_t rhsBits = dupRHS.getActiveBits();
+    uint32_t rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
+    if (!rhsWords) {
+      // X * 0 ===> 0
+      clear();
+      return *this;
+    }
+
+    // Allocate space for the result
+    uint32_t destWords = rhsWords + lhsWords;
+    uint64_t* dest = (uint64_t*)malloc(destWords * sizeof(uint64_t));
+
+    // Perform the long multiply
+    ap_private_ops::mul(dest, pVal, lhsWords, dupRHS.get_pVal(), rhsWords,
+                        destWords);
+
+    // Copy result back into *this
+    clear();
+    uint32_t wordsToCopy = destWords >= _AP_N ? _AP_N : destWords;
+
+    memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
+
+    uint64_t ext = (isNegative() ^ RHS.isNegative()) ? ~0ULL : 0ULL;
+    for (int i = wordsToCopy; i < _AP_N; i++) pVal[i] = ext;
+    clearUnusedBits();
+    // delete dest array and return
+    free(dest);
+    return *this;
+  }
+
+#define OP_ASSIGN_AP(Sym)                                                  \
+  template <int _AP_W2, bool _AP_S2>                                       \
+  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
+    *this = operator Sym(op);                                              \
+    return *this;                                                          \
+  }
+
+  OP_ASSIGN_AP(/)
+  OP_ASSIGN_AP(%)
+#undef OP_ASSIGN_AP
+
+#define OP_BIN_LOGIC_AP(Sym)                                                 \
+  template <int _AP_W1, bool _AP_S1>                                         \
+  INLINE typename RType<_AP_W1, _AP_S1>::logic operator Sym(                 \
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {                         \
+    enum {                                                                   \
+      numWords = (RType<_AP_W1, _AP_S1>::logic_w + APINT_BITS_PER_WORD - 1) / \
+                 APINT_BITS_PER_WORD                                         \
+    };                                                                       \
+    typename RType<_AP_W1, _AP_S1>::logic Result;                            \
+    uint32_t i;                                                              \
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;                    \
+    uint32_t min_N = std::min((int)_AP_N, _AP_N1);                           \
+    uint32_t max_N = std::max((int)_AP_N, _AP_N1);                           \
+    for (i = 0; i < min_N; ++i)                                              \
+      Result.set_pVal(i, pVal[i] Sym RHS.get_pVal(i));                       \
+    if (numWords > i) {                                                      \
+      uint64_t ext = ((_AP_N < _AP_N1 && isNegative()) ||                    \
+                      (_AP_N1 < _AP_N && RHS.isNegative()))                  \
+                         ? ~0ULL                                             \
+                         : 0;                                               \
+      if (_AP_N > _AP_N1)                                                   \
+        for (; i < max_N; i++) Result.set_pVal(i, pVal[i] Sym ext);         \
+      else                                                                  \
+        for (; i < max_N; i++) Result.set_pVal(i, RHS.get_pVal(i) Sym ext); \
+      if (numWords > i) {                                                   \
+        uint64_t ext2 = ((_AP_N > _AP_N1 && isNegative()) ||                \
+                         (_AP_N1 > _AP_N && RHS.isNegative()))              \
+                            ? ~0ULL                                         \
+                            : 0;                                            \
+        Result.set_pVal(i, ext Sym ext2);                                   \
+      }                                                                     \
+    }                                                                       \
+    Result.clearUnusedBits();                                               \
+    return Result;                                                          \
+  }
+
+  OP_BIN_LOGIC_AP(|);
+  OP_BIN_LOGIC_AP(&);
+  OP_BIN_LOGIC_AP(^);
+
+#undef OP_BIN_LOGIC_AP
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::plus operator+(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::plus Result, lhs(*this), rhs(RHS);
+    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::plus_w + 63) / 64;
+    ap_private_ops::add(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
+                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::minus Result, lhs(*this), rhs(RHS);
+    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::minus_w + 63) / 64;
+    ap_private_ops::sub(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
+                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::mult operator*(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::mult temp = *this;
+    temp *= RHS;
+    return temp;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    return typename RType<_AP_W2, _AP_S2>::div(
+        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    typename RType<_AP_W2, _AP_S2>::mod res =
+        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs)
lhs.srem(rhs) + : lhs.urem(rhs)); + return res; + } + +#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator<<(const TYPE op) const { \ + if (op >= _AP_W) return ap_private(0); \ + if (SIGNED && op < 0) return *this >> (0 - op); \ + return shl(op); \ + } + + OP_LEFT_SHIFT_CTYPE(int, true) + // OP_LEFT_SHIFT_CTYPE(bool, false) + OP_LEFT_SHIFT_CTYPE(signed char, true) + OP_LEFT_SHIFT_CTYPE(unsigned char, false) + OP_LEFT_SHIFT_CTYPE(short, true) + OP_LEFT_SHIFT_CTYPE(unsigned short, false) + OP_LEFT_SHIFT_CTYPE(unsigned int, false) + OP_LEFT_SHIFT_CTYPE(long, true) + OP_LEFT_SHIFT_CTYPE(unsigned long, false) + OP_LEFT_SHIFT_CTYPE(unsigned long long, false) + OP_LEFT_SHIFT_CTYPE(long long, true) +#if 0 + OP_LEFT_SHIFT_CTYPE(half, false) + OP_LEFT_SHIFT_CTYPE(float, false) + OP_LEFT_SHIFT_CTYPE(double, false) +#endif +#undef OP_LEFT_SHIFT_CTYPE + + template + INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this << sh; + } else { + int sh = op2.to_int(); + return *this << sh; + } + } + +#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator>>(const TYPE op) const { \ + if (op >= _AP_W) { \ + if (isNegative()) \ + return ap_private(-1); \ + else \ + return ap_private(0); \ + } \ + if ((SIGNED) && op < 0) return *this << (0 - op); \ + if (_AP_S) \ + return ashr(op); \ + else \ + return lshr(op); \ + } + + // OP_RIGHT_SHIFT_CTYPE(bool, false) + OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_RIGHT_SHIFT_CTYPE(signed char, true) + OP_RIGHT_SHIFT_CTYPE(unsigned char, false) + OP_RIGHT_SHIFT_CTYPE(short, true) + OP_RIGHT_SHIFT_CTYPE(unsigned short, false) + OP_RIGHT_SHIFT_CTYPE(int, true) + OP_RIGHT_SHIFT_CTYPE(unsigned int, false) + OP_RIGHT_SHIFT_CTYPE(long, true) + OP_RIGHT_SHIFT_CTYPE(unsigned long, false) + OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) + OP_RIGHT_SHIFT_CTYPE(long long, true) +#if 0 + OP_RIGHT_SHIFT_CTYPE(half, false) + OP_RIGHT_SHIFT_CTYPE(float, false) + OP_RIGHT_SHIFT_CTYPE(double, false) +#endif +#undef OP_RIGHT_SHIFT_CTYPE + + template + INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this >> sh; + } else { + int sh = op2.to_int(); + return *this >> sh; + } + } + + /// Shift assign + //------------------------------------------------------------------ + // TODO call clearUnusedBits ? +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + INLINE ap_private& operator Sym##=(unsigned int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + return *this; \ + } + OP_ASSIGN_AP(>>) + OP_ASSIGN_AP(<<) +#undef OP_ASSIGN_AP + + /// Comparisons + //----------------------------------------------------------------- + INLINE bool operator==(const ap_private& RHS) const { + // Get some facts about the number of bits used in the two operands. + uint32_t n1 = getActiveBits(); + uint32_t n2 = RHS.getActiveBits(); + + // If the number of bits isn't the same, they aren't equal + if (n1 != n2) return false; + + // If the number of bits fits in a word, we only need to compare the low + // word. 
+ if (n1 <= APINT_BITS_PER_WORD) return pVal[0] == RHS.get_pVal(0); + + // Otherwise, compare everything + for (int i = whichWord(n1 - 1); i >= 0; --i) + if (pVal[i] != RHS.get_pVal(i)) return false; + return true; + } + + template + INLINE bool operator==(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W, _AP_W2), + }; + ap_private<_AP_MAX_W, false> lhs(*this); + ap_private<_AP_MAX_W, false> rhs(op); + return lhs == rhs; + } + + INLINE bool operator==(uint64_t Val) const { + uint32_t n = getActiveBits(); + if (n <= APINT_BITS_PER_WORD) + return pVal[0] == Val; + else + return false; + } + + template + INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this == op); + } + + template + INLINE bool operator!=(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + INLINE bool operator!=(uint64_t Val) const { return !((*this) == Val); } + + template + INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this > op); + } + + INLINE bool operator<(const ap_private& op) const { + return _AP_S ? slt(op) : ult(op); + } + + template + INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + } + + template + INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this < op); + } + + INLINE bool operator>(const ap_private& op) const { + return _AP_S ? sgt(op) : ugt(op); + } + + template + INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? 
lhs.sgt(rhs) : lhs.ugt(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + } + + /// Bit and Part Select + //-------------------------------------------------------------- + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + const_cast*>(this), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + (const_cast*>(this)), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> bit( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(ap_private<_AP_W2, _AP_S2>& a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE 
ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, +// a2); +// } +// +// template +// INLINE 
ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, +// a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast&>( +// a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); +// } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> ret(*this); + return ret; + } + + template + INLINE void set(const ap_private<_AP_W3, false>& val) { + operator=(ap_private<_AP_W3, _AP_S>(val)); + } + + /// + /// @name Value Tests + /// + /// This tests the high bit of this ap_private to determine if it is set. + /// @returns true if this ap_private is negative, false otherwise + /// @brief Determine sign of this ap_private. + INLINE bool isNegative() const { + // just for get rid of warnings + enum { shift = (_AP_W - APINT_BITS_PER_WORD * (_AP_N - 1) - 1) }; + static const uint64_t mask = 1ULL << (shift); + return _AP_S && (pVal[_AP_N - 1] & mask); + } + + /// This tests the high bit of the ap_private to determine if it is unset. + /// @brief Determine if this ap_private Value is positive (not negative). + INLINE bool isPositive() const { return !isNegative(); } + + /// This tests if the value of this ap_private is strictly positive (> 0). + /// @returns true if this ap_private is Positive and not zero. + /// @brief Determine if this ap_private Value is strictly positive. + INLINE bool isStrictlyPositive() const { + return isPositive() && (*this) != 0; + } + + /// This checks to see if the value has all bits of the ap_private are set or + /// not. + /// @brief Determine if all bits are set + INLINE bool isAllOnesValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum unsigned + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest unsigned value. + INLINE bool isMaxValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum signed + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest signed value. 
+  INLINE bool isMaxSignedValue() const {
+    return !isNegative() && countPopulation() == _AP_W - 1;
+  }
+
+  /// This checks to see if the value of this ap_private is the minimum
+  /// unsigned value for the ap_private's bit width.
+  /// @brief Determine if this is the smallest unsigned value.
+  INLINE bool isMinValue() const { return countPopulation() == 0; }
+
+  /// This checks to see if the value of this ap_private is the minimum
+  /// signed value for the ap_private's bit width.
+  /// @brief Determine if this is the smallest signed value.
+  INLINE bool isMinSignedValue() const {
+    return isNegative() && countPopulation() == 1;
+  }
+
+  /// This function returns a pointer to the internal storage of the
+  /// ap_private. This is useful for writing out the ap_private in binary
+  /// form without any conversions.
+  INLINE const uint64_t* getRawData() const { return &pVal[0]; }
+
+  // Square Root - this method computes and returns the square root of
+  // "this". Three mechanisms are used for computation. For small values
+  // (<= 5 bits), a table lookup is done. This gets some performance for
+  // common cases. For values using less than 52 bits, the value is
+  // converted to double and then the libc sqrt function is called. The
+  // result is rounded and then converted back to a uint64_t which is then
+  // used to construct the result. Finally, the Babylonian method for
+  // computing square roots is used.
+  INLINE ap_private sqrt() const {
+    // Determine the magnitude of the value.
+    uint32_t magnitude = getActiveBits();
+
+    // Use a fast table for some small values. This also gets rid of some
+    // rounding errors in libc sqrt for small values.
+    if (magnitude <= 5) {
+      static const uint8_t results[32] = {
+          /*     0 */ 0,
+          /*  1- 2 */ 1, 1,
+          /*  3- 6 */ 2, 2, 2, 2,
+          /*  7-12 */ 3, 3, 3, 3, 3, 3,
+          /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
+          /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+          /*    31 */ 6};
+      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ results[get_VAL()]);
+    }
+
+    // If the magnitude of the value fits in less than 52 bits (the
+    // precision of an IEEE double precision floating point value), then we
+    // can use the libc sqrt function which will probably use a hardware
+    // sqrt computation. This should be faster than the algorithm below.
+    if (magnitude < 52) {
+#ifdef _MSC_VER
+      // Amazingly, VC++ doesn't have round(). Add 0.5 before truncating so
+      // the result is rounded to the nearest integer.
+      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
+                                      uint64_t(::sqrt(double(get_VAL())) +
+                                               0.5));
+#else
+      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
+                                      uint64_t(
+                                          ::round(::sqrt(double(get_VAL())))));
+#endif
+    }
+
+    // Okay, all the short cuts are exhausted. We must compute it. The
+    // following is a classical Babylonian method for computing the square
+    // root. This code was adapted to APInt from a Wikipedia article on
+    // such computations.
+    // See http://www.wikipedia.org/ and go to the page named
+    // Calculate_an_integer_square_root.
+    uint32_t nbits = BitWidth, i = 4;
+    ap_private<_AP_W, _AP_S> testy(16);
+    ap_private<_AP_W, _AP_S> x_old(/*BitWidth,*/ 1);
+    ap_private<_AP_W, _AP_S> x_new(0);
+    ap_private<_AP_W, _AP_S> two(/*BitWidth,*/ 2);
+
+    // Select a good starting value using binary logarithms.
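+    // (Illustrative sketch, added -- not part of the original header: the
+    //  same Newton/Babylonian iteration for a plain uint64_t looks like
+    //
+    //    uint64_t isqrt(uint64_t n) {
+    //      if (n < 2) return n;
+    //      uint64_t x = n, y = (x + 1) / 2;
+    //      while (y < x) { x = y; y = (x + n / x) / 2; }
+    //      return x; // floor(sqrt(n))
+    //    }
+    //
+    //  The loops below perform the same iteration with multi-word
+    //  ap_private arithmetic, after first picking a starting value near
+    //  2^(nbits/2).)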
+ for (;; i += 2, testy = testy.shl(2)) + if (i >= nbits || this->ule(testy)) { + x_old = x_old.shl(i / 2); + break; + } + + // Use the Babylonian method to arrive at the integer square root: + for (;;) { + x_new = (this->udiv(x_old) + x_old).udiv(two); + if (x_old.ule(x_new)) break; + x_old = x_new; + } + + // Make sure we return the closest approximation + // NOTE: The rounding calculation below is correct. It will produce an + // off-by-one discrepancy with results from pari/gp. That discrepancy has + // been + // determined to be a rounding issue with pari/gp as it begins to use a + // floating point representation after 192 bits. There are no discrepancies + // between this algorithm and pari/gp for bit widths < 192 bits. + ap_private<_AP_W, _AP_S> square(x_old * x_old); + ap_private<_AP_W, _AP_S> nextSquare((x_old + 1) * (x_old + 1)); + if (this->ult(square)) + return x_old; + else if (this->ule(nextSquare)) { + ap_private<_AP_W, _AP_S> midpoint((nextSquare - square).udiv(two)); + ap_private<_AP_W, _AP_S> offset(*this - square); + if (offset.ult(midpoint)) + return x_old; + else + return x_old + 1; + } else + assert(0 && "Error in ap_private<_AP_W, _AP_S>::sqrt computation"); + return x_old + 1; + } + + /// + /// @Assignment Operators + /// + /// @returns *this after assignment of RHS. + /// @brief Copy assignment operator. + INLINE ap_private& operator=(const ap_private& RHS) { + if (this != &RHS) memcpy(pVal, RHS.get_pVal(), _AP_N * APINT_WORD_SIZE); + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator=(const volatile ap_private& RHS) { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + return *this; + } + INLINE void operator=(const ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + INLINE void operator=(const volatile ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#if 0 + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + 
clearUnusedBits(); + return *this; + } +#endif + +/// from all c types. +#define ASSIGN_OP_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_private& operator=(const C_TYPE rhs) { \ + ap_private<(_AP_W2), (_AP_S2)> tmp = rhs; \ + operator=(tmp); \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, sizeof(short) * 8, true) + ASSIGN_OP_FROM_INT(unsigned short, sizeof(unsigned short) * 8, false) + ASSIGN_OP_FROM_INT(int, sizeof(int) * 8, true) + ASSIGN_OP_FROM_INT(unsigned int, sizeof(unsigned int) * 8, false) + ASSIGN_OP_FROM_INT(long, sizeof(long) * 8, true) + ASSIGN_OP_FROM_INT(unsigned long, sizeof(unsigned long) * 8, false) + ASSIGN_OP_FROM_INT(ap_slong, sizeof(ap_slong) * 8, true) + ASSIGN_OP_FROM_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef ASSIGN_OP_FROM_INT + + /// from c string. + // XXX this is a must, to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct initialization, as ctor is explicit. + operator=(tmp); + return *this; + } + + /// + /// @name Unary Operators + /// + /// @returns a new ap_private value representing *this incremented by one + /// @brief Postfix increment operator. + INLINE const ap_private operator++(int) { + ap_private API(*this); + ++(*this); + return API; + } + + /// @returns *this incremented by one + /// @brief Prefix increment operator. + INLINE ap_private& operator++() { + ap_private_ops::add_1(pVal, pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// @returns a new ap_private representing *this decremented by one. + /// @brief Postfix decrement operator. + INLINE const ap_private operator--(int) { + ap_private API(*this); + --(*this); + return API; + } + + /// @returns *this decremented by one. + /// @brief Prefix decrement operator. + INLINE ap_private& operator--() { + ap_private_ops::sub_1(pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// Performs a bitwise complement operation on this ap_private. + /// @returns an ap_private that is the bitwise complement of *this + /// @brief Unary bitwise complement operator. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// Negates *this using two's complement logic. + /// @returns An ap_private value representing the negation of *this. + /// @brief Unary negation operator + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// Performs logical negation operation on this ap_private. + /// @returns true if *this is zero, false otherwise. + /// @brief Logical negation operator. 
+  INLINE bool operator!() const {
+    for (int i = 0; i < _AP_N; ++i)
+      if (pVal[i]) return false;
+    return true;
+  }
+
+  template <bool _AP_S1>
+  INLINE ap_private<_AP_W, _AP_S || _AP_S1> And(
+      const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return this->operator&(RHS);
+  }
+  template <bool _AP_S1>
+  INLINE ap_private Or(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return this->operator|(RHS);
+  }
+  template <bool _AP_S1>
+  INLINE ap_private Xor(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return this->operator^(RHS);
+  }
+
+  INLINE ap_private Mul(const ap_private& RHS) const {
+    ap_private Result(*this);
+    Result *= RHS;
+    return Result;
+  }
+
+  INLINE ap_private Add(const ap_private& RHS) const {
+    ap_private Result(0);
+    ap_private_ops::add(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N,
+                        _AP_N, _AP_S, _AP_S);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  INLINE ap_private Sub(const ap_private& RHS) const {
+    ap_private Result(0);
+    ap_private_ops::sub(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N,
+                        _AP_N, _AP_S, _AP_S);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  /// Arithmetic right-shift this ap_private by shiftAmt.
+  /// @brief Arithmetic right-shift function.
+  INLINE ap_private ashr(uint32_t shiftAmt) const {
+    assert(shiftAmt <= BitWidth && "Invalid shift amount, too big");
+    // Handle a degenerate case
+    if (shiftAmt == 0) return ap_private(*this);
+
+    // If all the bits were shifted out, the result is, technically,
+    // undefined. We return -1 if it was negative, 0 otherwise. We check
+    // this early to avoid issues in the algorithm below.
+    if (shiftAmt == BitWidth) {
+      if (isNegative())
+        return ap_private(-1);
+      else
+        return ap_private(0);
+    }
+
+    // Create some space for the result.
+    ap_private Retval(0);
+    uint64_t* val = Retval.get_pVal();
+
+    // Compute some values needed by the following shift algorithms
+    uint32_t wordShift =
+        shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word
+    uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift
+    uint32_t breakWord = _AP_N - 1 - offset; // last word affected
+    uint32_t bitsInWord = whichBit(BitWidth); // how many bits in last word?
+    if (bitsInWord == 0) bitsInWord = APINT_BITS_PER_WORD;
+
+    // If we are shifting whole words, just move whole words
+    if (wordShift == 0) {
+      // Move the words containing significant bits
+      for (uint32_t i = 0; i <= breakWord; ++i)
+        val[i] = pVal[i + offset]; // move whole word
+
+      // Adjust the top significant word for sign bit fill, if negative
+      if (isNegative())
+        if (bitsInWord < APINT_BITS_PER_WORD)
+          val[breakWord] |= ~0ULL << (bitsInWord); // set high bits
+    } else {
+      // Shift the low order words
+      for (uint32_t i = 0; i < breakWord; ++i) {
+        // This combines the shifted corresponding word with the low bits
+        // from the next word (shifted into this word's high bits).
+        val[i] = ((pVal[i + offset]) >> (wordShift));
+        val[i] |= ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift));
+      }
+
+      // Shift the break word. In this case there are no bits from the next
+      // word to include in this word.
+      val[breakWord] = (pVal[breakWord + offset]) >> (wordShift);
+
+      // Deal with sign extension in the break word, and possibly the word
+      // before it.
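+      // (Worked example, added for illustration -- not in the original
+      //  source: for a 128-bit value shifted right by 68, offset = 1 word
+      //  and wordShift = 4, so val[0] takes bits 68..127 of the input; for
+      //  a negative input the branch below then ORs ones into the vacated
+      //  high bits so the result stays sign-extended.)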
+ if (isNegative()) { + if (wordShift > bitsInWord) { + if (breakWord > 0) + val[breakWord - 1] |= + ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord)); + val[breakWord] |= ~0ULL; + } else + val[breakWord] |= (~0ULL << (bitsInWord - wordShift)); + } + } + + // Remaining words are 0 or -1, just assign them. + uint64_t fillValue = (isNegative() ? ~0ULL : 0); + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = fillValue; + Retval.clearUnusedBits(); + return Retval; + } + + /// Logical right-shift this ap_private by shiftAmt. + /// @brief Logical right-shift function. + INLINE ap_private lshr(uint32_t shiftAmt) const { + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids + // issues with shifting byt he size of the integer type, which produces + // undefined results in the code below. This is also an optimization. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. + ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + + // If we are shifting less than a word, compute the shift with a simple + // carry + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = _AP_N - 1; i >= 0; --i) { + val[i] = ((pVal[i]) >> (shiftAmt)) | carry; + carry = (pVal[i]) << (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < _AP_N - offset; ++i) val[i] = pVal[i + offset]; + for (uint32_t i = _AP_N - offset; i < _AP_N; i++) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + // Shift the low order words + uint32_t breakWord = _AP_N - offset - 1; + for (uint32_t i = 0; i < breakWord; ++i) + val[i] = ((pVal[i + offset]) >> (wordShift)) | + ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); + // Shift the break word. + val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); + + // Remaining words are 0 + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + /// Left-shift this ap_private by shiftAmt. + /// @brief Left-shift function. + INLINE ap_private shl(uint32_t shiftAmt) const { + assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids a + // lshr by the words size in the loop below which can produce incorrect + // results. It also avoids the expensive computation below for a common + // case. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. 
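+    // (Added illustration: shl mirrors lshr word-by-word -- for a 128-bit
+    //  value and shiftAmt = 68, offset = 1 and wordShift = 4, so result
+    //  word 1 is built from source word 0 shifted up by 4 bits, and result
+    //  word 0 becomes zero.)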
+ ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + // If we are shifting less than a word, do it the easy way + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = 0; i < _AP_N; i++) { + val[i] = ((pVal[i]) << (shiftAmt)) | carry; + carry = (pVal[i]) >> (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < offset; i++) val[i] = 0; + for (int i = offset; i < _AP_N; i++) val[i] = pVal[i - offset]; + Retval.clearUnusedBits(); + return Retval; + } + + // Copy whole words from this to Result. + uint32_t i = _AP_N - 1; + for (; i > offset; --i) + val[i] = (pVal[i - offset]) << (wordShift) | + (pVal[i - offset - 1]) >> (APINT_BITS_PER_WORD - wordShift); + val[offset] = (pVal[0]) << (wordShift); + for (i = 0; i < offset; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + INLINE ap_private rotl(uint32_t rotateAmt) const { + if (rotateAmt == 0) return ap_private(*this); + // Don't get too fancy, just use existing shift/or facilities + ap_private hi(*this); + ap_private lo(*this); + hi.shl(rotateAmt); + lo.lshr(BitWidth - rotateAmt); + return hi | lo; + } + + INLINE ap_private rotr(uint32_t rotateAmt) const { + if (rotateAmt == 0) return ap_private(*this); + // Don't get too fancy, just use existing shift/or facilities + ap_private hi(*this); + ap_private lo(*this); + lo.lshr(rotateAmt); + hi.shl(BitWidth - rotateAmt); + return hi | lo; + } + + /// Perform an unsigned divide operation on this ap_private by RHS. Both this + /// and + /// RHS are treated as unsigned quantities for purposes of this division. + /// @returns a new ap_private value containing the division result + /// @brief Unsigned division operation. + INLINE ap_private udiv(const ap_private& RHS) const { + // Get some facts about the LHS and RHS number of bits and words + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Divided by zero???"); + uint32_t lhsBits = this->getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Deal with some degenerate cases + if (!lhsWords) + // 0 / X ===> 0 + return ap_private(0); + else if (lhsWords < rhsWords || this->ult(RHS)) { + // X / Y ===> 0, iff X < Y + return ap_private(0); + } else if (*this == RHS) { + // X / X ===> 1 + return ap_private(1); + } else if (lhsWords == 1 && rhsWords == 1) { + // All high words are zero, just use native divide + return ap_private(this->pVal[0] / RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Quotient(0); // to hold result. + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, &Quotient, + (ap_private*)0); + return Quotient; + } + + /// Signed divide this ap_private by ap_private RHS. + /// @brief Signed division function for ap_private. + INLINE ap_private sdiv(const ap_private& RHS) const { + if (isNegative()) + if (RHS.isNegative()) + return (-(*this)).udiv(-RHS); + else + return -((-(*this)).udiv(RHS)); + else if (RHS.isNegative()) + return -(this->udiv((ap_private)(-RHS))); + return this->udiv(RHS); + } + + /// Perform an unsigned remainder operation on this ap_private with RHS being + /// the + /// divisor. 
Both this and RHS are treated as unsigned quantities for purposes + /// of this operation. Note that this is a true remainder operation and not + /// a modulo operation because the sign follows the sign of the dividend + /// which is *this. + /// @returns a new ap_private value containing the remainder result + /// @brief Unsigned remainder operation. + INLINE ap_private urem(const ap_private& RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Get some facts about the RHS + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, (ap_private*)(0), + &Remainder); + return Remainder; + } + + INLINE ap_private urem(uint64_t RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + // Get some facts about the RHS + uint32_t rhsWords = 1; //! rhsBits ? 0 : (ap_private<_AP_W, + //! _AP_S>::whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + divide(*this, lhsWords, RHS, (ap_private*)(0), &Remainder); + return Remainder; + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(const ap_private& RHS) const { + if (isNegative()) { + ap_private lhs = -(*this); + if (RHS.isNegative()) { + ap_private rhs = -RHS; + return -(lhs.urem(rhs)); + } else + return -(lhs.urem(RHS)); + } else if (RHS.isNegative()) { + ap_private rhs = -RHS; + return this->urem(rhs); + } + return this->urem(RHS); + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(int64_t RHS) const { + if (isNegative()) + if (RHS < 0) + return -((-(*this)).urem(-RHS)); + else + return -((-(*this)).urem(RHS)); + else if (RHS < 0) + return this->urem(-RHS); + return this->urem(RHS); + } + + /// Compares this ap_private with RHS for the validity of the equality + /// relationship. + /// @returns true if *this == Val + /// @brief Equality comparison. + template + INLINE bool eq(const ap_private<_AP_W, _AP_S1>& RHS) const { + return (*this) == RHS; + } + + /// Compares this ap_private with RHS for the validity of the inequality + /// relationship. 
+ /// @returns true if *this != Val + /// @brief Inequality comparison + template + INLINE bool ne(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// the validity of the less-than relationship. + /// @returns true if *this < RHS when both are considered unsigned. + /// @brief Unsigned less than comparison + template + INLINE bool ult(const ap_private<_AP_W, _AP_S1>& RHS) const { + // Get active bit length of both operands + uint32_t n1 = getActiveBits(); + uint32_t n2 = RHS.getActiveBits(); + + // If magnitude of LHS is less than RHS, return true. + if (n1 < n2) return true; + + // If magnitude of RHS is greather than LHS, return false. + if (n2 < n1) return false; + + // If they bot fit in a word, just compare the low order word + if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) + return pVal[0] < RHS.get_pVal(0); + + // Otherwise, compare all words + uint32_t topWord = whichWord(AESL_std::max(n1, n2) - 1); + for (int i = topWord; i >= 0; --i) { + if (pVal[i] > RHS.get_pVal(i)) return false; + if (pVal[i] < RHS.get_pVal(i)) return true; + } + return false; + } + + INLINE bool ult(uint64_t RHS) const { + // Get active bit length of both operands + uint32_t n1 = getActiveBits(); + uint32_t n2 = + 64 - ap_private_ops::CountLeadingZeros_64(RHS); // RHS.getActiveBits(); + + // If magnitude of LHS is less than RHS, return true. + if (n1 < n2) return true; + + // If magnitude of RHS is greather than LHS, return false. + if (n2 < n1) return false; + + // If they bot fit in a word, just compare the low order word + if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) + return pVal[0] < RHS; + assert(0); + } + + template + INLINE bool slt(const ap_private<_AP_W, _AP_S1>& RHS) const { + ap_private lhs(*this); + ap_private<_AP_W, _AP_S1> rhs(RHS); + bool lhsNeg = isNegative(); + bool rhsNeg = rhs.isNegative(); + if (lhsNeg) { + // Sign bit is set so perform two's complement to make it positive + lhs.flip(); + lhs++; + } + if (rhsNeg) { + // Sign bit is set so perform two's complement to make it positive + rhs.flip(); + rhs++; + } + + // Now we have unsigned values to compare so do the comparison if necessary + // based on the negativeness of the values. + if (lhsNeg) + if (rhsNeg) + return lhs.ugt(rhs); + else + return true; + else if (rhsNeg) + return false; + else + return lhs.ult(rhs); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// validity of the less-or-equal relationship. + /// @returns true if *this <= RHS when both are considered unsigned. + /// @brief Unsigned less or equal comparison + template + INLINE bool ule(const ap_private<_AP_W, _AP_S1>& RHS) const { + return ult(RHS) || eq(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the less-or-equal relationship. + /// @returns true if *this <= RHS when both are considered signed. + /// @brief Signed less or equal comparison + template + INLINE bool sle(const ap_private<_AP_W, _AP_S1>& RHS) const { + return slt(RHS) || eq(RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// the validity of the greater-than relationship. + /// @returns true if *this > RHS when both are considered unsigned. 
+ /// @brief Unsigned greather than comparison + template + INLINE bool ugt(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !ult(RHS) && !eq(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// the validity of the greater-than relationship. + /// @returns true if *this > RHS when both are considered signed. + /// @brief Signed greather than comparison + template + INLINE bool sgt(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !slt(RHS) && !eq(RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// validity of the greater-or-equal relationship. + /// @returns true if *this >= RHS when both are considered unsigned. + /// @brief Unsigned greater or equal comparison + template + INLINE bool uge(const ap_private<_AP_W, _AP_S>& RHS) const { + return !ult(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the greater-or-equal relationship. + /// @returns true if *this >= RHS when both are considered signed. + /// @brief Signed greather or equal comparison + template + INLINE bool sge(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !slt(RHS); + } + + // Sign extend to a new width. + template + INLINE void cpSext(const ap_private<_AP_W1, _AP_S1>& that) { + assert(_AP_W1 < BitWidth && "Invalid ap_private SignExtend request"); + assert(_AP_W1 <= MAX_INT_BITS && "Too many bits"); + // If the sign bit isn't set, this is the same as zext. + if (!that.isNegative()) { + cpZext(that); + return; + } + + // The sign bit is set. First, get some facts + enum { wordBits = _AP_W1 % APINT_BITS_PER_WORD }; + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; + // Mask the high order word appropriately + if (_AP_N1 == _AP_N) { + enum { newWordBits = _AP_W % APINT_BITS_PER_WORD }; + // The extension is contained to the wordsBefore-1th word. + static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL; + for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); + pVal[_AP_N - 1] |= mask; + return; + } + + enum { newWordBits = _AP_W % APINT_BITS_PER_WORD }; + // The extension is contained to the wordsBefore-1th word. + static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL; + int i; + for (i = 0; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i); + pVal[i - 1] |= mask; + for (; i < _AP_N - 1; i++) pVal[i] = ~0ULL; + pVal[i] = ~0ULL; + clearUnusedBits(); + return; + } + + // Zero extend to a new width. + template + INLINE void cpZext(const ap_private<_AP_W1, _AP_S1>& that) { + assert(_AP_W1 < BitWidth && "Invalid ap_private ZeroExtend request"); + assert(_AP_W1 <= MAX_INT_BITS && "Too many bits"); + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; + int i = 0; + for (; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i); + for (; i < _AP_N; ++i) pVal[i] = 0; + clearUnusedBits(); + } + + template + INLINE void cpZextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) { + if (BitWidth > _AP_W1) + cpZext(that); + else { + for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); + clearUnusedBits(); + } + } + + template + INLINE void cpSextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) { + if (BitWidth > _AP_W1) + cpSext(that); + else { + for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); + clearUnusedBits(); + } + } + + /// @} + /// @name Value Characterization Functions + /// @{ + + /// @returns the total number of bits. 
+ INLINE uint32_t getBitWidth() const { return BitWidth; } + + /// Here one word's bitwidth equals to that of uint64_t. + /// @returns the number of words to hold the integer value of this ap_private. + /// @brief Get the number of words. + INLINE uint32_t getNumWords() const { + return (BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD; + } + + /// This function returns the number of active bits which is defined as the + /// bit width minus the number of leading zeros. This is used in several + /// computations to see how "wide" the value is. + /// @brief Compute the number of active bits in the value + INLINE uint32_t getActiveBits() const { + uint32_t bits = BitWidth - countLeadingZeros(); + return bits ? bits : 1; + } + + /// This method attempts to return the value of this ap_private as a zero + /// extended + /// uint64_t. The bitwidth must be <= 64 or the value must fit within a + /// uint64_t. Otherwise an assertion will result. + /// @brief Get zero extended value + INLINE uint64_t getZExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); + return *pVal; + } + + /// This method attempts to return the value of this ap_private as a sign + /// extended + /// int64_t. The bit width must be <= 64 or the value must fit within an + /// int64_t. Otherwise an assertion will result. + /// @brief Get sign extended value + INLINE int64_t getSExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for int64_t"); + return int64_t(pVal[0]); + } + + /// This method determines how many bits are required to hold the ap_private + /// equivalent of the string given by \p str of length \p slen. + /// @brief Get bits required for string value. + INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, + uint8_t radix) { + assert(str != 0 && "Invalid value string"); + assert(slen > 0 && "Invalid string length"); + + // Each computation below needs to know if its negative + uint32_t isNegative = str[0] == '-'; + if (isNegative) { + slen--; + str++; + } + // For radixes of power-of-two values, the bits required is accurately and + // easily computed + if (radix == 2) return slen + isNegative; + if (radix == 8) return slen * 3 + isNegative; + if (radix == 16) return slen * 4 + isNegative; + + // Otherwise it must be radix == 10, the hard case + assert(radix == 10 && "Invalid radix"); + + // Convert to the actual binary value. + // ap_private<_AP_W, _AP_S> tmp(sufficient, str, slen, radix); + + // Compute how many bits are required. + // return isNegative + tmp.logBase2() + 1; + return isNegative + slen * 4; + } + + /// countLeadingZeros - This function is an ap_private version of the + /// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number + /// of zeros from the most significant bit to the first one bit. + /// @returns BitWidth if the value is zero. + /// @returns the number of zeros from the most significant bit to the first + /// one bits. + INLINE uint32_t countLeadingZeros() const { + enum { + msw_bits = (BitWidth % APINT_BITS_PER_WORD) + ? 
(BitWidth % APINT_BITS_PER_WORD) + : APINT_BITS_PER_WORD, + excessBits = APINT_BITS_PER_WORD - msw_bits + }; + uint32_t Count = ap_private_ops::CountLeadingZeros_64(pVal[_AP_N - 1]); + if (Count >= excessBits) Count -= excessBits; + if (!pVal[_AP_N - 1]) { + for (int i = _AP_N - 1; i; --i) { + if (!pVal[i - 1]) + Count += APINT_BITS_PER_WORD; + else { + Count += ap_private_ops::CountLeadingZeros_64(pVal[i - 1]); + break; + } + } + } + return Count; + } + + /// countLeadingOnes - This function counts the number of contiguous 1 bits + /// in the high order bits. The count stops when the first 0 bit is reached. + /// @returns 0 if the high order bit is not set + /// @returns the number of 1 bits from the most significant to the least + /// @brief Count the number of leading one bits. + INLINE uint32_t countLeadingOnes() const { + if (isSingleWord()) + return countLeadingOnes_64(get_VAL(), APINT_BITS_PER_WORD - BitWidth); + + uint32_t highWordBits = BitWidth % APINT_BITS_PER_WORD; + uint32_t shift = + (highWordBits == 0 ? 0 : APINT_BITS_PER_WORD - highWordBits); + int i = _AP_N - 1; + uint32_t Count = countLeadingOnes_64(get_pVal(i), shift); + if (Count == highWordBits) { + for (i--; i >= 0; --i) { + if (get_pVal(i) == ~0ULL) + Count += APINT_BITS_PER_WORD; + else { + Count += countLeadingOnes_64(get_pVal(i), 0); + break; + } + } + } + return Count; + } + + /// countTrailingZeros - This function is an ap_private version of the + /// countTrailingZoers_{32,64} functions in MathExtras.h. It counts + /// the number of zeros from the least significant bit to the first set bit. + /// @returns BitWidth if the value is zero. + /// @returns the number of zeros from the least significant bit to the first + /// one bit. + /// @brief Count the number of trailing zero bits. + INLINE uint32_t countTrailingZeros() const { + uint32_t Count = 0; + uint32_t i = 0; + for (; i < _AP_N && get_pVal(i) == 0; ++i) Count += APINT_BITS_PER_WORD; + if (i < _AP_N) Count += ap_private_ops::CountTrailingZeros_64(get_pVal(i)); + return AESL_std::min(Count, BitWidth); + } + /// countPopulation - This function is an ap_private version of the + /// countPopulation_{32,64} functions in MathExtras.h. It counts the number + /// of 1 bits in the ap_private value. + /// @returns 0 if the value is zero. + /// @returns the number of set bits. + /// @brief Count the number of bits set. + INLINE uint32_t countPopulation() const { + uint32_t Count = 0; + for (int i = 0; i < _AP_N - 1; ++i) + Count += ap_private_ops::CountPopulation_64(pVal[i]); + Count += ap_private_ops::CountPopulation_64(pVal[_AP_N - 1] & mask); + return Count; + } + + /// @} + /// @name Conversion Functions + /// @ + + /// This is used internally to convert an ap_private to a string. + /// @brief Converts an ap_private to a std::string + INLINE std::string toString(uint8_t radix, bool wantSigned) const; + + /// Considers the ap_private to be unsigned and converts it into a string in + /// the + /// radix given. The radix can be 2, 8, 10 or 16. + /// @returns a character interpretation of the ap_private + /// @brief Convert unsigned ap_private to string representation. + INLINE std::string toStringUnsigned(uint8_t radix = 10) const { + return toString(radix, false); + } + + /// Considers the ap_private to be unsigned and converts it into a string in + /// the + /// radix given. The radix can be 2, 8, 10 or 16. + /// @returns a character interpretation of the ap_private + /// @brief Convert unsigned ap_private to string representation. 
+ INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + + /// @brief Converts this ap_private to a double value. + INLINE double roundToDouble(bool isSigned) const { + // Handle the simple case where the value is contained in one uint64_t. + if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) { + uint64_t val = pVal[0]; + if (isSigned) { + int64_t sext = ((int64_t(val)) << (64 - BitWidth)) >> (64 - BitWidth); + return double(sext); + } else + return double(val); + } + + // Determine if the value is negative. + bool isNeg = isSigned ? (*this)[BitWidth - 1] : false; + + // Construct the absolute value if we're negative. + ap_private<_AP_W, _AP_S> Tmp(isNeg ? -(*this) : (*this)); + + // Figure out how many bits we're using. + uint32_t n = Tmp.getActiveBits(); + + // The exponent (without bias normalization) is just the number of bits + // we are using. Note that the sign bit is gone since we constructed the + // absolute value. + uint64_t exp = n; + + // Return infinity for exponent overflow + if (exp > 1023) { + if (!isSigned || !isNeg) + return std::numeric_limits::infinity(); + else + return -std::numeric_limits::infinity(); + } + exp += 1023; // Increment for 1023 bias + + // Number of bits in mantissa is 52. To obtain the mantissa value, we must + // extract the high 52 bits from the correct words in pVal. + uint64_t mantissa; + unsigned hiWord = whichWord(n - 1); + if (hiWord == 0) { + mantissa = Tmp.get_pVal(0); + if (n > 52) + (mantissa) >>= (n - 52); // shift down, we want the top 52 bits. + } else { + assert(hiWord > 0 && "High word is negative?"); + uint64_t hibits = (Tmp.get_pVal(hiWord)) + << (52 - n % APINT_BITS_PER_WORD); + uint64_t lobits = + (Tmp.get_pVal(hiWord - 1)) >> (11 + n % APINT_BITS_PER_WORD); + mantissa = hibits | lobits; + } + + // The leading bit of mantissa is implicit, so get rid of it. + uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0; + union { + double __D; + uint64_t __I; + } __T; + __T.__I = sign | ((exp) << 52) | mantissa; + return __T.__D; + } + + /// @brief Converts this unsigned ap_private to a double value. + INLINE double roundToDouble() const { return roundToDouble(false); } + + /// @brief Converts this signed ap_private to a double value. + INLINE double signedRoundToDouble() const { return roundToDouble(true); } + + /// The conversion does not do a translation from integer to double, it just + /// re-interprets the bits as a double. Note that it is valid to do this on + /// any bit width. Exactly 64 bits will be translated. + /// @brief Converts ap_private bits to a double + INLINE double bitsToDouble() const { + union { + uint64_t __I; + double __D; + } __T; + __T.__I = pVal[0]; + return __T.__D; + } + + /// The conversion does not do a translation from integer to float, it just + /// re-interprets the bits as a float. Note that it is valid to do this on + /// any bit width. Exactly 32 bits will be translated. + /// @brief Converts ap_private bits to a double + INLINE float bitsToFloat() const { + union { + uint32_t __I; + float __F; + } __T; + __T.__I = uint32_t(pVal[0]); + return __T.__F; + } + + /// The conversion does not do a translation from double to integer, it just + /// re-interprets the bits of the double. Note that it is valid to do this on + /// any bit width but bits from V may get truncated. + /// @brief Converts a double to ap_private bits. 
+  INLINE ap_private& doubleToBits(double __V) {
+    union {
+      uint64_t __I;
+      double __D;
+    } __T;
+    __T.__D = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  /// The conversion does not do a translation from float to integer, it just
+  /// re-interprets the bits of the float. Note that it is valid to do this on
+  /// any bit width but bits from V may get truncated.
+  /// @brief Converts a float to ap_private bits.
+  INLINE ap_private& floatToBits(float __V) {
+    union {
+      uint32_t __I;
+      float __F;
+    } __T;
+    __T.__F = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return isMaxValue(); }
+
+  INLINE bool nand_reduce() const { return isMinValue(); }
+
+  INLINE bool or_reduce() const { return (bool)countPopulation(); }
+
+  INLINE bool nor_reduce() const { return countPopulation() == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+  INLINE std::string to_string(uint8_t radix = 16, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+}; // End of class ap_private <_AP_W, _AP_S, false>
+
+namespace ap_private_ops {
+
+enum { APINT_BITS_PER_WORD = 64 };
+template <int _AP_W, bool _AP_S>
+INLINE bool operator==(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 == V1;
+}
+
+template <int _AP_W, bool _AP_S>
+INLINE bool operator!=(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 != V1;
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE bool get(const ap_private<_AP_W, _AP_S>& a) {
+  static const uint64_t mask = 1ULL << (index & 0x3f);
+  return ((mask & a.get_pVal((index) >> 6)) != 0);
+}
+
+template <int msb_index, int lsb_index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private& mark1 = 0,
+                const ap_private& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask = ~0ULL >>
+                          (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+                          (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | mask);
+    a.get_pVal(msb_word) |= mask;
+  } else {
+    const uint64_t lsb_mask = ~0ULL >> (lsb) << (lsb);
+    const uint64_t msb_mask = ~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                              (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) | lsb_mask);
+    a.get_pVal(lsb_word) |= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      a.set_pVal(i, ~0ULL);
+      // a.get_pVal(i)=0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | msb_mask);
+
+    a.get_pVal(msb_word) |= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int msb_index, int lsb_index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private& mark1 = 0,
+                  const ap_private& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask =
+        ~(~0ULL >> (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+          (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & mask);
+    a.get_pVal(msb_word) &= mask;
+  } else {
+    const uint64_t lsb_mask = ~(~0ULL >> (lsb) << (lsb));
+    const uint64_t msb_mask = ~(~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                                (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) & lsb_mask);
+    a.get_pVal(lsb_word) &= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      // a.set_pVal(i, 0);
+      a.get_pVal(i) = 0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & msb_mask);
+    a.get_pVal(msb_word) &= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private& mark = 0) {
+  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
+  static const uint64_t mask = 1ULL << (index % APINT_BITS_PER_WORD);
+  // a.set_pVal(word, a.get_pVal(word) | mask);
+  a.get_pVal(word) |= mask;
+  a.clearUnusedBits();
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private& mark = 0) {
+  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
+  static const uint64_t mask = ~(1ULL << (index % APINT_BITS_PER_WORD));
+  // a.set_pVal(word, a.get_pVal(word) & mask);
+  a.get_pVal(word) &= mask;
+  a.clearUnusedBits();
+}
+
+} // End of ap_private_ops namespace
+
+template <int _AP_W, bool _AP_S>
+INLINE std::string ap_private<_AP_W, _AP_S, false>::toString(
+    uint8_t radix, bool wantSigned) const {
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+         "Radix should be 2, 8, 10, or 16!");
+  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
+                                 "8", "9", "A", "B", "C", "D", "E", "F"};
+  std::string result;
+
+  if (radix != 10) {
+    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+    // because the number of bits per digit (1, 3 and 4 respectively) divides
+    // equally. We just shift until the value is zero.
+
+    // First, check for a zero value and just short circuit the logic below.
+    if (*this == (uint64_t)(0))
+      result = "0";
+    else {
+      ap_private<_AP_W, false> tmp(*this);
+      size_t insert_at = 0;
+      bool leading_zero = true;
+      if (wantSigned && isNegative()) {
+        // They want to print the signed version and it is a negative value
+        // Flip the bits and add one to turn it into the equivalent positive
+        // value and put a '-' in the result.
+        tmp.flip();
+        tmp++;
+        tmp.clearUnusedBitsToZero();
+        result = "-";
+        insert_at = 1;
+        leading_zero = false;
+      }
+      switch (radix) {
+        case 2:
+          result += "0b";
+          break;
+        case 8:
+          result += "0o";
+          break;
+        case 16:
+          result += "0x";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+      insert_at += 2;
+      // Just shift tmp right for each digit width until it becomes zero
+      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
+      uint64_t mask = radix - 1;
+      ap_private<_AP_W, false> zero(0);
+      unsigned bits = 0;
+      while (tmp.ne(zero)) {
+        uint64_t digit = tmp.get_VAL() & mask;
+        result.insert(insert_at, digits[digit]);
+        tmp = tmp.lshr(shift);
+        ++bits;
+      }
+      bits *= shift;
+      if (bits < _AP_W && leading_zero) result.insert(insert_at, digits[0]);
+    }
+    return result;
+  }
+
+  ap_private<_AP_W, false> tmp(*this);
+  ap_private<_AP_W, false> divisor(radix);
+  ap_private<_AP_W, false> zero(0);
+  size_t insert_at = 0;
+  if (wantSigned && isNegative()) {
+    // They want to print the signed version and it is a negative value
+    // Flip the bits and add one to turn it into the equivalent positive
+    // value and put a '-' in the result.
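+    // Worked example (annotation added for clarity, not from the original
+    // header): for a 4-bit value 0b1011, i.e. -5 in two's complement,
+    // flipping gives 0b0100 and adding one gives 0b0101 = 5, so the digits
+    // of 5 are emitted after the leading '-'.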
+    tmp.flip();
+    tmp++;
+    tmp.clearUnusedBitsToZero();
+    result = "-";
+    insert_at = 1;
+  }
+  if (tmp == ap_private<_AP_W, false>(0))
+    result = "0";
+  else
+    while (tmp.ne(zero)) {
+      ap_private<_AP_W, false> APdigit(0);
+      ap_private<_AP_W, false> tmp2(0);
+      ap_private_ops::divide(tmp, tmp.getNumWords(), divisor,
+                             divisor.getNumWords(), &tmp2, &APdigit);
+      uint64_t digit = APdigit.getZExtValue();
+      assert(digit < radix && "divide failed");
+      result.insert(insert_at, digits[digit]);
+      tmp = tmp2;
+    }
+
+  return result;
+} // End of ap_private<_AP_W, _AP_S, false>::toString()
+
+template <int _AP_W, bool _AP_S>
+std::ostream &operator<<(std::ostream &os, const ap_private<_AP_W, _AP_S> &x) {
+  std::ios_base::fmtflags ff = os.flags();
+  if (ff & std::ios_base::hex) {
+    os << x.toString(16, false); // don't print sign
+  } else if (ff & std::ios_base::oct) {
+    os << x.toString(8, false); // don't print sign
+  } else {
+    os << x.toString(10, _AP_S);
+  }
+  return os;
+}
+
+// ------------------------------------------------------------ //
+//    XXX moved here from ap_int_sim.h XXX                      //
+// ------------------------------------------------------------ //
+
+/// Concatenation reference.
+/// Proxy class which allows concatenation to be used as rvalue (for reading)
+/// and lvalue (for writing)
+// ----------------------------------------------------------------
+// template
+// struct ap_concat_ref {
+//#ifdef _MSC_VER
+//#pragma warning(disable : 4521 4522)
+//#endif
+//  enum {
+//    _AP_WR = _AP_W1 + _AP_W2,
+//  };
+//  _AP_T1& mbv1;
+//  _AP_T2& mbv2;
+//
+//  INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>&
+//  ref)
+//      : mbv1(ref.mbv1), mbv2(ref.mbv2) {}
+//
+//  INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {}
+//
+//  template
+//  INLINE ap_concat_ref& operator=(const ap_private<_AP_W3, _AP_S3>& val) {
+//    ap_private<_AP_W1 + _AP_W2, false> vval(val);
+//    int W_ref1 = mbv1.length();
+//    int W_ref2 = mbv2.length();
+//    ap_private<_AP_W1, false> mask1(-1);
+//    mask1 >>= _AP_W1 - W_ref1;
+//    ap_private<_AP_W2, false> mask2(-1);
+//    mask2 >>= _AP_W2 - W_ref2;
+//    mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1));
+//    mbv2.set(ap_private<_AP_W2, false>(vval & mask2));
+//    return *this;
+//  }
+//
+//  INLINE ap_concat_ref& operator=(unsigned long long val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(
+//      const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  INLINE ap_concat_ref& operator=(
+//      const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(const _private_bit_ref<_AP_W3, _AP_S3>&
+//  val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(const _private_range_ref<_AP_W3, _AP_S3>&
+//  val) {
+//    ap_private<_AP_W1 + _AP_W2, false> tmpVal(val);
+//    return operator=(tmpVal);
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(
+//      const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val)
+//  {
+//    return operator=((const ap_private<_AP_W3, false>)(val));
+//  }
+//
+//  template
+//  INLINE ap_concat_ref& operator=(
+//      const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&
+//      val) {
+//    return
operator=(val.to_ap_private()); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { +// return operator=((unsigned long long)(bool)(val)); +// } +// +// INLINE operator ap_private<_AP_WR, false>() const { return get(); } +// +// INLINE operator unsigned long long() const { return get().to_uint64(); } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> > +// operator,(const _private_range_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> >( +// *this, const_cast<_private_range_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >(*this, a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(const ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> > +// operator,(const _private_bit_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> >( +// *this, const_cast<_private_bit_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > +// operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > +// operator,( +// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) +// { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast< +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> +// > +// operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3> +// &a2) { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast&>( +// a2)); +// } +// +// template +// INLINE ap_private operator&( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() & a2; +// } +// +// template +// INLINE ap_private operator|( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() | a2; +// } +// +// template +// INLINE ap_private operator^( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return ap_private(get() ^ a2); +// } +// +// INLINE const ap_private<_AP_WR, false> get() const { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= 
W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// INLINE const ap_private<_AP_WR, false> get() { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// template +// INLINE void set(const ap_private<_AP_W3, false>& val) { +// ap_private<_AP_W1 + _AP_W2, false> vval(val); +// int W_ref1 = mbv1.length(); +// int W_ref2 = mbv2.length(); +// ap_private<_AP_W1, false> mask1(-1); +// mask1 >>= _AP_W1 - W_ref1; +// ap_private<_AP_W2, false> mask2(-1); +// mask2 >>= _AP_W2 - W_ref2; +// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); +// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); +// } +// +// INLINE int length() const { return mbv1.length() + mbv2.length(); } +// +// INLINE std::string to_string(uint8_t radix = 2) const { +// return get().to_string(radix); +// } +//}; // struct ap_concat_ref. + +/// Range(slice) reference +/// Proxy class, which allows part selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//------------------------------------------------------------ +template +struct _private_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor. + INLINE _private_range_ref(const _private_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// direct ctor. + INLINE _private_range_ref(ap_private<_AP_W, _AP_S>* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { + _AP_WARNING(h < 0 || l < 0, + "Higher bound (%d) and lower bound (%d) cannot be " + "negative.", + h, l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound (%d) or lower bound (%d) out of range (%d).", h, l, + _AP_W); + } + + /// compound or assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv |= ref.d_bv; + return *this; + } + + /// compound or assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() |= _AP_ROOT_TYPE<>."); + this->d_bv |= ref.V; + return *this; + } + + /// compound and assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv &= ref.d_bv; + return *this; + }; + + /// compound and assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() &= _AP_ROOT_TYPE<>."); + this->d_bv &= ref.V; + return *this; + } + + /// compound xor assignment. 
+ template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() ^= " + "ap_private<>.range()."); + this->d_bv ^= ref.d_bv; + return *this; + }; + + /// compound xor assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() ^= _AP_ROOT_TYPE<>."); + this->d_bv ^= ref.V; + return *this; + } + + /// @name convertors. + // @{ + INLINE operator ap_private<_AP_W, false>() const { + ap_private<_AP_W, false> val(0); + if (h_index >= l_index) { + if (_AP_W > 64) { + val = d_bv; + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val >>= l_index; + val &= mask; + } else { + const static uint64_t mask = (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val = (d_bv >> l_index) & (mask >> (_AP_W - (h_index - l_index + 1))); + } + } else { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } + return val; + } + + INLINE operator unsigned long long() const { return to_uint64(); } + // @} + + template + INLINE _private_range_ref& operator=(const ap_private<_AP_W2, _AP_S2>& val) { + ap_private<_AP_W, false> vval = ap_private<_AP_W, false>(val); + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, false> mask(-1); + if (l_index > 0) { + mask <<= l_index; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 >>= _AP_W - h_index - 1; + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + unsigned shift = 64 - _AP_W; + uint64_t mask = ~0ULL >> (shift); + if (l_index > 0) { + vval = mask & vval << l_index; + mask = mask & mask << l_index; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = mask; + mask2 >>= (_AP_W - h_index - 1); + mask &= mask2; + vval &= mask2; + } + mask = ~mask; + d_bv &= mask; + d_bv |= vval; + } + } + return *this; + } // operator=(const ap_private<>&) + + INLINE _private_range_ref& operator=(unsigned long long val) { + const ap_private<_AP_W, _AP_S> vval = val; + return operator=(vval); + } + + template + INLINE _private_range_ref& operator=( + const _private_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((unsigned long long)(bool)val); + } + + template + INLINE _private_range_ref& operator=( + const _private_range_ref<_AP_W2, _AP_S2>& val) { + const ap_private<_AP_W, false> tmpVal(val); + return operator=(tmpVal); + } + +// template +// INLINE _private_range_ref& operator=( +// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { +// const ap_private<_AP_W, false> tmpVal(val); +// return operator=(tmpVal); +// } + + // TODO from ap_int_base, ap_bit_ref and ap_range_ref. 
+ + template + INLINE _private_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.to_ap_int_base().V); + } + + template + INLINE _private_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.operator ap_int_base<_AP_W2, false>().V); + } + + template + INLINE _private_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((unsigned long long)(bool)val); + } + +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, _AP_W, ap_private<_AP_W, _AP_S> > +// operator,(ap_private<_AP_W, _AP_S>& a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W, +// ap_private<_AP_W, _AP_S> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs == rhs; + } + + template + INLINE bool operator!=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs != rhs; + } + + template + INLINE bool operator>(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + 
ap_private<_AP_W2, false> rhs = op2.get(); + return lhs > rhs; + } + + template + INLINE bool operator>=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs >= rhs; + } + + template + INLINE bool operator<(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs < rhs; + } + + template + INLINE bool operator<=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs <= rhs; + } + + template + INLINE void set(const ap_private<_AP_W2, false>& val) { + ap_private<_AP_W, _AP_S> vval = val; + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, _AP_S> mask(-1); + if (l_index > 0) { + ap_private<_AP_W, false> mask1(-1); + mask1 >>= _AP_W - l_index; + mask1.flip(); + mask = mask1; + // vval&=mask1; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 <<= h_index + 1; + mask2.flip(); + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + uint64_t mask = ~0ULL >> (64 - _AP_W); + if (l_index > 0) { + uint64_t mask1 = mask; + mask1 = mask & (mask1 >> (_AP_W - l_index)); + vval = mask & (vval << l_index); + mask = ~mask1 & mask; + // vval&=mask1; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = ~0ULL >> (64 - _AP_W); + mask2 = mask & (mask2 << (h_index + 1)); + mask &= ~mask2; + vval &= ~mask2; + } + d_bv &= (~mask & (~0ULL >> (64 - _AP_W))); + d_bv |= vval; + } + } + } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + const static uint64_t mask = + (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val &= (mask >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE ap_private<_AP_W, false> get() { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + static const uint64_t mask = ~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0); + return val &= ((mask) >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + ap_private<_AP_W, false> val = get(); + return val.to_int(); + } + + INLINE unsigned int to_uint() const { + ap_private<_AP_W, false> val = get(); + return val.to_uint(); + } + + INLINE long to_long() const { + ap_private<_AP_W, false> val = get(); + return val.to_long(); + } + + INLINE unsigned long to_ulong() const { + ap_private<_AP_W, false> val = get(); + return val.to_ulong(); + } + + INLINE ap_slong to_int64() const { + ap_private<_AP_W, false> val = get(); + return val.to_int64(); + } + + INLINE ap_ulong to_uint64() const { + ap_private<_AP_W, false> val = get(); + return val.to_uint64(); + } + + INLINE std::string to_string(uint8_t radix = 2) const { + return get().to_string(radix); + } + + INLINE bool and_reduce() { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret &= d_bv[i]; + return ret; + } + + INLINE bool or_reduce() { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret |= d_bv[i]; + return ret; + } + + INLINE bool xor_reduce() { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret ^= d_bv[i]; + return ret; + } +}; // struct _private_range_ref. + +/// Bit reference +/// Proxy class, which allows bit selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//-------------------------------------------------------------- +template +struct _private_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int d_index; + + public: + // copy ctor. + INLINE _private_bit_ref(const _private_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + // director ctor. 
+ INLINE _private_bit_ref(ap_private<_AP_W, _AP_S>& bv, int index = 0) + : d_bv(bv), d_index(index) { + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.\n", + d_index); + _AP_WARNING(d_index >= _AP_W, + "Index of bit vector (%d) out of range (%d).\n", d_index, _AP_W); + } + + INLINE operator bool() const { return d_bv.get_bit(d_index); } + + INLINE bool to_bool() const { return operator bool(); } + + template + INLINE _private_bit_ref& operator=(const T& val) { + if (!!val) + d_bv.set(d_index); + else + d_bv.clear(d_index); + return *this; + } + +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref<_AP_W2, +// _AP_S2> > operator,( +// const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref> +// operator,( +// const _private_bit_ref &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref>( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<1, _private_bit_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, af_bit_ref<_AP_W2, +// _AP_I2, _AP_S2, +// _AP_Q2, _AP_O2, +// _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_bit_ref<_AP_W2, _AP_S2>& op) const { + return get() == op.get(); + } + + template + INLINE bool operator!=(const _private_bit_ref<_AP_W2, _AP_S2>& 
op) const { + return get() != op.get(); + } + + INLINE bool get() const { return operator bool(); } + + // template + // INLINE void set(const ap_private<_AP_W3, false>& val) { + // operator=(val); + // } + + // INLINE bool operator~() const { + // bool bit = (d_bv)[d_index]; + // return bit ? false : true; + // } + + INLINE int length() const { return 1; } + + // INLINE std::string to_string() const { + // bool val = get(); + // return val ? "1" : "0"; + // } + +}; // struct _private_bit_ref. + +// char a[100]; +// char* ptr = a; +// ap_int<2> n = 3; +// char* ptr2 = ptr + n*2; +// avoid ambiguous errors +#define OP_BIN_MIX_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_MIX_PTR(+) +OP_BIN_MIX_PTR(-) +#undef OP_BIN_MIX_PTR + +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_MIX_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define OPS_MIX_FLOAT(C_TYPE) \ + OP_BIN_MIX_FLOAT(*, C_TYPE) \ + OP_BIN_MIX_FLOAT(/, C_TYPE) \ + OP_BIN_MIX_FLOAT(+, C_TYPE) \ + OP_BIN_MIX_FLOAT(-, C_TYPE) + +OPS_MIX_FLOAT(float) +OPS_MIX_FLOAT(double) +#undef OP_BIN_MIX_FLOAT +#undef OPS_MIX_FLOAT + +/// Operators mixing Integers with AP_Int +// ---------------------------------------------------------------- + +// partially specialize template argument _AP_C in order that: +// for _AP_W > 64, we will explicitly convert operand with native data type +// into corresponding ap_private +// for _AP_W <= 64, we will implicitly convert operand with ap_private into +// (unsigned) long long +#define OP_BIN_MIX_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_WI, _AP_SI>::template RType<_AP_W, _AP_S>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_private<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_WI, _AP_SI>(i_op).operator BIN_OP(op); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(ap_private<_AP_WI, _AP_SI>(i_op)); \ + } + +#define OP_REL_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return op.operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP(op); \ + } + +#define OP_ASSIGN_MIX_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return op.operator ASSIGN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } + +#define OP_BIN_SHIFT_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ 
+ C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return i_op BIN_OP(op.get_VAL()); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(i_op); \ + } + +#define OP_ASSIGN_RSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator>>(op2); \ + return op; \ + } + +#define OP_ASSIGN_LSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator<<(op2); \ + return op; \ + } + +#define OPS_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_MIX_INT(*, C_TYPE, (_AP_W2), (_AP_S2), mult) \ + OP_BIN_MIX_INT(+, C_TYPE, (_AP_W2), (_AP_S2), plus) \ + OP_BIN_MIX_INT(-, C_TYPE, (_AP_W2), (_AP_S2), minus) \ + OP_BIN_MIX_INT(/, C_TYPE, (_AP_W2), (_AP_S2), div) \ + OP_BIN_MIX_INT(%, C_TYPE, (_AP_W2), (_AP_S2), mod) \ + OP_BIN_MIX_INT(&, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT(|, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT (^, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_SHIFT_INT(>>, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + OP_BIN_SHIFT_INT(<<, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + \ + OP_ASSIGN_MIX_INT(+=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(-=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(*=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(/=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(%=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(&=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(|=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(^=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_RSHIFT_INT(>>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_LSHIFT_INT(<<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + \ + OP_REL_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +OPS_MIX_INT(bool, 1, false) +OPS_MIX_INT(char, 8, CHAR_IS_SIGNED) +OPS_MIX_INT(signed char, 8, true) +OPS_MIX_INT(unsigned char, 8, false) +OPS_MIX_INT(short, sizeof(short) * 8, true) +OPS_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +OPS_MIX_INT(int, sizeof(int) * 8, true) +OPS_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +OPS_MIX_INT(long, sizeof(long) * 8, true) +OPS_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +OPS_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +OPS_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) + +#undef OP_BIN_MIX_INT +#undef OP_BIN_SHIFT_INT +#undef OP_ASSIGN_MIX_INT +#undef OP_ASSIGN_RSHIFT_INT +#undef OP_ASSIGN_LSHIFT_INT +#undef OP_REL_MIX_INT +#undef OPS_MIX_INT + +#define OP_BIN_MIX_RANGE(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const 
_private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<_AP_W2, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<_AP_W2, false>(op2)); \ + } \ + template \ + INLINE _private_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_range_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<_AP_W1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_private<_AP_W2, false>()); \ + } + +OP_BIN_MIX_RANGE(+, plus) +OP_BIN_MIX_RANGE(-, minus) +OP_BIN_MIX_RANGE(*, mult) +OP_BIN_MIX_RANGE(/, div) +OP_BIN_MIX_RANGE(%, mod) +OP_BIN_MIX_RANGE(&, logic) +OP_BIN_MIX_RANGE(|, logic) +OP_BIN_MIX_RANGE(^, logic) +OP_BIN_MIX_RANGE(>>, arg1) +OP_BIN_MIX_RANGE(<<, arg1) +#undef OP_BIN_MIX_RANGE + +OP_ASSIGN_MIX_RANGE(+=) +OP_ASSIGN_MIX_RANGE(-=) +OP_ASSIGN_MIX_RANGE(*=) +OP_ASSIGN_MIX_RANGE(/=) +OP_ASSIGN_MIX_RANGE(%=) +OP_ASSIGN_MIX_RANGE(&=) +OP_ASSIGN_MIX_RANGE(|=) +OP_ASSIGN_MIX_RANGE(^=) +OP_ASSIGN_MIX_RANGE(>>=) +OP_ASSIGN_MIX_RANGE(<<=) +#undef OP_ASSIGN_MIX_RANGE + +OP_REL_MIX_RANGE(>) +OP_REL_MIX_RANGE(<) +OP_REL_MIX_RANGE(>=) +OP_REL_MIX_RANGE(<=) +OP_REL_MIX_RANGE(==) +OP_REL_MIX_RANGE(!=) +#undef OP_REL_MIX_RANGE + +#define OP_BIN_MIX_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<1, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_BIT(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<1, false>(op2)); \ + } \ + template \ + INLINE _private_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(ap_private<1, false>(op2)); \ + } + +OP_ASSIGN_MIX_BIT(+=) +OP_ASSIGN_MIX_BIT(-=) +OP_ASSIGN_MIX_BIT(*=) +OP_ASSIGN_MIX_BIT(/=) +OP_ASSIGN_MIX_BIT(%=) +OP_ASSIGN_MIX_BIT(&=) 
+OP_ASSIGN_MIX_BIT(|=) +OP_ASSIGN_MIX_BIT(^=) +OP_ASSIGN_MIX_BIT(>>=) +OP_ASSIGN_MIX_BIT(<<=) +#undef OP_ASSIGN_MIX_BIT + +OP_BIN_MIX_BIT(+, plus) +OP_BIN_MIX_BIT(-, minus) +OP_BIN_MIX_BIT(*, mult) +OP_BIN_MIX_BIT(/, div) +OP_BIN_MIX_BIT(%, mod) +OP_BIN_MIX_BIT(&, logic) +OP_BIN_MIX_BIT(|, logic) +OP_BIN_MIX_BIT(^, logic) +OP_BIN_MIX_BIT(>>, arg1) +OP_BIN_MIX_BIT(<<, arg1) +#undef OP_BIN_MIX_BIT + +OP_REL_MIX_BIT(>) +OP_REL_MIX_BIT(<) +OP_REL_MIX_BIT(<=) +OP_REL_MIX_BIT(>=) +OP_REL_MIX_BIT(==) +OP_REL_MIX_BIT(!=) +#undef OP_REL_MIX_BIT + +#define REF_REL_OP_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . \ + operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP( \ + ap_private<_AP_W, false>(op)); \ + } \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (bool(op))REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP(bool(op)); \ + } + +#define REF_REL_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_REL_OP_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_REL_MIX_INT(bool, 1, false) +REF_REL_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_REL_MIX_INT(signed char, 8, true) +REF_REL_MIX_INT(unsigned char, 8, false) +REF_REL_MIX_INT(short, sizeof(short) * 8, true) +REF_REL_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_REL_MIX_INT(int, sizeof(int) * 8, true) +REF_REL_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_REL_MIX_INT(long, sizeof(long) * 8, true) +REF_REL_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_REL_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_REL_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_REL_OP_MIX_INT +#undef REF_REL_MIX_INT + +#define REF_BIN_OP_MIX_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . 
\ + operator BIN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W2, _AP_S2>::template RType<_AP_W, false>::RTYPE \ + operator BIN_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator BIN_OP( \ + ap_private<_AP_W, false>(op)); \ + } + +#define REF_BIN_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_MIX_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_MIX_INT(bool, 1, false) +REF_BIN_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_BIN_MIX_INT(signed char, 8, true) +REF_BIN_MIX_INT(unsigned char, 8, false) +REF_BIN_MIX_INT(short, sizeof(short) * 8, true) +REF_BIN_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_BIN_MIX_INT(int, sizeof(int) * 8, true) +REF_BIN_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_BIN_MIX_INT(long, sizeof(long) * 8, true) +REF_BIN_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_BIN_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_BIN_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_BIN_OP_MIX_INT +#undef REF_BIN_MIX_INT + +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& lhs, \ + const _private_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return ap_private<_AP_W, false>(lhs).operator BIN_OP( \ + ap_private<_AP_W2, false>(rhs)); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) +#undef REF_BIN_OP + +//************************************************************************ +// Implement +// ap_private = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). +// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. 
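+//
+// Illustrative example (annotation added for clarity, not from the original
+// header): concatenating two 40-bit operands yields an 80-bit unsigned
+// value, so (a, b) + (c, d) must be evaluated at 80 bits or more; funneling
+// the operands through a 64-bit long long would silently drop the top 16
+// bits of each side, which is exactly the information loss these operator
+// overloads avoid.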
+//************************************************************************
+
+#endif // ifndef __AP_PRIVATE_H__
+
+// -*- cpp -*-
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_math.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_math.h
new file mode 100644
index 00000000..f1299714
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_math.h
@@ -0,0 +1,27 @@
+#ifndef X_HLS_MATH_H
+#define X_HLS_MATH_H
+
+#include <cmath>
+#include "ap_fixed.h"
+
+namespace hls {
+
+template <class T>
+static T exp(const T x) {
+    return (T) std::exp(x.to_double());
+}
+
+template <class T> T sin(T x) { return (T) std::sin(x.to_double()); };
+
+template <class T> T cos(T x) { return (T) std::cos(x.to_double()); };
+
+template <class T> T asin(T x) { return (T) std::asin(x.to_double()); };
+
+template <class T> T acos(T x) { return (T) std::acos(x.to_double()); };
+
+template <class T> T atan(T x) { return (T) std::atan(x.to_double()); };
+
+template <class T> T atan2(T x, T y) { return (T) std::atan2(x.to_double(), y.to_double()); };
+
+}
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_stream.h
new file mode 100644
index 00000000..f516c39e
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/hls_stream.h
@@ -0,0 +1,263 @@
+/*
+#- (c) Copyright 2011-2018 Xilinx, Inc. All rights reserved.
+#-
+#- This file contains confidential and proprietary information
+#- of Xilinx, Inc. and is protected under U.S. and
+#- international copyright and other intellectual property
+#- laws.
+#-
+#- DISCLAIMER
+#- This disclaimer is not a license and does not grant any
+#- rights to the materials distributed herewith. Except as
+#- otherwise provided in a valid license issued to you by
+#- Xilinx, and to the maximum extent permitted by applicable
+#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+#- (2) Xilinx shall not be liable (whether in contract or tort,
+#- including negligence, or under any other theory of
+#- liability) for any loss or damage of any kind or nature
+#- related to, arising under or in connection with these
+#- materials, including for any direct, or any indirect,
+#- special, incidental, or consequential loss or damage
+#- (including loss of data, profits, goodwill, or any type of
+#- loss or damage suffered as a result of any action brought
+#- by a third party) even if such damage or loss was
+#- reasonably foreseeable or Xilinx had been advised of the
+#- possibility of the same.
+#-
+#- CRITICAL APPLICATIONS
+#- Xilinx products are not designed or intended to be fail-
+#- safe, or for use in any application requiring fail-safe
+#- performance, such as life-support or safety devices or
+#- systems, Class III medical devices, nuclear facilities,
+#- applications related to the deployment of airbags, or any
+#- other applications that could lead to death, personal
+#- injury, or severe property or environmental damage
+#- (individually and collectively, "Critical
+#- Applications").
Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef X_HLS_STREAM_SIM_H +#define X_HLS_STREAM_SIM_H + +/* + * This file contains a C++ model of hls::stream. + * It defines C simulation model. + */ +#ifndef __cplusplus + +#error C++ is required to include this header file + +#else + +////////////////////////////////////////////// +// C level simulation models for hls::stream +////////////////////////////////////////////// +#include +#include +#include +#include +#include + +#ifdef HLS_STREAM_THREAD_SAFE +#include +#include +#endif + +#ifndef _MSC_VER +#include +#include +#endif + +namespace hls { + +template +class stream +{ + protected: + std::string _name; + std::deque<__STREAM_T__> _data; // container for the elements +#ifdef HLS_STREAM_THREAD_SAFE + std::mutex _mutex; + std::condition_variable _condition_var; +#endif + + public: + /// Constructors + // Keep consistent with the synthesis model's constructors + stream() { + static unsigned _counter = 1; + std::stringstream ss; +#ifndef _MSC_VER + char* _demangle_name = abi::__cxa_demangle(typeid(*this).name(), 0, 0, 0); + if (_demangle_name) { + _name = _demangle_name; + free(_demangle_name); + } + else { + _name = "hls_stream"; + } +#else + _name = typeid(*this).name(); +#endif + + ss << _counter++; + _name += "." + ss.str(); + } + + stream(const std::string name) { + // default constructor, + // capacity set to predefined maximum + _name = name; + } + + /// Make copy constructor and assignment operator private + private: + stream(const stream< __STREAM_T__ >& chn): + _name(chn._name), _data(chn._data) { + } + + stream& operator = (const stream< __STREAM_T__ >& chn) { + _name = chn._name; + _data = chn._data; + return *this; + } + + public: + /// Overload >> and << operators to implement read() and write() + void operator >> (__STREAM_T__& rdata) { + read(rdata); + } + + void operator << (const __STREAM_T__& wdata) { + write(wdata); + } + + + public: + /// Destructor + /// Check status of the queue + virtual ~stream() { + if (!_data.empty()) + { + std::cout << "WARNING: Hls::stream '" + << _name + << "' contains leftover data," + << " which may result in RTL simulation hanging." 
+ << std::endl; + } + } + + /// Status of the queue + bool empty() { +#ifdef HLS_STREAM_THREAD_SAFE + std::lock_guard lg(_mutex); +#endif + return _data.empty(); + } + + bool full() const { return false; } + + /// Blocking read + void read(__STREAM_T__& head) { + head = read(); + } + +#ifdef HLS_STREAM_THREAD_SAFE + __STREAM_T__ read() { + std::unique_lock ul(_mutex); + while (_data.empty()) { + _condition_var.wait(ul); + } + + __STREAM_T__ elem; + elem = _data.front(); + _data.pop_front(); + return elem; + } +#else + __STREAM_T__ read() { + __STREAM_T__ elem; + if (_data.empty()) { + std::cout << "WARNING: Hls::stream '" + << _name + << "' is read while empty," + << " which may result in RTL simulation hanging." + << std::endl; + elem = __STREAM_T__(); + } else { + elem = _data.front(); + _data.pop_front(); + } + return elem; + } +#endif + + /// Blocking write + void write(const __STREAM_T__& tail) { +#ifdef HLS_STREAM_THREAD_SAFE + std::unique_lock ul(_mutex); +#endif + _data.push_back(tail); +#ifdef HLS_STREAM_THREAD_SAFE + _condition_var.notify_one(); +#endif + } + + /// Nonblocking read + bool read_nb(__STREAM_T__& head) { +#ifdef HLS_STREAM_THREAD_SAFE + std::lock_guard lg(_mutex); +#endif + bool is_empty = _data.empty(); + if (is_empty) { + head = __STREAM_T__(); + } else { + __STREAM_T__ elem(_data.front()); + _data.pop_front(); + head = elem; + } + return !is_empty; + } + + /// Nonblocking write + bool write_nb(const __STREAM_T__& tail) { + bool is_full = full(); + write(tail); + return !is_full; + } + + /// Fifo size + size_t size() { + return _data.size(); + } +}; + +} // namespace hls + +#endif // __cplusplus +#endif // X_HLS_STREAM_SIM_H + diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/utils/x_hls_utils.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/utils/x_hls_utils.h new file mode 100644 index 00000000..3e751c36 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/ap_types/utils/x_hls_utils.h @@ -0,0 +1,80 @@ +#ifndef X_HLS_UTILS_H +#define X_HLS_UTILS_H +#include "ap_fixed.h" +#include + +namespace hls { + + template + class numeric_limits { + public: + static T max() { return std::numeric_limits::max(); } + static T min() { return std::numeric_limits::min(); } + static T epsilon() { return std::numeric_limits::epsilon(); } + }; + + template + class numeric_limits > { + public: + static ap_fixed max() { + ap_int m = ::hls::numeric_limits >::max(); + ap_fixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_fixed min() { + ap_int m = ::hls::numeric_limits >::min(); + ap_fixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_fixed epsilon() { + ap_fixed x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_ufixed max() { + ap_uint m = ::hls::numeric_limits >::max(); + ap_ufixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_ufixed min() { return 0; } + static ap_ufixed epsilon() { + ap_ufixed x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_int max() { ap_int m = min(); return ~m; } + static ap_int min() { ap_int m = 0; m[W-1] = 1; return m; } + static ap_int epsilon() { + ap_int x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_uint max() { ap_uint zero = 0; return ~zero; } + static ap_uint min() { return 0; } + static ap_uint epsilon() { + ap_uint x = 0; + x[0] = 1; + 
return x; + } + }; +} + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/defines.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/defines.h new file mode 100644 index 00000000..cf8d98c3 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/defines.h @@ -0,0 +1,67 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include "ap_fixed.h" +#include "ap_int.h" +#include "nnet_utils/nnet_types.h" +#include +#include + +// hls-fpga-machine-learning insert numbers +#define N_INPUT_1_1 100 +#define N_INPUT_1_2 100 +#define N_LAYER_1_3 100 +#define N_LAYER_2_3 2 +#define N_LAYER_1_4 100 +#define N_LAYER_2_4 2 +#define N_INPUT_1_5 100 +#define N_INPUT_2_5 4 +#define OUT_CONCAT_0_6 100 +#define OUT_CONCAT_1_6 4 +#define OUT_CONCAT_0_7 100 +#define OUT_CONCAT_1_7 8 +#define N_OUTPUTS_22 100 +#define N_FILT_22 12 +#define N_LAYER_1_8 100 +#define N_LAYER_2_8 12 +#define N_OUTPUTS_23 100 +#define N_FILT_23 36 +#define N_LAYER_1_12 100 +#define N_LAYER_2_12 36 +#define N_OUTPUTS_24 100 +#define N_FILT_24 1 +#define N_INPUT_1_19 100 +#define N_INPUT_2_19 2 +#define N_INPUT_1_19 100 +#define N_INPUT_2_19 2 +#define N_FILT_21 2 + +// hls-fpga-machine-learning insert layer-precision +typedef ap_uint<4> input_t; +typedef ap_uint<4> input2_t; +typedef ap_fixed<32,16> layer3_t; +typedef ap_fixed<32,16> embedding0_embeddings_t; +typedef ap_fixed<32,16> layer4_t; +typedef ap_fixed<32,16> embedding1_embeddings_t; +typedef ap_fixed<32,16> input5_t; +typedef ap_fixed<32,16> layer6_t; +typedef ap_fixed<32,16> layer7_t; +typedef ap_fixed<32,16> model_default_t; +typedef ap_fixed<32,16> layer22_t; +typedef ap_fixed<32,16> dense_weight_t; +typedef ap_fixed<32,16> dense_bias_t; +typedef ap_fixed<32,16> layer11_t; +typedef ap_fixed<18,8> activation_table_t; +typedef ap_fixed<32,16> layer23_t; +typedef ap_fixed<32,16> dense_1_weight_t; +typedef ap_fixed<32,16> dense_1_bias_t; +typedef ap_fixed<32,16> layer15_t; +typedef ap_fixed<18,8> activation_1_table_t; +typedef ap_fixed<32,16> layer24_t; +typedef ap_fixed<32,16> met_weight_weight_t; +typedef ap_fixed<32,16> met_weight_bias_t; +typedef ap_fixed<32,16> input19_t; +typedef ap_fixed<32,16> layer20_t; +typedef ap_fixed<32,16> result_t; + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 00000000..8baadf28 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,777 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "ap_fixed.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ap_fixed<18, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// 
+// *************************************************
+// RELU Activation
+// *************************************************
+template <class data_T, class res_T, typename CONFIG_T> void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    #pragma HLS PIPELINE
+
+    data_T datareg;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        if (datareg > 0)
+            res[ii] = datareg;
+        else
+            res[ii] = 0;
+    }
+}
+
+template <class data_T, class res_T, int MAX_INT, typename CONFIG_T>
+void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    #pragma HLS PIPELINE
+
+    data_T datareg;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        if (datareg < 0)
+            res[ii] = 0;
+        else if (datareg > MAX_INT)
+            res[ii] = MAX_INT;
+        else
+            res[ii] = datareg;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    relu_max<data_T, res_T, 6, CONFIG_T>(data, res);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    relu_max<data_T, res_T, 1, CONFIG_T>(data, res);
+}
+
+// *************************************************
+// Sigmoid Activation
+// *************************************************
+inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); }
+
+template <typename CONFIG_T, int N_TABLE> void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    // Default logistic sigmoid function:
+    //   result = 1/(1+e^(-x))
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to X-value (signed 8-bit, range -8 to +8)
+        float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
+        // Next, compute lookup table function
+        typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val);
+        // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
+        table_out[ii] = real_val;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    // Initialize the lookup table
+#ifdef __HLS_SYN__
+    bool initialized = false;
+    typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size];
+#else
+    static bool initialized = false;
+    static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size];
+#endif
+    if (!initialized) {
+        init_sigmoid_table<CONFIG_T, CONFIG_T::table_size>(sigmoid_table);
+        initialized = true;
+    }
+
+    #pragma HLS PIPELINE
+
+    // Index into the lookup table based on data
+    int data_round;
+    int index;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round = data[ii] * CONFIG_T::table_size / 16;
+        index = data_round + 8 * CONFIG_T::table_size / 16;
+        if (index < 0)
+            index = 0;
+        if (index > CONFIG_T::table_size - 1)
+            index = CONFIG_T::table_size - 1;
+        res[ii] = (res_T)sigmoid_table[index];
+    }
+}
+
+// *************************************************
+// Softmax Activation
+// *************************************************
+
+enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
+
+inline float exp_fcn_float(float input) { return std::exp(input); }
+
+template <class data_T, typename CONFIG_T> inline float softmax_real_val_from_idx(unsigned i) {
+    // Treat the index as the top N bits
+    static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table
+    data_T x(0);
+    x(x.width - 1, x.width - N) = i;
+    return (float)x;
+}
+
+template <class data_T, typename CONFIG_T> inline unsigned softmax_idx_from_real_val(data_T x) {
+    // Slice the top N bits to get an index into the table
+    static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table
+    ap_uint<N> y = x(x.width - 1, x.width - N); // slice the top N bits of input
+    return (unsigned)y(N - 1, 0);
+}
+
+template <class data_T, typename CONFIG_T>
+void init_exp_table(typename CONFIG_T::exp_table_t
table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; + } +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::inv_table_t inv_x = 1 / x; + table_out[i] = inv_x; + } +} + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
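+    // (Illustrative aside: reduce<> builds a balanced binary adder tree, e.g.
+    //  for n_in = 4 it computes (e0 + e1) + (e2 + e3) in log2(4) = 2 adder
+    //  stages rather than a serial chain of 3 dependent additions.)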
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ap_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
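+    // (Illustrative aside: because x_max was subtracted above, every d_xi_xmax
+    //  is <= 0, so each exp_res term lies in (0, 1] and exp_sum is bounded by
+    //  n_in -- this is what keeps the invert_table lookup in a safe range.)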
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +64) + float in_val = 64.0 * ii / float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision + data_T data_cache[CONFIG_T::n_in]; + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_cache[ii] = data[ii]; + exp_res[ii] = 0; + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) + exp_diff_res = 1; + else { + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + exp_res[ii] += exp_diff_res; + } + } + + // Second loop to invert + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; + } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS UNROLL + res[i] = (res_T)0; + } + + data_T maximum = data[0]; + int idx = 0; + + for (int i = 1; i < CONFIG_T::n_in; i++) { + #pragma HLS PIPELINE + 
if (data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } + + res[idx] = (res_T)1; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS inline + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; + } +} + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS PIPELINE + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// 
************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > 
CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + #pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + #pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// 
PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 00000000..b72809ef --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,777 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "ap_fixed.h" +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_stream.h" +#include "nnet_types.h" +#include + +namespace nnet { + +// ************************************************* +// LINEAR Activation +// ************************************************* +template void linear(hls::stream &data, hls::stream &res) { +LinearActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + LinearPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(hls::stream &data, hls::stream &res) { +ReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* + +template void sigmoid(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t 
sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = sigmoid_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +template +void softmax_latency(hls::stream &data, hls::stream &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); +SoftmaxExpLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxExpPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(in_pack[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
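+        // (Illustrative aside: with this build's reuse_factor = 1 -- see the
+        //  rf1 tag in the output directory name -- multiplier_limit =
+        //  DIV_ROUNDUP(data_T::size, 1) = data_T::size and ii = 1, i.e. fully
+        //  parallel multipliers accepting a new input pack every cycle; a larger
+        //  reuse_factor trades multipliers for initiation interval.)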
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_stable(hls::stream &data, hls::stream &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + typename data_T::value_type data_array[data_T::size]; +#pragma HLS ARRAY_PARTITION variable=data_array complete +SoftmaxArrayLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxArrayPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + typename data_T::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ap_fixed d_xi_xmax[data_T::size]; + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + #pragma HLS ARRAY_PARTITION variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
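+        // (Illustrative aside: softmax_idx_from_real_val keys the tables on the
+        //  top ceillog2(table_size) bits of the operand, so with the default
+        //  table_size = 1024 each lookup resolves 10 address bits and nearby
+        //  inputs share an entry -- table_size is the accuracy knob here.)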
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + #pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_legacy(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[data_T::size]; + typename CONFIG_T::table_t exp_diff_res; + typename data_T::value_type data_cache[data_T::size]; + +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + #pragma HLS PIPELINE + data_T in_pack = data.read(); + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + for (int i = 0; i < data_T::size; i++) { + #pragma HLS UNROLL + SoftmaxExpInner: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + + exp_res[i] += exp_diff_res; + } + } + + res_T out_pack; + PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + + int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; + } + res.write(out_pack); + } +} + +template +void softmax_argmax(hls::stream &data, hls::stream &res) { + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + res_T out_data; + + for (int i = 0; i < res_T::size; i++) { + #pragma HLS UNROLL + out_data[i] = (typename res_T::value_type)0; + } + + typename data_T::value_type maximum = in_data[0]; + int idx = 0; + + for (int i = 1; i < res_T::size; i++) { + #pragma HLS PIPELINE + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = (typename res_T::value_type)1; + res.write(out_data); + } +} + +template void softmax(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::axis == -1); + + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case 
softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* + +template void tanh(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 8; + int index = data_round + 4 * CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = tanh_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* + +template +void hard_sigmoid(hls::stream &data, hls::stream &res) { + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res.write(out_data); + } +} + +template void hard_tanh(hls::stream &data, hls::stream &res) { + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* + +template +void leaky_relu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { +LeakyReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + LeakyReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* + +template +void thresholded_relu(hls::stream &data, typename data_T::value_type theta, hls::stream &res) { +ThresholdedReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + 
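+        // (Illustrative note: this implements Keras' ThresholdedReLU,
+        //  f(x) = x for x > theta and 0 otherwise -- unlike leaky_relu above,
+        //  values at or below the threshold are zeroed rather than scaled.)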
ThresholdedReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* + +template void softplus(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* + +template void softsign(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softsign_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(hls::stream &data, typename data_T::value_type alpha, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * 
elu_table[index]; + } + } + res.write(out_data); + } +} + +template void elu(hls::stream &data, hls::stream &res) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* + +template void selu(hls::stream &data, hls::stream &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = (typename data_T::value_type)1.0507009873554804934193349852946 * datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + res.write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* + +template +void prelu(hls::stream &data, typename data_T::value_type alpha[CONFIG_T::n_in], hls::stream &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(hls::stream &data, hls::stream &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; + } + res.write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(hls::stream &data, hls::stream &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; + } + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_array.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_array.h new file mode 
100644 index 00000000..d179102a --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_array.h @@ -0,0 +1,52 @@ +#ifndef NNET_ARRAY_H_ +#define NNET_ARRAY_H_ + +#include + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::height; i++) { + for (int j = 0; j < CONFIG_T::width; j++) { + data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + unsigned dims_t[3]; + dims_t[0] = dims[CONFIG_T::perm[0]]; + dims_t[1] = dims[CONFIG_T::perm[1]]; + dims_t[2] = dims[CONFIG_T::perm[2]]; + + int idx[3] = {0}, idx_t[3] = {0}; + for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { + for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { + for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { + idx_t[0] = idx[CONFIG_T::perm[0]]; + idx_t[1] = idx[CONFIG_T::perm[1]]; + idx_t[2] = idx[CONFIG_T::perm[2]]; + + data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = + data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm.h new file mode 100644 index 00000000..d8be45b7 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,124 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
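+    // (Illustrative note: at inference time batch normalization reduces to the
+    //  affine map y = s * x + b, where hls4ml precomputes s = gamma /
+    //  sqrt(var + eps) and b = beta - s * mean offline -- which is why this
+    //  layer only carries scale_t/bias_t arrays and no running statistics.)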
+ template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + data_T cache; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=scale,bias + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Calcuate result +Result: + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in], + data_T threshold[CONFIG_T::n_scale_bias]) { + #pragma HLS PIPELINE + #pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ap_uint<1> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS PIPELINE + #pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ap_int<2> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 00000000..a064677d --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,123 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +BatchNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormpack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold_hi complete + #pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_code_gen.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_code_gen.h new file mode 100644 index 00000000..5bffda3d --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_code_gen.h @@ -0,0 +1,1262 @@ +#ifndef NNET_INSTR_GEN_H_ +#define NNET_INSTR_GEN_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +// hls4ml insert code +template +class fill_buffer_22 : public FillConv1DBuffer { + public: + static void fill_buffer( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition + ) { + if (partition == 0) { + buffer[0][0] = data[0]; buffer[0][1] = data[1]; buffer[0][2] = data[2]; buffer[0][3] = data[3]; buffer[0][4] = data[4]; buffer[0][5] = data[5]; buffer[0][6] = data[6]; buffer[0][7] = data[7]; + + } + if (partition == 1) { + buffer[0][0] = data[8]; buffer[0][1] = data[9]; buffer[0][2] = data[10]; buffer[0][3] = data[11]; buffer[0][4] = data[12]; buffer[0][5] = data[13]; buffer[0][6] = data[14]; buffer[0][7] = data[15]; + + } + if (partition == 2) { + buffer[0][0] = data[16]; buffer[0][1] = data[17]; buffer[0][2] = data[18]; buffer[0][3] = data[19]; buffer[0][4] = data[20]; buffer[0][5] = data[21]; buffer[0][6] = data[22]; buffer[0][7] = data[23]; + + } + if (partition == 3) { + buffer[0][0] = data[24]; buffer[0][1] = data[25]; buffer[0][2] = data[26]; buffer[0][3] = data[27]; buffer[0][4] = data[28]; buffer[0][5] = data[29]; buffer[0][6] = data[30]; buffer[0][7] = data[31]; + + } + if (partition == 4) { + buffer[0][0] = data[32]; buffer[0][1] = data[33]; buffer[0][2] = data[34]; buffer[0][3] = data[35]; 
buffer[0][4] = data[36]; buffer[0][5] = data[37]; buffer[0][6] = data[38]; buffer[0][7] = data[39]; + + } + if (partition == 5) { + buffer[0][0] = data[40]; buffer[0][1] = data[41]; buffer[0][2] = data[42]; buffer[0][3] = data[43]; buffer[0][4] = data[44]; buffer[0][5] = data[45]; buffer[0][6] = data[46]; buffer[0][7] = data[47]; + + } + if (partition == 6) { + buffer[0][0] = data[48]; buffer[0][1] = data[49]; buffer[0][2] = data[50]; buffer[0][3] = data[51]; buffer[0][4] = data[52]; buffer[0][5] = data[53]; buffer[0][6] = data[54]; buffer[0][7] = data[55]; + + } + if (partition == 7) { + buffer[0][0] = data[56]; buffer[0][1] = data[57]; buffer[0][2] = data[58]; buffer[0][3] = data[59]; buffer[0][4] = data[60]; buffer[0][5] = data[61]; buffer[0][6] = data[62]; buffer[0][7] = data[63]; + + } + if (partition == 8) { + buffer[0][0] = data[64]; buffer[0][1] = data[65]; buffer[0][2] = data[66]; buffer[0][3] = data[67]; buffer[0][4] = data[68]; buffer[0][5] = data[69]; buffer[0][6] = data[70]; buffer[0][7] = data[71]; + + } + if (partition == 9) { + buffer[0][0] = data[72]; buffer[0][1] = data[73]; buffer[0][2] = data[74]; buffer[0][3] = data[75]; buffer[0][4] = data[76]; buffer[0][5] = data[77]; buffer[0][6] = data[78]; buffer[0][7] = data[79]; + + } + if (partition == 10) { + buffer[0][0] = data[80]; buffer[0][1] = data[81]; buffer[0][2] = data[82]; buffer[0][3] = data[83]; buffer[0][4] = data[84]; buffer[0][5] = data[85]; buffer[0][6] = data[86]; buffer[0][7] = data[87]; + + } + if (partition == 11) { + buffer[0][0] = data[88]; buffer[0][1] = data[89]; buffer[0][2] = data[90]; buffer[0][3] = data[91]; buffer[0][4] = data[92]; buffer[0][5] = data[93]; buffer[0][6] = data[94]; buffer[0][7] = data[95]; + + } + if (partition == 12) { + buffer[0][0] = data[96]; buffer[0][1] = data[97]; buffer[0][2] = data[98]; buffer[0][3] = data[99]; buffer[0][4] = data[100]; buffer[0][5] = data[101]; buffer[0][6] = data[102]; buffer[0][7] = data[103]; + + } + if (partition == 13) { + buffer[0][0] = data[104]; buffer[0][1] = data[105]; buffer[0][2] = data[106]; buffer[0][3] = data[107]; buffer[0][4] = data[108]; buffer[0][5] = data[109]; buffer[0][6] = data[110]; buffer[0][7] = data[111]; + + } + if (partition == 14) { + buffer[0][0] = data[112]; buffer[0][1] = data[113]; buffer[0][2] = data[114]; buffer[0][3] = data[115]; buffer[0][4] = data[116]; buffer[0][5] = data[117]; buffer[0][6] = data[118]; buffer[0][7] = data[119]; + + } + if (partition == 15) { + buffer[0][0] = data[120]; buffer[0][1] = data[121]; buffer[0][2] = data[122]; buffer[0][3] = data[123]; buffer[0][4] = data[124]; buffer[0][5] = data[125]; buffer[0][6] = data[126]; buffer[0][7] = data[127]; + + } + if (partition == 16) { + buffer[0][0] = data[128]; buffer[0][1] = data[129]; buffer[0][2] = data[130]; buffer[0][3] = data[131]; buffer[0][4] = data[132]; buffer[0][5] = data[133]; buffer[0][6] = data[134]; buffer[0][7] = data[135]; + + } + if (partition == 17) { + buffer[0][0] = data[136]; buffer[0][1] = data[137]; buffer[0][2] = data[138]; buffer[0][3] = data[139]; buffer[0][4] = data[140]; buffer[0][5] = data[141]; buffer[0][6] = data[142]; buffer[0][7] = data[143]; + + } + if (partition == 18) { + buffer[0][0] = data[144]; buffer[0][1] = data[145]; buffer[0][2] = data[146]; buffer[0][3] = data[147]; buffer[0][4] = data[148]; buffer[0][5] = data[149]; buffer[0][6] = data[150]; buffer[0][7] = data[151]; + + } + if (partition == 19) { + buffer[0][0] = data[152]; buffer[0][1] = data[153]; buffer[0][2] = data[154]; buffer[0][3] = data[155]; 
buffer[0][4] = data[156]; buffer[0][5] = data[157]; buffer[0][6] = data[158]; buffer[0][7] = data[159]; + + } + if (partition == 20) { + buffer[0][0] = data[160]; buffer[0][1] = data[161]; buffer[0][2] = data[162]; buffer[0][3] = data[163]; buffer[0][4] = data[164]; buffer[0][5] = data[165]; buffer[0][6] = data[166]; buffer[0][7] = data[167]; + + } + if (partition == 21) { + buffer[0][0] = data[168]; buffer[0][1] = data[169]; buffer[0][2] = data[170]; buffer[0][3] = data[171]; buffer[0][4] = data[172]; buffer[0][5] = data[173]; buffer[0][6] = data[174]; buffer[0][7] = data[175]; + + } + if (partition == 22) { + buffer[0][0] = data[176]; buffer[0][1] = data[177]; buffer[0][2] = data[178]; buffer[0][3] = data[179]; buffer[0][4] = data[180]; buffer[0][5] = data[181]; buffer[0][6] = data[182]; buffer[0][7] = data[183]; + + } + if (partition == 23) { + buffer[0][0] = data[184]; buffer[0][1] = data[185]; buffer[0][2] = data[186]; buffer[0][3] = data[187]; buffer[0][4] = data[188]; buffer[0][5] = data[189]; buffer[0][6] = data[190]; buffer[0][7] = data[191]; + + } + if (partition == 24) { + buffer[0][0] = data[192]; buffer[0][1] = data[193]; buffer[0][2] = data[194]; buffer[0][3] = data[195]; buffer[0][4] = data[196]; buffer[0][5] = data[197]; buffer[0][6] = data[198]; buffer[0][7] = data[199]; + + } + if (partition == 25) { + buffer[0][0] = data[200]; buffer[0][1] = data[201]; buffer[0][2] = data[202]; buffer[0][3] = data[203]; buffer[0][4] = data[204]; buffer[0][5] = data[205]; buffer[0][6] = data[206]; buffer[0][7] = data[207]; + + } + if (partition == 26) { + buffer[0][0] = data[208]; buffer[0][1] = data[209]; buffer[0][2] = data[210]; buffer[0][3] = data[211]; buffer[0][4] = data[212]; buffer[0][5] = data[213]; buffer[0][6] = data[214]; buffer[0][7] = data[215]; + + } + if (partition == 27) { + buffer[0][0] = data[216]; buffer[0][1] = data[217]; buffer[0][2] = data[218]; buffer[0][3] = data[219]; buffer[0][4] = data[220]; buffer[0][5] = data[221]; buffer[0][6] = data[222]; buffer[0][7] = data[223]; + + } + if (partition == 28) { + buffer[0][0] = data[224]; buffer[0][1] = data[225]; buffer[0][2] = data[226]; buffer[0][3] = data[227]; buffer[0][4] = data[228]; buffer[0][5] = data[229]; buffer[0][6] = data[230]; buffer[0][7] = data[231]; + + } + if (partition == 29) { + buffer[0][0] = data[232]; buffer[0][1] = data[233]; buffer[0][2] = data[234]; buffer[0][3] = data[235]; buffer[0][4] = data[236]; buffer[0][5] = data[237]; buffer[0][6] = data[238]; buffer[0][7] = data[239]; + + } + if (partition == 30) { + buffer[0][0] = data[240]; buffer[0][1] = data[241]; buffer[0][2] = data[242]; buffer[0][3] = data[243]; buffer[0][4] = data[244]; buffer[0][5] = data[245]; buffer[0][6] = data[246]; buffer[0][7] = data[247]; + + } + if (partition == 31) { + buffer[0][0] = data[248]; buffer[0][1] = data[249]; buffer[0][2] = data[250]; buffer[0][3] = data[251]; buffer[0][4] = data[252]; buffer[0][5] = data[253]; buffer[0][6] = data[254]; buffer[0][7] = data[255]; + + } + if (partition == 32) { + buffer[0][0] = data[256]; buffer[0][1] = data[257]; buffer[0][2] = data[258]; buffer[0][3] = data[259]; buffer[0][4] = data[260]; buffer[0][5] = data[261]; buffer[0][6] = data[262]; buffer[0][7] = data[263]; + + } + if (partition == 33) { + buffer[0][0] = data[264]; buffer[0][1] = data[265]; buffer[0][2] = data[266]; buffer[0][3] = data[267]; buffer[0][4] = data[268]; buffer[0][5] = data[269]; buffer[0][6] = data[270]; buffer[0][7] = data[271]; + + } + if (partition == 34) { + buffer[0][0] = data[272]; buffer[0][1] = 
data[273]; buffer[0][2] = data[274]; buffer[0][3] = data[275]; buffer[0][4] = data[276]; buffer[0][5] = data[277]; buffer[0][6] = data[278]; buffer[0][7] = data[279]; + + } + if (partition == 35) { + buffer[0][0] = data[280]; buffer[0][1] = data[281]; buffer[0][2] = data[282]; buffer[0][3] = data[283]; buffer[0][4] = data[284]; buffer[0][5] = data[285]; buffer[0][6] = data[286]; buffer[0][7] = data[287]; + + } + if (partition == 36) { + buffer[0][0] = data[288]; buffer[0][1] = data[289]; buffer[0][2] = data[290]; buffer[0][3] = data[291]; buffer[0][4] = data[292]; buffer[0][5] = data[293]; buffer[0][6] = data[294]; buffer[0][7] = data[295]; + + } + if (partition == 37) { + buffer[0][0] = data[296]; buffer[0][1] = data[297]; buffer[0][2] = data[298]; buffer[0][3] = data[299]; buffer[0][4] = data[300]; buffer[0][5] = data[301]; buffer[0][6] = data[302]; buffer[0][7] = data[303]; + + } + if (partition == 38) { + buffer[0][0] = data[304]; buffer[0][1] = data[305]; buffer[0][2] = data[306]; buffer[0][3] = data[307]; buffer[0][4] = data[308]; buffer[0][5] = data[309]; buffer[0][6] = data[310]; buffer[0][7] = data[311]; + + } + if (partition == 39) { + buffer[0][0] = data[312]; buffer[0][1] = data[313]; buffer[0][2] = data[314]; buffer[0][3] = data[315]; buffer[0][4] = data[316]; buffer[0][5] = data[317]; buffer[0][6] = data[318]; buffer[0][7] = data[319]; + + } + if (partition == 40) { + buffer[0][0] = data[320]; buffer[0][1] = data[321]; buffer[0][2] = data[322]; buffer[0][3] = data[323]; buffer[0][4] = data[324]; buffer[0][5] = data[325]; buffer[0][6] = data[326]; buffer[0][7] = data[327]; + + } + if (partition == 41) { + buffer[0][0] = data[328]; buffer[0][1] = data[329]; buffer[0][2] = data[330]; buffer[0][3] = data[331]; buffer[0][4] = data[332]; buffer[0][5] = data[333]; buffer[0][6] = data[334]; buffer[0][7] = data[335]; + + } + if (partition == 42) { + buffer[0][0] = data[336]; buffer[0][1] = data[337]; buffer[0][2] = data[338]; buffer[0][3] = data[339]; buffer[0][4] = data[340]; buffer[0][5] = data[341]; buffer[0][6] = data[342]; buffer[0][7] = data[343]; + + } + if (partition == 43) { + buffer[0][0] = data[344]; buffer[0][1] = data[345]; buffer[0][2] = data[346]; buffer[0][3] = data[347]; buffer[0][4] = data[348]; buffer[0][5] = data[349]; buffer[0][6] = data[350]; buffer[0][7] = data[351]; + + } + if (partition == 44) { + buffer[0][0] = data[352]; buffer[0][1] = data[353]; buffer[0][2] = data[354]; buffer[0][3] = data[355]; buffer[0][4] = data[356]; buffer[0][5] = data[357]; buffer[0][6] = data[358]; buffer[0][7] = data[359]; + + } + if (partition == 45) { + buffer[0][0] = data[360]; buffer[0][1] = data[361]; buffer[0][2] = data[362]; buffer[0][3] = data[363]; buffer[0][4] = data[364]; buffer[0][5] = data[365]; buffer[0][6] = data[366]; buffer[0][7] = data[367]; + + } + if (partition == 46) { + buffer[0][0] = data[368]; buffer[0][1] = data[369]; buffer[0][2] = data[370]; buffer[0][3] = data[371]; buffer[0][4] = data[372]; buffer[0][5] = data[373]; buffer[0][6] = data[374]; buffer[0][7] = data[375]; + + } + if (partition == 47) { + buffer[0][0] = data[376]; buffer[0][1] = data[377]; buffer[0][2] = data[378]; buffer[0][3] = data[379]; buffer[0][4] = data[380]; buffer[0][5] = data[381]; buffer[0][6] = data[382]; buffer[0][7] = data[383]; + + } + if (partition == 48) { + buffer[0][0] = data[384]; buffer[0][1] = data[385]; buffer[0][2] = data[386]; buffer[0][3] = data[387]; buffer[0][4] = data[388]; buffer[0][5] = data[389]; buffer[0][6] = data[390]; buffer[0][7] = data[391]; + + } + if 
(partition == 49) { + buffer[0][0] = data[392]; buffer[0][1] = data[393]; buffer[0][2] = data[394]; buffer[0][3] = data[395]; buffer[0][4] = data[396]; buffer[0][5] = data[397]; buffer[0][6] = data[398]; buffer[0][7] = data[399]; + + } + if (partition == 50) { + buffer[0][0] = data[400]; buffer[0][1] = data[401]; buffer[0][2] = data[402]; buffer[0][3] = data[403]; buffer[0][4] = data[404]; buffer[0][5] = data[405]; buffer[0][6] = data[406]; buffer[0][7] = data[407]; + + } + if (partition == 51) { + buffer[0][0] = data[408]; buffer[0][1] = data[409]; buffer[0][2] = data[410]; buffer[0][3] = data[411]; buffer[0][4] = data[412]; buffer[0][5] = data[413]; buffer[0][6] = data[414]; buffer[0][7] = data[415]; + + } + if (partition == 52) { + buffer[0][0] = data[416]; buffer[0][1] = data[417]; buffer[0][2] = data[418]; buffer[0][3] = data[419]; buffer[0][4] = data[420]; buffer[0][5] = data[421]; buffer[0][6] = data[422]; buffer[0][7] = data[423]; + + } + if (partition == 53) { + buffer[0][0] = data[424]; buffer[0][1] = data[425]; buffer[0][2] = data[426]; buffer[0][3] = data[427]; buffer[0][4] = data[428]; buffer[0][5] = data[429]; buffer[0][6] = data[430]; buffer[0][7] = data[431]; + + } + if (partition == 54) { + buffer[0][0] = data[432]; buffer[0][1] = data[433]; buffer[0][2] = data[434]; buffer[0][3] = data[435]; buffer[0][4] = data[436]; buffer[0][5] = data[437]; buffer[0][6] = data[438]; buffer[0][7] = data[439]; + + } + if (partition == 55) { + buffer[0][0] = data[440]; buffer[0][1] = data[441]; buffer[0][2] = data[442]; buffer[0][3] = data[443]; buffer[0][4] = data[444]; buffer[0][5] = data[445]; buffer[0][6] = data[446]; buffer[0][7] = data[447]; + + } + if (partition == 56) { + buffer[0][0] = data[448]; buffer[0][1] = data[449]; buffer[0][2] = data[450]; buffer[0][3] = data[451]; buffer[0][4] = data[452]; buffer[0][5] = data[453]; buffer[0][6] = data[454]; buffer[0][7] = data[455]; + + } + if (partition == 57) { + buffer[0][0] = data[456]; buffer[0][1] = data[457]; buffer[0][2] = data[458]; buffer[0][3] = data[459]; buffer[0][4] = data[460]; buffer[0][5] = data[461]; buffer[0][6] = data[462]; buffer[0][7] = data[463]; + + } + if (partition == 58) { + buffer[0][0] = data[464]; buffer[0][1] = data[465]; buffer[0][2] = data[466]; buffer[0][3] = data[467]; buffer[0][4] = data[468]; buffer[0][5] = data[469]; buffer[0][6] = data[470]; buffer[0][7] = data[471]; + + } + if (partition == 59) { + buffer[0][0] = data[472]; buffer[0][1] = data[473]; buffer[0][2] = data[474]; buffer[0][3] = data[475]; buffer[0][4] = data[476]; buffer[0][5] = data[477]; buffer[0][6] = data[478]; buffer[0][7] = data[479]; + + } + if (partition == 60) { + buffer[0][0] = data[480]; buffer[0][1] = data[481]; buffer[0][2] = data[482]; buffer[0][3] = data[483]; buffer[0][4] = data[484]; buffer[0][5] = data[485]; buffer[0][6] = data[486]; buffer[0][7] = data[487]; + + } + if (partition == 61) { + buffer[0][0] = data[488]; buffer[0][1] = data[489]; buffer[0][2] = data[490]; buffer[0][3] = data[491]; buffer[0][4] = data[492]; buffer[0][5] = data[493]; buffer[0][6] = data[494]; buffer[0][7] = data[495]; + + } + if (partition == 62) { + buffer[0][0] = data[496]; buffer[0][1] = data[497]; buffer[0][2] = data[498]; buffer[0][3] = data[499]; buffer[0][4] = data[500]; buffer[0][5] = data[501]; buffer[0][6] = data[502]; buffer[0][7] = data[503]; + + } + if (partition == 63) { + buffer[0][0] = data[504]; buffer[0][1] = data[505]; buffer[0][2] = data[506]; buffer[0][3] = data[507]; buffer[0][4] = data[508]; buffer[0][5] = data[509]; 
buffer[0][6] = data[510]; buffer[0][7] = data[511]; + + } + if (partition == 64) { + buffer[0][0] = data[512]; buffer[0][1] = data[513]; buffer[0][2] = data[514]; buffer[0][3] = data[515]; buffer[0][4] = data[516]; buffer[0][5] = data[517]; buffer[0][6] = data[518]; buffer[0][7] = data[519]; + + } + if (partition == 65) { + buffer[0][0] = data[520]; buffer[0][1] = data[521]; buffer[0][2] = data[522]; buffer[0][3] = data[523]; buffer[0][4] = data[524]; buffer[0][5] = data[525]; buffer[0][6] = data[526]; buffer[0][7] = data[527]; + + } + if (partition == 66) { + buffer[0][0] = data[528]; buffer[0][1] = data[529]; buffer[0][2] = data[530]; buffer[0][3] = data[531]; buffer[0][4] = data[532]; buffer[0][5] = data[533]; buffer[0][6] = data[534]; buffer[0][7] = data[535]; + + } + if (partition == 67) { + buffer[0][0] = data[536]; buffer[0][1] = data[537]; buffer[0][2] = data[538]; buffer[0][3] = data[539]; buffer[0][4] = data[540]; buffer[0][5] = data[541]; buffer[0][6] = data[542]; buffer[0][7] = data[543]; + + } + if (partition == 68) { + buffer[0][0] = data[544]; buffer[0][1] = data[545]; buffer[0][2] = data[546]; buffer[0][3] = data[547]; buffer[0][4] = data[548]; buffer[0][5] = data[549]; buffer[0][6] = data[550]; buffer[0][7] = data[551]; + + } + if (partition == 69) { + buffer[0][0] = data[552]; buffer[0][1] = data[553]; buffer[0][2] = data[554]; buffer[0][3] = data[555]; buffer[0][4] = data[556]; buffer[0][5] = data[557]; buffer[0][6] = data[558]; buffer[0][7] = data[559]; + + } + if (partition == 70) { + buffer[0][0] = data[560]; buffer[0][1] = data[561]; buffer[0][2] = data[562]; buffer[0][3] = data[563]; buffer[0][4] = data[564]; buffer[0][5] = data[565]; buffer[0][6] = data[566]; buffer[0][7] = data[567]; + + } + if (partition == 71) { + buffer[0][0] = data[568]; buffer[0][1] = data[569]; buffer[0][2] = data[570]; buffer[0][3] = data[571]; buffer[0][4] = data[572]; buffer[0][5] = data[573]; buffer[0][6] = data[574]; buffer[0][7] = data[575]; + + } + if (partition == 72) { + buffer[0][0] = data[576]; buffer[0][1] = data[577]; buffer[0][2] = data[578]; buffer[0][3] = data[579]; buffer[0][4] = data[580]; buffer[0][5] = data[581]; buffer[0][6] = data[582]; buffer[0][7] = data[583]; + + } + if (partition == 73) { + buffer[0][0] = data[584]; buffer[0][1] = data[585]; buffer[0][2] = data[586]; buffer[0][3] = data[587]; buffer[0][4] = data[588]; buffer[0][5] = data[589]; buffer[0][6] = data[590]; buffer[0][7] = data[591]; + + } + if (partition == 74) { + buffer[0][0] = data[592]; buffer[0][1] = data[593]; buffer[0][2] = data[594]; buffer[0][3] = data[595]; buffer[0][4] = data[596]; buffer[0][5] = data[597]; buffer[0][6] = data[598]; buffer[0][7] = data[599]; + + } + if (partition == 75) { + buffer[0][0] = data[600]; buffer[0][1] = data[601]; buffer[0][2] = data[602]; buffer[0][3] = data[603]; buffer[0][4] = data[604]; buffer[0][5] = data[605]; buffer[0][6] = data[606]; buffer[0][7] = data[607]; + + } + if (partition == 76) { + buffer[0][0] = data[608]; buffer[0][1] = data[609]; buffer[0][2] = data[610]; buffer[0][3] = data[611]; buffer[0][4] = data[612]; buffer[0][5] = data[613]; buffer[0][6] = data[614]; buffer[0][7] = data[615]; + + } + if (partition == 77) { + buffer[0][0] = data[616]; buffer[0][1] = data[617]; buffer[0][2] = data[618]; buffer[0][3] = data[619]; buffer[0][4] = data[620]; buffer[0][5] = data[621]; buffer[0][6] = data[622]; buffer[0][7] = data[623]; + + } + if (partition == 78) { + buffer[0][0] = data[624]; buffer[0][1] = data[625]; buffer[0][2] = data[626]; buffer[0][3] = 
data[627]; buffer[0][4] = data[628]; buffer[0][5] = data[629]; buffer[0][6] = data[630]; buffer[0][7] = data[631]; + + } + if (partition == 79) { + buffer[0][0] = data[632]; buffer[0][1] = data[633]; buffer[0][2] = data[634]; buffer[0][3] = data[635]; buffer[0][4] = data[636]; buffer[0][5] = data[637]; buffer[0][6] = data[638]; buffer[0][7] = data[639]; + + } + if (partition == 80) { + buffer[0][0] = data[640]; buffer[0][1] = data[641]; buffer[0][2] = data[642]; buffer[0][3] = data[643]; buffer[0][4] = data[644]; buffer[0][5] = data[645]; buffer[0][6] = data[646]; buffer[0][7] = data[647]; + + } + if (partition == 81) { + buffer[0][0] = data[648]; buffer[0][1] = data[649]; buffer[0][2] = data[650]; buffer[0][3] = data[651]; buffer[0][4] = data[652]; buffer[0][5] = data[653]; buffer[0][6] = data[654]; buffer[0][7] = data[655]; + + } + if (partition == 82) { + buffer[0][0] = data[656]; buffer[0][1] = data[657]; buffer[0][2] = data[658]; buffer[0][3] = data[659]; buffer[0][4] = data[660]; buffer[0][5] = data[661]; buffer[0][6] = data[662]; buffer[0][7] = data[663]; + + } + if (partition == 83) { + buffer[0][0] = data[664]; buffer[0][1] = data[665]; buffer[0][2] = data[666]; buffer[0][3] = data[667]; buffer[0][4] = data[668]; buffer[0][5] = data[669]; buffer[0][6] = data[670]; buffer[0][7] = data[671]; + + } + if (partition == 84) { + buffer[0][0] = data[672]; buffer[0][1] = data[673]; buffer[0][2] = data[674]; buffer[0][3] = data[675]; buffer[0][4] = data[676]; buffer[0][5] = data[677]; buffer[0][6] = data[678]; buffer[0][7] = data[679]; + + } + if (partition == 85) { + buffer[0][0] = data[680]; buffer[0][1] = data[681]; buffer[0][2] = data[682]; buffer[0][3] = data[683]; buffer[0][4] = data[684]; buffer[0][5] = data[685]; buffer[0][6] = data[686]; buffer[0][7] = data[687]; + + } + if (partition == 86) { + buffer[0][0] = data[688]; buffer[0][1] = data[689]; buffer[0][2] = data[690]; buffer[0][3] = data[691]; buffer[0][4] = data[692]; buffer[0][5] = data[693]; buffer[0][6] = data[694]; buffer[0][7] = data[695]; + + } + if (partition == 87) { + buffer[0][0] = data[696]; buffer[0][1] = data[697]; buffer[0][2] = data[698]; buffer[0][3] = data[699]; buffer[0][4] = data[700]; buffer[0][5] = data[701]; buffer[0][6] = data[702]; buffer[0][7] = data[703]; + + } + if (partition == 88) { + buffer[0][0] = data[704]; buffer[0][1] = data[705]; buffer[0][2] = data[706]; buffer[0][3] = data[707]; buffer[0][4] = data[708]; buffer[0][5] = data[709]; buffer[0][6] = data[710]; buffer[0][7] = data[711]; + + } + if (partition == 89) { + buffer[0][0] = data[712]; buffer[0][1] = data[713]; buffer[0][2] = data[714]; buffer[0][3] = data[715]; buffer[0][4] = data[716]; buffer[0][5] = data[717]; buffer[0][6] = data[718]; buffer[0][7] = data[719]; + + } + if (partition == 90) { + buffer[0][0] = data[720]; buffer[0][1] = data[721]; buffer[0][2] = data[722]; buffer[0][3] = data[723]; buffer[0][4] = data[724]; buffer[0][5] = data[725]; buffer[0][6] = data[726]; buffer[0][7] = data[727]; + + } + if (partition == 91) { + buffer[0][0] = data[728]; buffer[0][1] = data[729]; buffer[0][2] = data[730]; buffer[0][3] = data[731]; buffer[0][4] = data[732]; buffer[0][5] = data[733]; buffer[0][6] = data[734]; buffer[0][7] = data[735]; + + } + if (partition == 92) { + buffer[0][0] = data[736]; buffer[0][1] = data[737]; buffer[0][2] = data[738]; buffer[0][3] = data[739]; buffer[0][4] = data[740]; buffer[0][5] = data[741]; buffer[0][6] = data[742]; buffer[0][7] = data[743]; + + } + if (partition == 93) { + buffer[0][0] = data[744]; 
buffer[0][1] = data[745]; buffer[0][2] = data[746]; buffer[0][3] = data[747]; buffer[0][4] = data[748]; buffer[0][5] = data[749]; buffer[0][6] = data[750]; buffer[0][7] = data[751];
+
+        }
+        if (partition == 94) {
+            buffer[0][0] = data[752]; buffer[0][1] = data[753]; buffer[0][2] = data[754]; buffer[0][3] = data[755]; buffer[0][4] = data[756]; buffer[0][5] = data[757]; buffer[0][6] = data[758]; buffer[0][7] = data[759];
+
+        }
+        if (partition == 95) {
+            buffer[0][0] = data[760]; buffer[0][1] = data[761]; buffer[0][2] = data[762]; buffer[0][3] = data[763]; buffer[0][4] = data[764]; buffer[0][5] = data[765]; buffer[0][6] = data[766]; buffer[0][7] = data[767];
+
+        }
+        if (partition == 96) {
+            buffer[0][0] = data[768]; buffer[0][1] = data[769]; buffer[0][2] = data[770]; buffer[0][3] = data[771]; buffer[0][4] = data[772]; buffer[0][5] = data[773]; buffer[0][6] = data[774]; buffer[0][7] = data[775];
+
+        }
+        if (partition == 97) {
+            buffer[0][0] = data[776]; buffer[0][1] = data[777]; buffer[0][2] = data[778]; buffer[0][3] = data[779]; buffer[0][4] = data[780]; buffer[0][5] = data[781]; buffer[0][6] = data[782]; buffer[0][7] = data[783];
+
+        }
+        if (partition == 98) {
+            buffer[0][0] = data[784]; buffer[0][1] = data[785]; buffer[0][2] = data[786]; buffer[0][3] = data[787]; buffer[0][4] = data[788]; buffer[0][5] = data[789]; buffer[0][6] = data[790]; buffer[0][7] = data[791];
+
+        }
+        if (partition == 99) {
+            buffer[0][0] = data[792]; buffer[0][1] = data[793]; buffer[0][2] = data[794]; buffer[0][3] = data[795]; buffer[0][4] = data[796]; buffer[0][5] = data[797]; buffer[0][6] = data[798]; buffer[0][7] = data[799];
+
+        }
+    }
+};
+template <class data_T, typename CONFIG_T>
+class fill_buffer_23 : public FillConv1DBuffer<data_T, CONFIG_T> {
+  public:
+    static void fill_buffer(
+        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
+        const unsigned partition
+    ) {
+        if (partition == 0) {
+            buffer[0][0] = data[0]; buffer[0][1] = data[1]; buffer[0][2] = data[2]; buffer[0][3] = data[3]; buffer[0][4] = data[4]; buffer[0][5] = data[5]; buffer[0][6] = data[6]; buffer[0][7] = data[7]; buffer[0][8] = data[8]; buffer[0][9] = data[9]; buffer[0][10] = data[10]; buffer[0][11] = data[11];
+
+        }
+        if (partition == 1) {
+            buffer[0][0] = data[12]; buffer[0][1] = data[13]; buffer[0][2] = data[14]; buffer[0][3] = data[15]; buffer[0][4] = data[16]; buffer[0][5] = data[17]; buffer[0][6] = data[18]; buffer[0][7] = data[19]; buffer[0][8] = data[20]; buffer[0][9] = data[21]; buffer[0][10] = data[22]; buffer[0][11] = data[23];
+
+        }
+        if (partition == 2) {
+            buffer[0][0] = data[24]; buffer[0][1] = data[25]; buffer[0][2] = data[26]; buffer[0][3] = data[27]; buffer[0][4] = data[28]; buffer[0][5] = data[29]; buffer[0][6] = data[30]; buffer[0][7] = data[31]; buffer[0][8] = data[32]; buffer[0][9] = data[33]; buffer[0][10] = data[34]; buffer[0][11] = data[35];
+
+        }
+        if (partition == 3) {
+            buffer[0][0] = data[36]; buffer[0][1] = data[37]; buffer[0][2] = data[38]; buffer[0][3] = data[39]; buffer[0][4] = data[40]; buffer[0][5] = data[41]; buffer[0][6] = data[42]; buffer[0][7] = data[43]; buffer[0][8] = data[44]; buffer[0][9] = data[45]; buffer[0][10] = data[46]; buffer[0][11] = data[47];
+
+        }
+        if (partition == 4) {
+            buffer[0][0] = data[48]; buffer[0][1] = data[49]; buffer[0][2] = data[50]; buffer[0][3] = data[51]; buffer[0][4] = data[52]; buffer[0][5] = data[53]; buffer[0][6] = data[54]; buffer[0][7] = data[55]; buffer[0][8] = data[56]; buffer[0][9] = data[57]; buffer[0][10] = data[58]; buffer[0][11] =
data[59]; + + } + if (partition == 5) { + buffer[0][0] = data[60]; buffer[0][1] = data[61]; buffer[0][2] = data[62]; buffer[0][3] = data[63]; buffer[0][4] = data[64]; buffer[0][5] = data[65]; buffer[0][6] = data[66]; buffer[0][7] = data[67]; buffer[0][8] = data[68]; buffer[0][9] = data[69]; buffer[0][10] = data[70]; buffer[0][11] = data[71]; + + } + if (partition == 6) { + buffer[0][0] = data[72]; buffer[0][1] = data[73]; buffer[0][2] = data[74]; buffer[0][3] = data[75]; buffer[0][4] = data[76]; buffer[0][5] = data[77]; buffer[0][6] = data[78]; buffer[0][7] = data[79]; buffer[0][8] = data[80]; buffer[0][9] = data[81]; buffer[0][10] = data[82]; buffer[0][11] = data[83]; + + } + if (partition == 7) { + buffer[0][0] = data[84]; buffer[0][1] = data[85]; buffer[0][2] = data[86]; buffer[0][3] = data[87]; buffer[0][4] = data[88]; buffer[0][5] = data[89]; buffer[0][6] = data[90]; buffer[0][7] = data[91]; buffer[0][8] = data[92]; buffer[0][9] = data[93]; buffer[0][10] = data[94]; buffer[0][11] = data[95]; + + } + if (partition == 8) { + buffer[0][0] = data[96]; buffer[0][1] = data[97]; buffer[0][2] = data[98]; buffer[0][3] = data[99]; buffer[0][4] = data[100]; buffer[0][5] = data[101]; buffer[0][6] = data[102]; buffer[0][7] = data[103]; buffer[0][8] = data[104]; buffer[0][9] = data[105]; buffer[0][10] = data[106]; buffer[0][11] = data[107]; + + } + if (partition == 9) { + buffer[0][0] = data[108]; buffer[0][1] = data[109]; buffer[0][2] = data[110]; buffer[0][3] = data[111]; buffer[0][4] = data[112]; buffer[0][5] = data[113]; buffer[0][6] = data[114]; buffer[0][7] = data[115]; buffer[0][8] = data[116]; buffer[0][9] = data[117]; buffer[0][10] = data[118]; buffer[0][11] = data[119]; + + } + if (partition == 10) { + buffer[0][0] = data[120]; buffer[0][1] = data[121]; buffer[0][2] = data[122]; buffer[0][3] = data[123]; buffer[0][4] = data[124]; buffer[0][5] = data[125]; buffer[0][6] = data[126]; buffer[0][7] = data[127]; buffer[0][8] = data[128]; buffer[0][9] = data[129]; buffer[0][10] = data[130]; buffer[0][11] = data[131]; + + } + if (partition == 11) { + buffer[0][0] = data[132]; buffer[0][1] = data[133]; buffer[0][2] = data[134]; buffer[0][3] = data[135]; buffer[0][4] = data[136]; buffer[0][5] = data[137]; buffer[0][6] = data[138]; buffer[0][7] = data[139]; buffer[0][8] = data[140]; buffer[0][9] = data[141]; buffer[0][10] = data[142]; buffer[0][11] = data[143]; + + } + if (partition == 12) { + buffer[0][0] = data[144]; buffer[0][1] = data[145]; buffer[0][2] = data[146]; buffer[0][3] = data[147]; buffer[0][4] = data[148]; buffer[0][5] = data[149]; buffer[0][6] = data[150]; buffer[0][7] = data[151]; buffer[0][8] = data[152]; buffer[0][9] = data[153]; buffer[0][10] = data[154]; buffer[0][11] = data[155]; + + } + if (partition == 13) { + buffer[0][0] = data[156]; buffer[0][1] = data[157]; buffer[0][2] = data[158]; buffer[0][3] = data[159]; buffer[0][4] = data[160]; buffer[0][5] = data[161]; buffer[0][6] = data[162]; buffer[0][7] = data[163]; buffer[0][8] = data[164]; buffer[0][9] = data[165]; buffer[0][10] = data[166]; buffer[0][11] = data[167]; + + } + if (partition == 14) { + buffer[0][0] = data[168]; buffer[0][1] = data[169]; buffer[0][2] = data[170]; buffer[0][3] = data[171]; buffer[0][4] = data[172]; buffer[0][5] = data[173]; buffer[0][6] = data[174]; buffer[0][7] = data[175]; buffer[0][8] = data[176]; buffer[0][9] = data[177]; buffer[0][10] = data[178]; buffer[0][11] = data[179]; + + } + if (partition == 15) { + buffer[0][0] = data[180]; buffer[0][1] = data[181]; buffer[0][2] = data[182]; 
buffer[0][3] = data[183]; buffer[0][4] = data[184]; buffer[0][5] = data[185]; buffer[0][6] = data[186]; buffer[0][7] = data[187]; buffer[0][8] = data[188]; buffer[0][9] = data[189]; buffer[0][10] = data[190]; buffer[0][11] = data[191]; + + } + if (partition == 16) { + buffer[0][0] = data[192]; buffer[0][1] = data[193]; buffer[0][2] = data[194]; buffer[0][3] = data[195]; buffer[0][4] = data[196]; buffer[0][5] = data[197]; buffer[0][6] = data[198]; buffer[0][7] = data[199]; buffer[0][8] = data[200]; buffer[0][9] = data[201]; buffer[0][10] = data[202]; buffer[0][11] = data[203]; + + } + if (partition == 17) { + buffer[0][0] = data[204]; buffer[0][1] = data[205]; buffer[0][2] = data[206]; buffer[0][3] = data[207]; buffer[0][4] = data[208]; buffer[0][5] = data[209]; buffer[0][6] = data[210]; buffer[0][7] = data[211]; buffer[0][8] = data[212]; buffer[0][9] = data[213]; buffer[0][10] = data[214]; buffer[0][11] = data[215]; + + } + if (partition == 18) { + buffer[0][0] = data[216]; buffer[0][1] = data[217]; buffer[0][2] = data[218]; buffer[0][3] = data[219]; buffer[0][4] = data[220]; buffer[0][5] = data[221]; buffer[0][6] = data[222]; buffer[0][7] = data[223]; buffer[0][8] = data[224]; buffer[0][9] = data[225]; buffer[0][10] = data[226]; buffer[0][11] = data[227]; + + } + if (partition == 19) { + buffer[0][0] = data[228]; buffer[0][1] = data[229]; buffer[0][2] = data[230]; buffer[0][3] = data[231]; buffer[0][4] = data[232]; buffer[0][5] = data[233]; buffer[0][6] = data[234]; buffer[0][7] = data[235]; buffer[0][8] = data[236]; buffer[0][9] = data[237]; buffer[0][10] = data[238]; buffer[0][11] = data[239]; + + } + if (partition == 20) { + buffer[0][0] = data[240]; buffer[0][1] = data[241]; buffer[0][2] = data[242]; buffer[0][3] = data[243]; buffer[0][4] = data[244]; buffer[0][5] = data[245]; buffer[0][6] = data[246]; buffer[0][7] = data[247]; buffer[0][8] = data[248]; buffer[0][9] = data[249]; buffer[0][10] = data[250]; buffer[0][11] = data[251]; + + } + if (partition == 21) { + buffer[0][0] = data[252]; buffer[0][1] = data[253]; buffer[0][2] = data[254]; buffer[0][3] = data[255]; buffer[0][4] = data[256]; buffer[0][5] = data[257]; buffer[0][6] = data[258]; buffer[0][7] = data[259]; buffer[0][8] = data[260]; buffer[0][9] = data[261]; buffer[0][10] = data[262]; buffer[0][11] = data[263]; + + } + if (partition == 22) { + buffer[0][0] = data[264]; buffer[0][1] = data[265]; buffer[0][2] = data[266]; buffer[0][3] = data[267]; buffer[0][4] = data[268]; buffer[0][5] = data[269]; buffer[0][6] = data[270]; buffer[0][7] = data[271]; buffer[0][8] = data[272]; buffer[0][9] = data[273]; buffer[0][10] = data[274]; buffer[0][11] = data[275]; + + } + if (partition == 23) { + buffer[0][0] = data[276]; buffer[0][1] = data[277]; buffer[0][2] = data[278]; buffer[0][3] = data[279]; buffer[0][4] = data[280]; buffer[0][5] = data[281]; buffer[0][6] = data[282]; buffer[0][7] = data[283]; buffer[0][8] = data[284]; buffer[0][9] = data[285]; buffer[0][10] = data[286]; buffer[0][11] = data[287]; + + } + if (partition == 24) { + buffer[0][0] = data[288]; buffer[0][1] = data[289]; buffer[0][2] = data[290]; buffer[0][3] = data[291]; buffer[0][4] = data[292]; buffer[0][5] = data[293]; buffer[0][6] = data[294]; buffer[0][7] = data[295]; buffer[0][8] = data[296]; buffer[0][9] = data[297]; buffer[0][10] = data[298]; buffer[0][11] = data[299]; + + } + if (partition == 25) { + buffer[0][0] = data[300]; buffer[0][1] = data[301]; buffer[0][2] = data[302]; buffer[0][3] = data[303]; buffer[0][4] = data[304]; buffer[0][5] = data[305]; 
buffer[0][6] = data[306]; buffer[0][7] = data[307]; buffer[0][8] = data[308]; buffer[0][9] = data[309]; buffer[0][10] = data[310]; buffer[0][11] = data[311]; + + } + if (partition == 26) { + buffer[0][0] = data[312]; buffer[0][1] = data[313]; buffer[0][2] = data[314]; buffer[0][3] = data[315]; buffer[0][4] = data[316]; buffer[0][5] = data[317]; buffer[0][6] = data[318]; buffer[0][7] = data[319]; buffer[0][8] = data[320]; buffer[0][9] = data[321]; buffer[0][10] = data[322]; buffer[0][11] = data[323]; + + } + if (partition == 27) { + buffer[0][0] = data[324]; buffer[0][1] = data[325]; buffer[0][2] = data[326]; buffer[0][3] = data[327]; buffer[0][4] = data[328]; buffer[0][5] = data[329]; buffer[0][6] = data[330]; buffer[0][7] = data[331]; buffer[0][8] = data[332]; buffer[0][9] = data[333]; buffer[0][10] = data[334]; buffer[0][11] = data[335]; + + } + if (partition == 28) { + buffer[0][0] = data[336]; buffer[0][1] = data[337]; buffer[0][2] = data[338]; buffer[0][3] = data[339]; buffer[0][4] = data[340]; buffer[0][5] = data[341]; buffer[0][6] = data[342]; buffer[0][7] = data[343]; buffer[0][8] = data[344]; buffer[0][9] = data[345]; buffer[0][10] = data[346]; buffer[0][11] = data[347]; + + } + if (partition == 29) { + buffer[0][0] = data[348]; buffer[0][1] = data[349]; buffer[0][2] = data[350]; buffer[0][3] = data[351]; buffer[0][4] = data[352]; buffer[0][5] = data[353]; buffer[0][6] = data[354]; buffer[0][7] = data[355]; buffer[0][8] = data[356]; buffer[0][9] = data[357]; buffer[0][10] = data[358]; buffer[0][11] = data[359]; + + } + if (partition == 30) { + buffer[0][0] = data[360]; buffer[0][1] = data[361]; buffer[0][2] = data[362]; buffer[0][3] = data[363]; buffer[0][4] = data[364]; buffer[0][5] = data[365]; buffer[0][6] = data[366]; buffer[0][7] = data[367]; buffer[0][8] = data[368]; buffer[0][9] = data[369]; buffer[0][10] = data[370]; buffer[0][11] = data[371]; + + } + if (partition == 31) { + buffer[0][0] = data[372]; buffer[0][1] = data[373]; buffer[0][2] = data[374]; buffer[0][3] = data[375]; buffer[0][4] = data[376]; buffer[0][5] = data[377]; buffer[0][6] = data[378]; buffer[0][7] = data[379]; buffer[0][8] = data[380]; buffer[0][9] = data[381]; buffer[0][10] = data[382]; buffer[0][11] = data[383]; + + } + if (partition == 32) { + buffer[0][0] = data[384]; buffer[0][1] = data[385]; buffer[0][2] = data[386]; buffer[0][3] = data[387]; buffer[0][4] = data[388]; buffer[0][5] = data[389]; buffer[0][6] = data[390]; buffer[0][7] = data[391]; buffer[0][8] = data[392]; buffer[0][9] = data[393]; buffer[0][10] = data[394]; buffer[0][11] = data[395]; + + } + if (partition == 33) { + buffer[0][0] = data[396]; buffer[0][1] = data[397]; buffer[0][2] = data[398]; buffer[0][3] = data[399]; buffer[0][4] = data[400]; buffer[0][5] = data[401]; buffer[0][6] = data[402]; buffer[0][7] = data[403]; buffer[0][8] = data[404]; buffer[0][9] = data[405]; buffer[0][10] = data[406]; buffer[0][11] = data[407]; + + } + if (partition == 34) { + buffer[0][0] = data[408]; buffer[0][1] = data[409]; buffer[0][2] = data[410]; buffer[0][3] = data[411]; buffer[0][4] = data[412]; buffer[0][5] = data[413]; buffer[0][6] = data[414]; buffer[0][7] = data[415]; buffer[0][8] = data[416]; buffer[0][9] = data[417]; buffer[0][10] = data[418]; buffer[0][11] = data[419]; + + } + if (partition == 35) { + buffer[0][0] = data[420]; buffer[0][1] = data[421]; buffer[0][2] = data[422]; buffer[0][3] = data[423]; buffer[0][4] = data[424]; buffer[0][5] = data[425]; buffer[0][6] = data[426]; buffer[0][7] = data[427]; buffer[0][8] = data[428]; 
buffer[0][9] = data[429]; buffer[0][10] = data[430]; buffer[0][11] = data[431]; + + } + if (partition == 36) { + buffer[0][0] = data[432]; buffer[0][1] = data[433]; buffer[0][2] = data[434]; buffer[0][3] = data[435]; buffer[0][4] = data[436]; buffer[0][5] = data[437]; buffer[0][6] = data[438]; buffer[0][7] = data[439]; buffer[0][8] = data[440]; buffer[0][9] = data[441]; buffer[0][10] = data[442]; buffer[0][11] = data[443]; + + } + if (partition == 37) { + buffer[0][0] = data[444]; buffer[0][1] = data[445]; buffer[0][2] = data[446]; buffer[0][3] = data[447]; buffer[0][4] = data[448]; buffer[0][5] = data[449]; buffer[0][6] = data[450]; buffer[0][7] = data[451]; buffer[0][8] = data[452]; buffer[0][9] = data[453]; buffer[0][10] = data[454]; buffer[0][11] = data[455]; + + } + if (partition == 38) { + buffer[0][0] = data[456]; buffer[0][1] = data[457]; buffer[0][2] = data[458]; buffer[0][3] = data[459]; buffer[0][4] = data[460]; buffer[0][5] = data[461]; buffer[0][6] = data[462]; buffer[0][7] = data[463]; buffer[0][8] = data[464]; buffer[0][9] = data[465]; buffer[0][10] = data[466]; buffer[0][11] = data[467]; + + } + if (partition == 39) { + buffer[0][0] = data[468]; buffer[0][1] = data[469]; buffer[0][2] = data[470]; buffer[0][3] = data[471]; buffer[0][4] = data[472]; buffer[0][5] = data[473]; buffer[0][6] = data[474]; buffer[0][7] = data[475]; buffer[0][8] = data[476]; buffer[0][9] = data[477]; buffer[0][10] = data[478]; buffer[0][11] = data[479]; + + } + if (partition == 40) { + buffer[0][0] = data[480]; buffer[0][1] = data[481]; buffer[0][2] = data[482]; buffer[0][3] = data[483]; buffer[0][4] = data[484]; buffer[0][5] = data[485]; buffer[0][6] = data[486]; buffer[0][7] = data[487]; buffer[0][8] = data[488]; buffer[0][9] = data[489]; buffer[0][10] = data[490]; buffer[0][11] = data[491]; + + } + if (partition == 41) { + buffer[0][0] = data[492]; buffer[0][1] = data[493]; buffer[0][2] = data[494]; buffer[0][3] = data[495]; buffer[0][4] = data[496]; buffer[0][5] = data[497]; buffer[0][6] = data[498]; buffer[0][7] = data[499]; buffer[0][8] = data[500]; buffer[0][9] = data[501]; buffer[0][10] = data[502]; buffer[0][11] = data[503]; + + } + if (partition == 42) { + buffer[0][0] = data[504]; buffer[0][1] = data[505]; buffer[0][2] = data[506]; buffer[0][3] = data[507]; buffer[0][4] = data[508]; buffer[0][5] = data[509]; buffer[0][6] = data[510]; buffer[0][7] = data[511]; buffer[0][8] = data[512]; buffer[0][9] = data[513]; buffer[0][10] = data[514]; buffer[0][11] = data[515]; + + } + if (partition == 43) { + buffer[0][0] = data[516]; buffer[0][1] = data[517]; buffer[0][2] = data[518]; buffer[0][3] = data[519]; buffer[0][4] = data[520]; buffer[0][5] = data[521]; buffer[0][6] = data[522]; buffer[0][7] = data[523]; buffer[0][8] = data[524]; buffer[0][9] = data[525]; buffer[0][10] = data[526]; buffer[0][11] = data[527]; + + } + if (partition == 44) { + buffer[0][0] = data[528]; buffer[0][1] = data[529]; buffer[0][2] = data[530]; buffer[0][3] = data[531]; buffer[0][4] = data[532]; buffer[0][5] = data[533]; buffer[0][6] = data[534]; buffer[0][7] = data[535]; buffer[0][8] = data[536]; buffer[0][9] = data[537]; buffer[0][10] = data[538]; buffer[0][11] = data[539]; + + } + if (partition == 45) { + buffer[0][0] = data[540]; buffer[0][1] = data[541]; buffer[0][2] = data[542]; buffer[0][3] = data[543]; buffer[0][4] = data[544]; buffer[0][5] = data[545]; buffer[0][6] = data[546]; buffer[0][7] = data[547]; buffer[0][8] = data[548]; buffer[0][9] = data[549]; buffer[0][10] = data[550]; buffer[0][11] = data[551]; + + 
} + if (partition == 46) { + buffer[0][0] = data[552]; buffer[0][1] = data[553]; buffer[0][2] = data[554]; buffer[0][3] = data[555]; buffer[0][4] = data[556]; buffer[0][5] = data[557]; buffer[0][6] = data[558]; buffer[0][7] = data[559]; buffer[0][8] = data[560]; buffer[0][9] = data[561]; buffer[0][10] = data[562]; buffer[0][11] = data[563]; + + } + if (partition == 47) { + buffer[0][0] = data[564]; buffer[0][1] = data[565]; buffer[0][2] = data[566]; buffer[0][3] = data[567]; buffer[0][4] = data[568]; buffer[0][5] = data[569]; buffer[0][6] = data[570]; buffer[0][7] = data[571]; buffer[0][8] = data[572]; buffer[0][9] = data[573]; buffer[0][10] = data[574]; buffer[0][11] = data[575]; + + } + if (partition == 48) { + buffer[0][0] = data[576]; buffer[0][1] = data[577]; buffer[0][2] = data[578]; buffer[0][3] = data[579]; buffer[0][4] = data[580]; buffer[0][5] = data[581]; buffer[0][6] = data[582]; buffer[0][7] = data[583]; buffer[0][8] = data[584]; buffer[0][9] = data[585]; buffer[0][10] = data[586]; buffer[0][11] = data[587]; + + } + if (partition == 49) { + buffer[0][0] = data[588]; buffer[0][1] = data[589]; buffer[0][2] = data[590]; buffer[0][3] = data[591]; buffer[0][4] = data[592]; buffer[0][5] = data[593]; buffer[0][6] = data[594]; buffer[0][7] = data[595]; buffer[0][8] = data[596]; buffer[0][9] = data[597]; buffer[0][10] = data[598]; buffer[0][11] = data[599]; + + } + if (partition == 50) { + buffer[0][0] = data[600]; buffer[0][1] = data[601]; buffer[0][2] = data[602]; buffer[0][3] = data[603]; buffer[0][4] = data[604]; buffer[0][5] = data[605]; buffer[0][6] = data[606]; buffer[0][7] = data[607]; buffer[0][8] = data[608]; buffer[0][9] = data[609]; buffer[0][10] = data[610]; buffer[0][11] = data[611]; + + } + if (partition == 51) { + buffer[0][0] = data[612]; buffer[0][1] = data[613]; buffer[0][2] = data[614]; buffer[0][3] = data[615]; buffer[0][4] = data[616]; buffer[0][5] = data[617]; buffer[0][6] = data[618]; buffer[0][7] = data[619]; buffer[0][8] = data[620]; buffer[0][9] = data[621]; buffer[0][10] = data[622]; buffer[0][11] = data[623]; + + } + if (partition == 52) { + buffer[0][0] = data[624]; buffer[0][1] = data[625]; buffer[0][2] = data[626]; buffer[0][3] = data[627]; buffer[0][4] = data[628]; buffer[0][5] = data[629]; buffer[0][6] = data[630]; buffer[0][7] = data[631]; buffer[0][8] = data[632]; buffer[0][9] = data[633]; buffer[0][10] = data[634]; buffer[0][11] = data[635]; + + } + if (partition == 53) { + buffer[0][0] = data[636]; buffer[0][1] = data[637]; buffer[0][2] = data[638]; buffer[0][3] = data[639]; buffer[0][4] = data[640]; buffer[0][5] = data[641]; buffer[0][6] = data[642]; buffer[0][7] = data[643]; buffer[0][8] = data[644]; buffer[0][9] = data[645]; buffer[0][10] = data[646]; buffer[0][11] = data[647]; + + } + if (partition == 54) { + buffer[0][0] = data[648]; buffer[0][1] = data[649]; buffer[0][2] = data[650]; buffer[0][3] = data[651]; buffer[0][4] = data[652]; buffer[0][5] = data[653]; buffer[0][6] = data[654]; buffer[0][7] = data[655]; buffer[0][8] = data[656]; buffer[0][9] = data[657]; buffer[0][10] = data[658]; buffer[0][11] = data[659]; + + } + if (partition == 55) { + buffer[0][0] = data[660]; buffer[0][1] = data[661]; buffer[0][2] = data[662]; buffer[0][3] = data[663]; buffer[0][4] = data[664]; buffer[0][5] = data[665]; buffer[0][6] = data[666]; buffer[0][7] = data[667]; buffer[0][8] = data[668]; buffer[0][9] = data[669]; buffer[0][10] = data[670]; buffer[0][11] = data[671]; + + } + if (partition == 56) { + buffer[0][0] = data[672]; buffer[0][1] = data[673]; 
buffer[0][2] = data[674]; buffer[0][3] = data[675]; buffer[0][4] = data[676]; buffer[0][5] = data[677]; buffer[0][6] = data[678]; buffer[0][7] = data[679]; buffer[0][8] = data[680]; buffer[0][9] = data[681]; buffer[0][10] = data[682]; buffer[0][11] = data[683]; + + } + if (partition == 57) { + buffer[0][0] = data[684]; buffer[0][1] = data[685]; buffer[0][2] = data[686]; buffer[0][3] = data[687]; buffer[0][4] = data[688]; buffer[0][5] = data[689]; buffer[0][6] = data[690]; buffer[0][7] = data[691]; buffer[0][8] = data[692]; buffer[0][9] = data[693]; buffer[0][10] = data[694]; buffer[0][11] = data[695]; + + } + if (partition == 58) { + buffer[0][0] = data[696]; buffer[0][1] = data[697]; buffer[0][2] = data[698]; buffer[0][3] = data[699]; buffer[0][4] = data[700]; buffer[0][5] = data[701]; buffer[0][6] = data[702]; buffer[0][7] = data[703]; buffer[0][8] = data[704]; buffer[0][9] = data[705]; buffer[0][10] = data[706]; buffer[0][11] = data[707]; + + } + if (partition == 59) { + buffer[0][0] = data[708]; buffer[0][1] = data[709]; buffer[0][2] = data[710]; buffer[0][3] = data[711]; buffer[0][4] = data[712]; buffer[0][5] = data[713]; buffer[0][6] = data[714]; buffer[0][7] = data[715]; buffer[0][8] = data[716]; buffer[0][9] = data[717]; buffer[0][10] = data[718]; buffer[0][11] = data[719]; + + } + if (partition == 60) { + buffer[0][0] = data[720]; buffer[0][1] = data[721]; buffer[0][2] = data[722]; buffer[0][3] = data[723]; buffer[0][4] = data[724]; buffer[0][5] = data[725]; buffer[0][6] = data[726]; buffer[0][7] = data[727]; buffer[0][8] = data[728]; buffer[0][9] = data[729]; buffer[0][10] = data[730]; buffer[0][11] = data[731]; + + } + if (partition == 61) { + buffer[0][0] = data[732]; buffer[0][1] = data[733]; buffer[0][2] = data[734]; buffer[0][3] = data[735]; buffer[0][4] = data[736]; buffer[0][5] = data[737]; buffer[0][6] = data[738]; buffer[0][7] = data[739]; buffer[0][8] = data[740]; buffer[0][9] = data[741]; buffer[0][10] = data[742]; buffer[0][11] = data[743]; + + } + if (partition == 62) { + buffer[0][0] = data[744]; buffer[0][1] = data[745]; buffer[0][2] = data[746]; buffer[0][3] = data[747]; buffer[0][4] = data[748]; buffer[0][5] = data[749]; buffer[0][6] = data[750]; buffer[0][7] = data[751]; buffer[0][8] = data[752]; buffer[0][9] = data[753]; buffer[0][10] = data[754]; buffer[0][11] = data[755]; + + } + if (partition == 63) { + buffer[0][0] = data[756]; buffer[0][1] = data[757]; buffer[0][2] = data[758]; buffer[0][3] = data[759]; buffer[0][4] = data[760]; buffer[0][5] = data[761]; buffer[0][6] = data[762]; buffer[0][7] = data[763]; buffer[0][8] = data[764]; buffer[0][9] = data[765]; buffer[0][10] = data[766]; buffer[0][11] = data[767]; + + } + if (partition == 64) { + buffer[0][0] = data[768]; buffer[0][1] = data[769]; buffer[0][2] = data[770]; buffer[0][3] = data[771]; buffer[0][4] = data[772]; buffer[0][5] = data[773]; buffer[0][6] = data[774]; buffer[0][7] = data[775]; buffer[0][8] = data[776]; buffer[0][9] = data[777]; buffer[0][10] = data[778]; buffer[0][11] = data[779]; + + } + if (partition == 65) { + buffer[0][0] = data[780]; buffer[0][1] = data[781]; buffer[0][2] = data[782]; buffer[0][3] = data[783]; buffer[0][4] = data[784]; buffer[0][5] = data[785]; buffer[0][6] = data[786]; buffer[0][7] = data[787]; buffer[0][8] = data[788]; buffer[0][9] = data[789]; buffer[0][10] = data[790]; buffer[0][11] = data[791]; + + } + if (partition == 66) { + buffer[0][0] = data[792]; buffer[0][1] = data[793]; buffer[0][2] = data[794]; buffer[0][3] = data[795]; buffer[0][4] = data[796]; 
buffer[0][5] = data[797]; buffer[0][6] = data[798]; buffer[0][7] = data[799]; buffer[0][8] = data[800]; buffer[0][9] = data[801]; buffer[0][10] = data[802]; buffer[0][11] = data[803]; + + } + if (partition == 67) { + buffer[0][0] = data[804]; buffer[0][1] = data[805]; buffer[0][2] = data[806]; buffer[0][3] = data[807]; buffer[0][4] = data[808]; buffer[0][5] = data[809]; buffer[0][6] = data[810]; buffer[0][7] = data[811]; buffer[0][8] = data[812]; buffer[0][9] = data[813]; buffer[0][10] = data[814]; buffer[0][11] = data[815]; + + } + if (partition == 68) { + buffer[0][0] = data[816]; buffer[0][1] = data[817]; buffer[0][2] = data[818]; buffer[0][3] = data[819]; buffer[0][4] = data[820]; buffer[0][5] = data[821]; buffer[0][6] = data[822]; buffer[0][7] = data[823]; buffer[0][8] = data[824]; buffer[0][9] = data[825]; buffer[0][10] = data[826]; buffer[0][11] = data[827]; + + } + if (partition == 69) { + buffer[0][0] = data[828]; buffer[0][1] = data[829]; buffer[0][2] = data[830]; buffer[0][3] = data[831]; buffer[0][4] = data[832]; buffer[0][5] = data[833]; buffer[0][6] = data[834]; buffer[0][7] = data[835]; buffer[0][8] = data[836]; buffer[0][9] = data[837]; buffer[0][10] = data[838]; buffer[0][11] = data[839]; + + } + if (partition == 70) { + buffer[0][0] = data[840]; buffer[0][1] = data[841]; buffer[0][2] = data[842]; buffer[0][3] = data[843]; buffer[0][4] = data[844]; buffer[0][5] = data[845]; buffer[0][6] = data[846]; buffer[0][7] = data[847]; buffer[0][8] = data[848]; buffer[0][9] = data[849]; buffer[0][10] = data[850]; buffer[0][11] = data[851]; + + } + if (partition == 71) { + buffer[0][0] = data[852]; buffer[0][1] = data[853]; buffer[0][2] = data[854]; buffer[0][3] = data[855]; buffer[0][4] = data[856]; buffer[0][5] = data[857]; buffer[0][6] = data[858]; buffer[0][7] = data[859]; buffer[0][8] = data[860]; buffer[0][9] = data[861]; buffer[0][10] = data[862]; buffer[0][11] = data[863]; + + } + if (partition == 72) { + buffer[0][0] = data[864]; buffer[0][1] = data[865]; buffer[0][2] = data[866]; buffer[0][3] = data[867]; buffer[0][4] = data[868]; buffer[0][5] = data[869]; buffer[0][6] = data[870]; buffer[0][7] = data[871]; buffer[0][8] = data[872]; buffer[0][9] = data[873]; buffer[0][10] = data[874]; buffer[0][11] = data[875]; + + } + if (partition == 73) { + buffer[0][0] = data[876]; buffer[0][1] = data[877]; buffer[0][2] = data[878]; buffer[0][3] = data[879]; buffer[0][4] = data[880]; buffer[0][5] = data[881]; buffer[0][6] = data[882]; buffer[0][7] = data[883]; buffer[0][8] = data[884]; buffer[0][9] = data[885]; buffer[0][10] = data[886]; buffer[0][11] = data[887]; + + } + if (partition == 74) { + buffer[0][0] = data[888]; buffer[0][1] = data[889]; buffer[0][2] = data[890]; buffer[0][3] = data[891]; buffer[0][4] = data[892]; buffer[0][5] = data[893]; buffer[0][6] = data[894]; buffer[0][7] = data[895]; buffer[0][8] = data[896]; buffer[0][9] = data[897]; buffer[0][10] = data[898]; buffer[0][11] = data[899]; + + } + if (partition == 75) { + buffer[0][0] = data[900]; buffer[0][1] = data[901]; buffer[0][2] = data[902]; buffer[0][3] = data[903]; buffer[0][4] = data[904]; buffer[0][5] = data[905]; buffer[0][6] = data[906]; buffer[0][7] = data[907]; buffer[0][8] = data[908]; buffer[0][9] = data[909]; buffer[0][10] = data[910]; buffer[0][11] = data[911]; + + } + if (partition == 76) { + buffer[0][0] = data[912]; buffer[0][1] = data[913]; buffer[0][2] = data[914]; buffer[0][3] = data[915]; buffer[0][4] = data[916]; buffer[0][5] = data[917]; buffer[0][6] = data[918]; buffer[0][7] = data[919]; 
buffer[0][8] = data[920]; buffer[0][9] = data[921]; buffer[0][10] = data[922]; buffer[0][11] = data[923]; + + } + if (partition == 77) { + buffer[0][0] = data[924]; buffer[0][1] = data[925]; buffer[0][2] = data[926]; buffer[0][3] = data[927]; buffer[0][4] = data[928]; buffer[0][5] = data[929]; buffer[0][6] = data[930]; buffer[0][7] = data[931]; buffer[0][8] = data[932]; buffer[0][9] = data[933]; buffer[0][10] = data[934]; buffer[0][11] = data[935]; + + } + if (partition == 78) { + buffer[0][0] = data[936]; buffer[0][1] = data[937]; buffer[0][2] = data[938]; buffer[0][3] = data[939]; buffer[0][4] = data[940]; buffer[0][5] = data[941]; buffer[0][6] = data[942]; buffer[0][7] = data[943]; buffer[0][8] = data[944]; buffer[0][9] = data[945]; buffer[0][10] = data[946]; buffer[0][11] = data[947]; + + } + if (partition == 79) { + buffer[0][0] = data[948]; buffer[0][1] = data[949]; buffer[0][2] = data[950]; buffer[0][3] = data[951]; buffer[0][4] = data[952]; buffer[0][5] = data[953]; buffer[0][6] = data[954]; buffer[0][7] = data[955]; buffer[0][8] = data[956]; buffer[0][9] = data[957]; buffer[0][10] = data[958]; buffer[0][11] = data[959]; + + } + if (partition == 80) { + buffer[0][0] = data[960]; buffer[0][1] = data[961]; buffer[0][2] = data[962]; buffer[0][3] = data[963]; buffer[0][4] = data[964]; buffer[0][5] = data[965]; buffer[0][6] = data[966]; buffer[0][7] = data[967]; buffer[0][8] = data[968]; buffer[0][9] = data[969]; buffer[0][10] = data[970]; buffer[0][11] = data[971]; + + } + if (partition == 81) { + buffer[0][0] = data[972]; buffer[0][1] = data[973]; buffer[0][2] = data[974]; buffer[0][3] = data[975]; buffer[0][4] = data[976]; buffer[0][5] = data[977]; buffer[0][6] = data[978]; buffer[0][7] = data[979]; buffer[0][8] = data[980]; buffer[0][9] = data[981]; buffer[0][10] = data[982]; buffer[0][11] = data[983]; + + } + if (partition == 82) { + buffer[0][0] = data[984]; buffer[0][1] = data[985]; buffer[0][2] = data[986]; buffer[0][3] = data[987]; buffer[0][4] = data[988]; buffer[0][5] = data[989]; buffer[0][6] = data[990]; buffer[0][7] = data[991]; buffer[0][8] = data[992]; buffer[0][9] = data[993]; buffer[0][10] = data[994]; buffer[0][11] = data[995]; + + } + if (partition == 83) { + buffer[0][0] = data[996]; buffer[0][1] = data[997]; buffer[0][2] = data[998]; buffer[0][3] = data[999]; buffer[0][4] = data[1000]; buffer[0][5] = data[1001]; buffer[0][6] = data[1002]; buffer[0][7] = data[1003]; buffer[0][8] = data[1004]; buffer[0][9] = data[1005]; buffer[0][10] = data[1006]; buffer[0][11] = data[1007]; + + } + if (partition == 84) { + buffer[0][0] = data[1008]; buffer[0][1] = data[1009]; buffer[0][2] = data[1010]; buffer[0][3] = data[1011]; buffer[0][4] = data[1012]; buffer[0][5] = data[1013]; buffer[0][6] = data[1014]; buffer[0][7] = data[1015]; buffer[0][8] = data[1016]; buffer[0][9] = data[1017]; buffer[0][10] = data[1018]; buffer[0][11] = data[1019]; + + } + if (partition == 85) { + buffer[0][0] = data[1020]; buffer[0][1] = data[1021]; buffer[0][2] = data[1022]; buffer[0][3] = data[1023]; buffer[0][4] = data[1024]; buffer[0][5] = data[1025]; buffer[0][6] = data[1026]; buffer[0][7] = data[1027]; buffer[0][8] = data[1028]; buffer[0][9] = data[1029]; buffer[0][10] = data[1030]; buffer[0][11] = data[1031]; + + } + if (partition == 86) { + buffer[0][0] = data[1032]; buffer[0][1] = data[1033]; buffer[0][2] = data[1034]; buffer[0][3] = data[1035]; buffer[0][4] = data[1036]; buffer[0][5] = data[1037]; buffer[0][6] = data[1038]; buffer[0][7] = data[1039]; buffer[0][8] = data[1040]; buffer[0][9] = 
data[1041]; buffer[0][10] = data[1042]; buffer[0][11] = data[1043]; + + } + if (partition == 87) { + buffer[0][0] = data[1044]; buffer[0][1] = data[1045]; buffer[0][2] = data[1046]; buffer[0][3] = data[1047]; buffer[0][4] = data[1048]; buffer[0][5] = data[1049]; buffer[0][6] = data[1050]; buffer[0][7] = data[1051]; buffer[0][8] = data[1052]; buffer[0][9] = data[1053]; buffer[0][10] = data[1054]; buffer[0][11] = data[1055]; + + } + if (partition == 88) { + buffer[0][0] = data[1056]; buffer[0][1] = data[1057]; buffer[0][2] = data[1058]; buffer[0][3] = data[1059]; buffer[0][4] = data[1060]; buffer[0][5] = data[1061]; buffer[0][6] = data[1062]; buffer[0][7] = data[1063]; buffer[0][8] = data[1064]; buffer[0][9] = data[1065]; buffer[0][10] = data[1066]; buffer[0][11] = data[1067]; + + } + if (partition == 89) { + buffer[0][0] = data[1068]; buffer[0][1] = data[1069]; buffer[0][2] = data[1070]; buffer[0][3] = data[1071]; buffer[0][4] = data[1072]; buffer[0][5] = data[1073]; buffer[0][6] = data[1074]; buffer[0][7] = data[1075]; buffer[0][8] = data[1076]; buffer[0][9] = data[1077]; buffer[0][10] = data[1078]; buffer[0][11] = data[1079]; + + } + if (partition == 90) { + buffer[0][0] = data[1080]; buffer[0][1] = data[1081]; buffer[0][2] = data[1082]; buffer[0][3] = data[1083]; buffer[0][4] = data[1084]; buffer[0][5] = data[1085]; buffer[0][6] = data[1086]; buffer[0][7] = data[1087]; buffer[0][8] = data[1088]; buffer[0][9] = data[1089]; buffer[0][10] = data[1090]; buffer[0][11] = data[1091]; + + } + if (partition == 91) { + buffer[0][0] = data[1092]; buffer[0][1] = data[1093]; buffer[0][2] = data[1094]; buffer[0][3] = data[1095]; buffer[0][4] = data[1096]; buffer[0][5] = data[1097]; buffer[0][6] = data[1098]; buffer[0][7] = data[1099]; buffer[0][8] = data[1100]; buffer[0][9] = data[1101]; buffer[0][10] = data[1102]; buffer[0][11] = data[1103]; + + } + if (partition == 92) { + buffer[0][0] = data[1104]; buffer[0][1] = data[1105]; buffer[0][2] = data[1106]; buffer[0][3] = data[1107]; buffer[0][4] = data[1108]; buffer[0][5] = data[1109]; buffer[0][6] = data[1110]; buffer[0][7] = data[1111]; buffer[0][8] = data[1112]; buffer[0][9] = data[1113]; buffer[0][10] = data[1114]; buffer[0][11] = data[1115]; + + } + if (partition == 93) { + buffer[0][0] = data[1116]; buffer[0][1] = data[1117]; buffer[0][2] = data[1118]; buffer[0][3] = data[1119]; buffer[0][4] = data[1120]; buffer[0][5] = data[1121]; buffer[0][6] = data[1122]; buffer[0][7] = data[1123]; buffer[0][8] = data[1124]; buffer[0][9] = data[1125]; buffer[0][10] = data[1126]; buffer[0][11] = data[1127]; + + } + if (partition == 94) { + buffer[0][0] = data[1128]; buffer[0][1] = data[1129]; buffer[0][2] = data[1130]; buffer[0][3] = data[1131]; buffer[0][4] = data[1132]; buffer[0][5] = data[1133]; buffer[0][6] = data[1134]; buffer[0][7] = data[1135]; buffer[0][8] = data[1136]; buffer[0][9] = data[1137]; buffer[0][10] = data[1138]; buffer[0][11] = data[1139]; + + } + if (partition == 95) { + buffer[0][0] = data[1140]; buffer[0][1] = data[1141]; buffer[0][2] = data[1142]; buffer[0][3] = data[1143]; buffer[0][4] = data[1144]; buffer[0][5] = data[1145]; buffer[0][6] = data[1146]; buffer[0][7] = data[1147]; buffer[0][8] = data[1148]; buffer[0][9] = data[1149]; buffer[0][10] = data[1150]; buffer[0][11] = data[1151]; + + } + if (partition == 96) { + buffer[0][0] = data[1152]; buffer[0][1] = data[1153]; buffer[0][2] = data[1154]; buffer[0][3] = data[1155]; buffer[0][4] = data[1156]; buffer[0][5] = data[1157]; buffer[0][6] = data[1158]; buffer[0][7] = data[1159]; 
buffer[0][8] = data[1160]; buffer[0][9] = data[1161]; buffer[0][10] = data[1162]; buffer[0][11] = data[1163];
+
+        }
+        if (partition == 97) {
+            buffer[0][0] = data[1164]; buffer[0][1] = data[1165]; buffer[0][2] = data[1166]; buffer[0][3] = data[1167]; buffer[0][4] = data[1168]; buffer[0][5] = data[1169]; buffer[0][6] = data[1170]; buffer[0][7] = data[1171]; buffer[0][8] = data[1172]; buffer[0][9] = data[1173]; buffer[0][10] = data[1174]; buffer[0][11] = data[1175];
+
+        }
+        if (partition == 98) {
+            buffer[0][0] = data[1176]; buffer[0][1] = data[1177]; buffer[0][2] = data[1178]; buffer[0][3] = data[1179]; buffer[0][4] = data[1180]; buffer[0][5] = data[1181]; buffer[0][6] = data[1182]; buffer[0][7] = data[1183]; buffer[0][8] = data[1184]; buffer[0][9] = data[1185]; buffer[0][10] = data[1186]; buffer[0][11] = data[1187];
+
+        }
+        if (partition == 99) {
+            buffer[0][0] = data[1188]; buffer[0][1] = data[1189]; buffer[0][2] = data[1190]; buffer[0][3] = data[1191]; buffer[0][4] = data[1192]; buffer[0][5] = data[1193]; buffer[0][6] = data[1194]; buffer[0][7] = data[1195]; buffer[0][8] = data[1196]; buffer[0][9] = data[1197]; buffer[0][10] = data[1198]; buffer[0][11] = data[1199];
+
+        }
+    }
+};
+template <class data_T, typename CONFIG_T>
+class fill_buffer_24 : public FillConv1DBuffer<data_T, CONFIG_T> {
+  public:
+    static void fill_buffer(
+        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
+        const unsigned partition
+    ) {
+        if (partition == 0) {
+            buffer[0][0] = data[0]; buffer[0][1] = data[1]; buffer[0][2] = data[2]; buffer[0][3] = data[3]; buffer[0][4] = data[4]; buffer[0][5] = data[5]; buffer[0][6] = data[6]; buffer[0][7] = data[7]; buffer[0][8] = data[8]; buffer[0][9] = data[9]; buffer[0][10] = data[10]; buffer[0][11] = data[11]; buffer[0][12] = data[12]; buffer[0][13] = data[13]; buffer[0][14] = data[14]; buffer[0][15] = data[15]; buffer[0][16] = data[16]; buffer[0][17] = data[17]; buffer[0][18] = data[18]; buffer[0][19] = data[19]; buffer[0][20] = data[20]; buffer[0][21] = data[21]; buffer[0][22] = data[22]; buffer[0][23] = data[23]; buffer[0][24] = data[24]; buffer[0][25] = data[25]; buffer[0][26] = data[26]; buffer[0][27] = data[27]; buffer[0][28] = data[28]; buffer[0][29] = data[29]; buffer[0][30] = data[30]; buffer[0][31] = data[31]; buffer[0][32] = data[32]; buffer[0][33] = data[33]; buffer[0][34] = data[34]; buffer[0][35] = data[35];
+
+        }
+        if (partition == 1) {
+            buffer[0][0] = data[36]; buffer[0][1] = data[37]; buffer[0][2] = data[38]; buffer[0][3] = data[39]; buffer[0][4] = data[40]; buffer[0][5] = data[41]; buffer[0][6] = data[42]; buffer[0][7] = data[43]; buffer[0][8] = data[44]; buffer[0][9] = data[45]; buffer[0][10] = data[46]; buffer[0][11] = data[47]; buffer[0][12] = data[48]; buffer[0][13] = data[49]; buffer[0][14] = data[50]; buffer[0][15] = data[51]; buffer[0][16] = data[52]; buffer[0][17] = data[53]; buffer[0][18] = data[54]; buffer[0][19] = data[55]; buffer[0][20] = data[56]; buffer[0][21] = data[57]; buffer[0][22] = data[58]; buffer[0][23] = data[59]; buffer[0][24] = data[60]; buffer[0][25] = data[61]; buffer[0][26] = data[62]; buffer[0][27] = data[63]; buffer[0][28] = data[64]; buffer[0][29] = data[65]; buffer[0][30] = data[66]; buffer[0][31] = data[67]; buffer[0][32] = data[68]; buffer[0][33] = data[69]; buffer[0][34] = data[70]; buffer[0][35] = data[71];
+
+        }
+        if (partition == 2) {
+            buffer[0][0] = data[72]; buffer[0][1] = data[73]; buffer[0][2] = data[74]; buffer[0][3] = data[75]; buffer[0][4] = data[76]; buffer[0][5] = data[77];
buffer[0][6] = data[78]; buffer[0][7] = data[79]; buffer[0][8] = data[80]; buffer[0][9] = data[81]; buffer[0][10] = data[82]; buffer[0][11] = data[83]; buffer[0][12] = data[84]; buffer[0][13] = data[85]; buffer[0][14] = data[86]; buffer[0][15] = data[87]; buffer[0][16] = data[88]; buffer[0][17] = data[89]; buffer[0][18] = data[90]; buffer[0][19] = data[91]; buffer[0][20] = data[92]; buffer[0][21] = data[93]; buffer[0][22] = data[94]; buffer[0][23] = data[95]; buffer[0][24] = data[96]; buffer[0][25] = data[97]; buffer[0][26] = data[98]; buffer[0][27] = data[99]; buffer[0][28] = data[100]; buffer[0][29] = data[101]; buffer[0][30] = data[102]; buffer[0][31] = data[103]; buffer[0][32] = data[104]; buffer[0][33] = data[105]; buffer[0][34] = data[106]; buffer[0][35] = data[107]; + + } + if (partition == 3) { + buffer[0][0] = data[108]; buffer[0][1] = data[109]; buffer[0][2] = data[110]; buffer[0][3] = data[111]; buffer[0][4] = data[112]; buffer[0][5] = data[113]; buffer[0][6] = data[114]; buffer[0][7] = data[115]; buffer[0][8] = data[116]; buffer[0][9] = data[117]; buffer[0][10] = data[118]; buffer[0][11] = data[119]; buffer[0][12] = data[120]; buffer[0][13] = data[121]; buffer[0][14] = data[122]; buffer[0][15] = data[123]; buffer[0][16] = data[124]; buffer[0][17] = data[125]; buffer[0][18] = data[126]; buffer[0][19] = data[127]; buffer[0][20] = data[128]; buffer[0][21] = data[129]; buffer[0][22] = data[130]; buffer[0][23] = data[131]; buffer[0][24] = data[132]; buffer[0][25] = data[133]; buffer[0][26] = data[134]; buffer[0][27] = data[135]; buffer[0][28] = data[136]; buffer[0][29] = data[137]; buffer[0][30] = data[138]; buffer[0][31] = data[139]; buffer[0][32] = data[140]; buffer[0][33] = data[141]; buffer[0][34] = data[142]; buffer[0][35] = data[143]; + + } + if (partition == 4) { + buffer[0][0] = data[144]; buffer[0][1] = data[145]; buffer[0][2] = data[146]; buffer[0][3] = data[147]; buffer[0][4] = data[148]; buffer[0][5] = data[149]; buffer[0][6] = data[150]; buffer[0][7] = data[151]; buffer[0][8] = data[152]; buffer[0][9] = data[153]; buffer[0][10] = data[154]; buffer[0][11] = data[155]; buffer[0][12] = data[156]; buffer[0][13] = data[157]; buffer[0][14] = data[158]; buffer[0][15] = data[159]; buffer[0][16] = data[160]; buffer[0][17] = data[161]; buffer[0][18] = data[162]; buffer[0][19] = data[163]; buffer[0][20] = data[164]; buffer[0][21] = data[165]; buffer[0][22] = data[166]; buffer[0][23] = data[167]; buffer[0][24] = data[168]; buffer[0][25] = data[169]; buffer[0][26] = data[170]; buffer[0][27] = data[171]; buffer[0][28] = data[172]; buffer[0][29] = data[173]; buffer[0][30] = data[174]; buffer[0][31] = data[175]; buffer[0][32] = data[176]; buffer[0][33] = data[177]; buffer[0][34] = data[178]; buffer[0][35] = data[179]; + + } + if (partition == 5) { + buffer[0][0] = data[180]; buffer[0][1] = data[181]; buffer[0][2] = data[182]; buffer[0][3] = data[183]; buffer[0][4] = data[184]; buffer[0][5] = data[185]; buffer[0][6] = data[186]; buffer[0][7] = data[187]; buffer[0][8] = data[188]; buffer[0][9] = data[189]; buffer[0][10] = data[190]; buffer[0][11] = data[191]; buffer[0][12] = data[192]; buffer[0][13] = data[193]; buffer[0][14] = data[194]; buffer[0][15] = data[195]; buffer[0][16] = data[196]; buffer[0][17] = data[197]; buffer[0][18] = data[198]; buffer[0][19] = data[199]; buffer[0][20] = data[200]; buffer[0][21] = data[201]; buffer[0][22] = data[202]; buffer[0][23] = data[203]; buffer[0][24] = data[204]; buffer[0][25] = data[205]; buffer[0][26] = data[206]; buffer[0][27] = data[207]; 
buffer[0][28] = data[208]; buffer[0][29] = data[209]; buffer[0][30] = data[210]; buffer[0][31] = data[211]; buffer[0][32] = data[212]; buffer[0][33] = data[213]; buffer[0][34] = data[214]; buffer[0][35] = data[215]; + + } + if (partition == 6) { + buffer[0][0] = data[216]; buffer[0][1] = data[217]; buffer[0][2] = data[218]; buffer[0][3] = data[219]; buffer[0][4] = data[220]; buffer[0][5] = data[221]; buffer[0][6] = data[222]; buffer[0][7] = data[223]; buffer[0][8] = data[224]; buffer[0][9] = data[225]; buffer[0][10] = data[226]; buffer[0][11] = data[227]; buffer[0][12] = data[228]; buffer[0][13] = data[229]; buffer[0][14] = data[230]; buffer[0][15] = data[231]; buffer[0][16] = data[232]; buffer[0][17] = data[233]; buffer[0][18] = data[234]; buffer[0][19] = data[235]; buffer[0][20] = data[236]; buffer[0][21] = data[237]; buffer[0][22] = data[238]; buffer[0][23] = data[239]; buffer[0][24] = data[240]; buffer[0][25] = data[241]; buffer[0][26] = data[242]; buffer[0][27] = data[243]; buffer[0][28] = data[244]; buffer[0][29] = data[245]; buffer[0][30] = data[246]; buffer[0][31] = data[247]; buffer[0][32] = data[248]; buffer[0][33] = data[249]; buffer[0][34] = data[250]; buffer[0][35] = data[251]; + + } + if (partition == 7) { + buffer[0][0] = data[252]; buffer[0][1] = data[253]; buffer[0][2] = data[254]; buffer[0][3] = data[255]; buffer[0][4] = data[256]; buffer[0][5] = data[257]; buffer[0][6] = data[258]; buffer[0][7] = data[259]; buffer[0][8] = data[260]; buffer[0][9] = data[261]; buffer[0][10] = data[262]; buffer[0][11] = data[263]; buffer[0][12] = data[264]; buffer[0][13] = data[265]; buffer[0][14] = data[266]; buffer[0][15] = data[267]; buffer[0][16] = data[268]; buffer[0][17] = data[269]; buffer[0][18] = data[270]; buffer[0][19] = data[271]; buffer[0][20] = data[272]; buffer[0][21] = data[273]; buffer[0][22] = data[274]; buffer[0][23] = data[275]; buffer[0][24] = data[276]; buffer[0][25] = data[277]; buffer[0][26] = data[278]; buffer[0][27] = data[279]; buffer[0][28] = data[280]; buffer[0][29] = data[281]; buffer[0][30] = data[282]; buffer[0][31] = data[283]; buffer[0][32] = data[284]; buffer[0][33] = data[285]; buffer[0][34] = data[286]; buffer[0][35] = data[287]; + + } + if (partition == 8) { + buffer[0][0] = data[288]; buffer[0][1] = data[289]; buffer[0][2] = data[290]; buffer[0][3] = data[291]; buffer[0][4] = data[292]; buffer[0][5] = data[293]; buffer[0][6] = data[294]; buffer[0][7] = data[295]; buffer[0][8] = data[296]; buffer[0][9] = data[297]; buffer[0][10] = data[298]; buffer[0][11] = data[299]; buffer[0][12] = data[300]; buffer[0][13] = data[301]; buffer[0][14] = data[302]; buffer[0][15] = data[303]; buffer[0][16] = data[304]; buffer[0][17] = data[305]; buffer[0][18] = data[306]; buffer[0][19] = data[307]; buffer[0][20] = data[308]; buffer[0][21] = data[309]; buffer[0][22] = data[310]; buffer[0][23] = data[311]; buffer[0][24] = data[312]; buffer[0][25] = data[313]; buffer[0][26] = data[314]; buffer[0][27] = data[315]; buffer[0][28] = data[316]; buffer[0][29] = data[317]; buffer[0][30] = data[318]; buffer[0][31] = data[319]; buffer[0][32] = data[320]; buffer[0][33] = data[321]; buffer[0][34] = data[322]; buffer[0][35] = data[323]; + + } + if (partition == 9) { + buffer[0][0] = data[324]; buffer[0][1] = data[325]; buffer[0][2] = data[326]; buffer[0][3] = data[327]; buffer[0][4] = data[328]; buffer[0][5] = data[329]; buffer[0][6] = data[330]; buffer[0][7] = data[331]; buffer[0][8] = data[332]; buffer[0][9] = data[333]; buffer[0][10] = data[334]; buffer[0][11] = data[335]; 
buffer[0][12] = data[336]; buffer[0][13] = data[337]; buffer[0][14] = data[338]; buffer[0][15] = data[339]; buffer[0][16] = data[340]; buffer[0][17] = data[341]; buffer[0][18] = data[342]; buffer[0][19] = data[343]; buffer[0][20] = data[344]; buffer[0][21] = data[345]; buffer[0][22] = data[346]; buffer[0][23] = data[347]; buffer[0][24] = data[348]; buffer[0][25] = data[349]; buffer[0][26] = data[350]; buffer[0][27] = data[351]; buffer[0][28] = data[352]; buffer[0][29] = data[353]; buffer[0][30] = data[354]; buffer[0][31] = data[355]; buffer[0][32] = data[356]; buffer[0][33] = data[357]; buffer[0][34] = data[358]; buffer[0][35] = data[359]; + + } + if (partition == 10) { + buffer[0][0] = data[360]; buffer[0][1] = data[361]; buffer[0][2] = data[362]; buffer[0][3] = data[363]; buffer[0][4] = data[364]; buffer[0][5] = data[365]; buffer[0][6] = data[366]; buffer[0][7] = data[367]; buffer[0][8] = data[368]; buffer[0][9] = data[369]; buffer[0][10] = data[370]; buffer[0][11] = data[371]; buffer[0][12] = data[372]; buffer[0][13] = data[373]; buffer[0][14] = data[374]; buffer[0][15] = data[375]; buffer[0][16] = data[376]; buffer[0][17] = data[377]; buffer[0][18] = data[378]; buffer[0][19] = data[379]; buffer[0][20] = data[380]; buffer[0][21] = data[381]; buffer[0][22] = data[382]; buffer[0][23] = data[383]; buffer[0][24] = data[384]; buffer[0][25] = data[385]; buffer[0][26] = data[386]; buffer[0][27] = data[387]; buffer[0][28] = data[388]; buffer[0][29] = data[389]; buffer[0][30] = data[390]; buffer[0][31] = data[391]; buffer[0][32] = data[392]; buffer[0][33] = data[393]; buffer[0][34] = data[394]; buffer[0][35] = data[395]; + + } + if (partition == 11) { + buffer[0][0] = data[396]; buffer[0][1] = data[397]; buffer[0][2] = data[398]; buffer[0][3] = data[399]; buffer[0][4] = data[400]; buffer[0][5] = data[401]; buffer[0][6] = data[402]; buffer[0][7] = data[403]; buffer[0][8] = data[404]; buffer[0][9] = data[405]; buffer[0][10] = data[406]; buffer[0][11] = data[407]; buffer[0][12] = data[408]; buffer[0][13] = data[409]; buffer[0][14] = data[410]; buffer[0][15] = data[411]; buffer[0][16] = data[412]; buffer[0][17] = data[413]; buffer[0][18] = data[414]; buffer[0][19] = data[415]; buffer[0][20] = data[416]; buffer[0][21] = data[417]; buffer[0][22] = data[418]; buffer[0][23] = data[419]; buffer[0][24] = data[420]; buffer[0][25] = data[421]; buffer[0][26] = data[422]; buffer[0][27] = data[423]; buffer[0][28] = data[424]; buffer[0][29] = data[425]; buffer[0][30] = data[426]; buffer[0][31] = data[427]; buffer[0][32] = data[428]; buffer[0][33] = data[429]; buffer[0][34] = data[430]; buffer[0][35] = data[431]; + + } + if (partition == 12) { + buffer[0][0] = data[432]; buffer[0][1] = data[433]; buffer[0][2] = data[434]; buffer[0][3] = data[435]; buffer[0][4] = data[436]; buffer[0][5] = data[437]; buffer[0][6] = data[438]; buffer[0][7] = data[439]; buffer[0][8] = data[440]; buffer[0][9] = data[441]; buffer[0][10] = data[442]; buffer[0][11] = data[443]; buffer[0][12] = data[444]; buffer[0][13] = data[445]; buffer[0][14] = data[446]; buffer[0][15] = data[447]; buffer[0][16] = data[448]; buffer[0][17] = data[449]; buffer[0][18] = data[450]; buffer[0][19] = data[451]; buffer[0][20] = data[452]; buffer[0][21] = data[453]; buffer[0][22] = data[454]; buffer[0][23] = data[455]; buffer[0][24] = data[456]; buffer[0][25] = data[457]; buffer[0][26] = data[458]; buffer[0][27] = data[459]; buffer[0][28] = data[460]; buffer[0][29] = data[461]; buffer[0][30] = data[462]; buffer[0][31] = data[463]; buffer[0][32] = data[464]; 
buffer[0][33] = data[465]; buffer[0][34] = data[466]; buffer[0][35] = data[467]; + + } + if (partition == 13) { + buffer[0][0] = data[468]; buffer[0][1] = data[469]; buffer[0][2] = data[470]; buffer[0][3] = data[471]; buffer[0][4] = data[472]; buffer[0][5] = data[473]; buffer[0][6] = data[474]; buffer[0][7] = data[475]; buffer[0][8] = data[476]; buffer[0][9] = data[477]; buffer[0][10] = data[478]; buffer[0][11] = data[479]; buffer[0][12] = data[480]; buffer[0][13] = data[481]; buffer[0][14] = data[482]; buffer[0][15] = data[483]; buffer[0][16] = data[484]; buffer[0][17] = data[485]; buffer[0][18] = data[486]; buffer[0][19] = data[487]; buffer[0][20] = data[488]; buffer[0][21] = data[489]; buffer[0][22] = data[490]; buffer[0][23] = data[491]; buffer[0][24] = data[492]; buffer[0][25] = data[493]; buffer[0][26] = data[494]; buffer[0][27] = data[495]; buffer[0][28] = data[496]; buffer[0][29] = data[497]; buffer[0][30] = data[498]; buffer[0][31] = data[499]; buffer[0][32] = data[500]; buffer[0][33] = data[501]; buffer[0][34] = data[502]; buffer[0][35] = data[503]; + + } + if (partition == 14) { + buffer[0][0] = data[504]; buffer[0][1] = data[505]; buffer[0][2] = data[506]; buffer[0][3] = data[507]; buffer[0][4] = data[508]; buffer[0][5] = data[509]; buffer[0][6] = data[510]; buffer[0][7] = data[511]; buffer[0][8] = data[512]; buffer[0][9] = data[513]; buffer[0][10] = data[514]; buffer[0][11] = data[515]; buffer[0][12] = data[516]; buffer[0][13] = data[517]; buffer[0][14] = data[518]; buffer[0][15] = data[519]; buffer[0][16] = data[520]; buffer[0][17] = data[521]; buffer[0][18] = data[522]; buffer[0][19] = data[523]; buffer[0][20] = data[524]; buffer[0][21] = data[525]; buffer[0][22] = data[526]; buffer[0][23] = data[527]; buffer[0][24] = data[528]; buffer[0][25] = data[529]; buffer[0][26] = data[530]; buffer[0][27] = data[531]; buffer[0][28] = data[532]; buffer[0][29] = data[533]; buffer[0][30] = data[534]; buffer[0][31] = data[535]; buffer[0][32] = data[536]; buffer[0][33] = data[537]; buffer[0][34] = data[538]; buffer[0][35] = data[539]; + + } + if (partition == 15) { + buffer[0][0] = data[540]; buffer[0][1] = data[541]; buffer[0][2] = data[542]; buffer[0][3] = data[543]; buffer[0][4] = data[544]; buffer[0][5] = data[545]; buffer[0][6] = data[546]; buffer[0][7] = data[547]; buffer[0][8] = data[548]; buffer[0][9] = data[549]; buffer[0][10] = data[550]; buffer[0][11] = data[551]; buffer[0][12] = data[552]; buffer[0][13] = data[553]; buffer[0][14] = data[554]; buffer[0][15] = data[555]; buffer[0][16] = data[556]; buffer[0][17] = data[557]; buffer[0][18] = data[558]; buffer[0][19] = data[559]; buffer[0][20] = data[560]; buffer[0][21] = data[561]; buffer[0][22] = data[562]; buffer[0][23] = data[563]; buffer[0][24] = data[564]; buffer[0][25] = data[565]; buffer[0][26] = data[566]; buffer[0][27] = data[567]; buffer[0][28] = data[568]; buffer[0][29] = data[569]; buffer[0][30] = data[570]; buffer[0][31] = data[571]; buffer[0][32] = data[572]; buffer[0][33] = data[573]; buffer[0][34] = data[574]; buffer[0][35] = data[575]; + + } + if (partition == 16) { + buffer[0][0] = data[576]; buffer[0][1] = data[577]; buffer[0][2] = data[578]; buffer[0][3] = data[579]; buffer[0][4] = data[580]; buffer[0][5] = data[581]; buffer[0][6] = data[582]; buffer[0][7] = data[583]; buffer[0][8] = data[584]; buffer[0][9] = data[585]; buffer[0][10] = data[586]; buffer[0][11] = data[587]; buffer[0][12] = data[588]; buffer[0][13] = data[589]; buffer[0][14] = data[590]; buffer[0][15] = data[591]; buffer[0][16] = data[592]; 
buffer[0][17] = data[593]; buffer[0][18] = data[594]; buffer[0][19] = data[595]; buffer[0][20] = data[596]; buffer[0][21] = data[597]; buffer[0][22] = data[598]; buffer[0][23] = data[599]; buffer[0][24] = data[600]; buffer[0][25] = data[601]; buffer[0][26] = data[602]; buffer[0][27] = data[603]; buffer[0][28] = data[604]; buffer[0][29] = data[605]; buffer[0][30] = data[606]; buffer[0][31] = data[607]; buffer[0][32] = data[608]; buffer[0][33] = data[609]; buffer[0][34] = data[610]; buffer[0][35] = data[611]; + + } + if (partition == 17) { + buffer[0][0] = data[612]; buffer[0][1] = data[613]; buffer[0][2] = data[614]; buffer[0][3] = data[615]; buffer[0][4] = data[616]; buffer[0][5] = data[617]; buffer[0][6] = data[618]; buffer[0][7] = data[619]; buffer[0][8] = data[620]; buffer[0][9] = data[621]; buffer[0][10] = data[622]; buffer[0][11] = data[623]; buffer[0][12] = data[624]; buffer[0][13] = data[625]; buffer[0][14] = data[626]; buffer[0][15] = data[627]; buffer[0][16] = data[628]; buffer[0][17] = data[629]; buffer[0][18] = data[630]; buffer[0][19] = data[631]; buffer[0][20] = data[632]; buffer[0][21] = data[633]; buffer[0][22] = data[634]; buffer[0][23] = data[635]; buffer[0][24] = data[636]; buffer[0][25] = data[637]; buffer[0][26] = data[638]; buffer[0][27] = data[639]; buffer[0][28] = data[640]; buffer[0][29] = data[641]; buffer[0][30] = data[642]; buffer[0][31] = data[643]; buffer[0][32] = data[644]; buffer[0][33] = data[645]; buffer[0][34] = data[646]; buffer[0][35] = data[647]; + + } + if (partition == 18) { + buffer[0][0] = data[648]; buffer[0][1] = data[649]; buffer[0][2] = data[650]; buffer[0][3] = data[651]; buffer[0][4] = data[652]; buffer[0][5] = data[653]; buffer[0][6] = data[654]; buffer[0][7] = data[655]; buffer[0][8] = data[656]; buffer[0][9] = data[657]; buffer[0][10] = data[658]; buffer[0][11] = data[659]; buffer[0][12] = data[660]; buffer[0][13] = data[661]; buffer[0][14] = data[662]; buffer[0][15] = data[663]; buffer[0][16] = data[664]; buffer[0][17] = data[665]; buffer[0][18] = data[666]; buffer[0][19] = data[667]; buffer[0][20] = data[668]; buffer[0][21] = data[669]; buffer[0][22] = data[670]; buffer[0][23] = data[671]; buffer[0][24] = data[672]; buffer[0][25] = data[673]; buffer[0][26] = data[674]; buffer[0][27] = data[675]; buffer[0][28] = data[676]; buffer[0][29] = data[677]; buffer[0][30] = data[678]; buffer[0][31] = data[679]; buffer[0][32] = data[680]; buffer[0][33] = data[681]; buffer[0][34] = data[682]; buffer[0][35] = data[683]; + + } + if (partition == 19) { + buffer[0][0] = data[684]; buffer[0][1] = data[685]; buffer[0][2] = data[686]; buffer[0][3] = data[687]; buffer[0][4] = data[688]; buffer[0][5] = data[689]; buffer[0][6] = data[690]; buffer[0][7] = data[691]; buffer[0][8] = data[692]; buffer[0][9] = data[693]; buffer[0][10] = data[694]; buffer[0][11] = data[695]; buffer[0][12] = data[696]; buffer[0][13] = data[697]; buffer[0][14] = data[698]; buffer[0][15] = data[699]; buffer[0][16] = data[700]; buffer[0][17] = data[701]; buffer[0][18] = data[702]; buffer[0][19] = data[703]; buffer[0][20] = data[704]; buffer[0][21] = data[705]; buffer[0][22] = data[706]; buffer[0][23] = data[707]; buffer[0][24] = data[708]; buffer[0][25] = data[709]; buffer[0][26] = data[710]; buffer[0][27] = data[711]; buffer[0][28] = data[712]; buffer[0][29] = data[713]; buffer[0][30] = data[714]; buffer[0][31] = data[715]; buffer[0][32] = data[716]; buffer[0][33] = data[717]; buffer[0][34] = data[718]; buffer[0][35] = data[719]; + + } + if (partition == 20) { + buffer[0][0] = 
data[720]; buffer[0][1] = data[721]; buffer[0][2] = data[722]; buffer[0][3] = data[723]; buffer[0][4] = data[724]; buffer[0][5] = data[725]; buffer[0][6] = data[726]; buffer[0][7] = data[727]; buffer[0][8] = data[728]; buffer[0][9] = data[729]; buffer[0][10] = data[730]; buffer[0][11] = data[731]; buffer[0][12] = data[732]; buffer[0][13] = data[733]; buffer[0][14] = data[734]; buffer[0][15] = data[735]; buffer[0][16] = data[736]; buffer[0][17] = data[737]; buffer[0][18] = data[738]; buffer[0][19] = data[739]; buffer[0][20] = data[740]; buffer[0][21] = data[741]; buffer[0][22] = data[742]; buffer[0][23] = data[743]; buffer[0][24] = data[744]; buffer[0][25] = data[745]; buffer[0][26] = data[746]; buffer[0][27] = data[747]; buffer[0][28] = data[748]; buffer[0][29] = data[749]; buffer[0][30] = data[750]; buffer[0][31] = data[751]; buffer[0][32] = data[752]; buffer[0][33] = data[753]; buffer[0][34] = data[754]; buffer[0][35] = data[755]; + + } + if (partition == 21) { + buffer[0][0] = data[756]; buffer[0][1] = data[757]; buffer[0][2] = data[758]; buffer[0][3] = data[759]; buffer[0][4] = data[760]; buffer[0][5] = data[761]; buffer[0][6] = data[762]; buffer[0][7] = data[763]; buffer[0][8] = data[764]; buffer[0][9] = data[765]; buffer[0][10] = data[766]; buffer[0][11] = data[767]; buffer[0][12] = data[768]; buffer[0][13] = data[769]; buffer[0][14] = data[770]; buffer[0][15] = data[771]; buffer[0][16] = data[772]; buffer[0][17] = data[773]; buffer[0][18] = data[774]; buffer[0][19] = data[775]; buffer[0][20] = data[776]; buffer[0][21] = data[777]; buffer[0][22] = data[778]; buffer[0][23] = data[779]; buffer[0][24] = data[780]; buffer[0][25] = data[781]; buffer[0][26] = data[782]; buffer[0][27] = data[783]; buffer[0][28] = data[784]; buffer[0][29] = data[785]; buffer[0][30] = data[786]; buffer[0][31] = data[787]; buffer[0][32] = data[788]; buffer[0][33] = data[789]; buffer[0][34] = data[790]; buffer[0][35] = data[791]; + + } + if (partition == 22) { + buffer[0][0] = data[792]; buffer[0][1] = data[793]; buffer[0][2] = data[794]; buffer[0][3] = data[795]; buffer[0][4] = data[796]; buffer[0][5] = data[797]; buffer[0][6] = data[798]; buffer[0][7] = data[799]; buffer[0][8] = data[800]; buffer[0][9] = data[801]; buffer[0][10] = data[802]; buffer[0][11] = data[803]; buffer[0][12] = data[804]; buffer[0][13] = data[805]; buffer[0][14] = data[806]; buffer[0][15] = data[807]; buffer[0][16] = data[808]; buffer[0][17] = data[809]; buffer[0][18] = data[810]; buffer[0][19] = data[811]; buffer[0][20] = data[812]; buffer[0][21] = data[813]; buffer[0][22] = data[814]; buffer[0][23] = data[815]; buffer[0][24] = data[816]; buffer[0][25] = data[817]; buffer[0][26] = data[818]; buffer[0][27] = data[819]; buffer[0][28] = data[820]; buffer[0][29] = data[821]; buffer[0][30] = data[822]; buffer[0][31] = data[823]; buffer[0][32] = data[824]; buffer[0][33] = data[825]; buffer[0][34] = data[826]; buffer[0][35] = data[827]; + + } + if (partition == 23) { + buffer[0][0] = data[828]; buffer[0][1] = data[829]; buffer[0][2] = data[830]; buffer[0][3] = data[831]; buffer[0][4] = data[832]; buffer[0][5] = data[833]; buffer[0][6] = data[834]; buffer[0][7] = data[835]; buffer[0][8] = data[836]; buffer[0][9] = data[837]; buffer[0][10] = data[838]; buffer[0][11] = data[839]; buffer[0][12] = data[840]; buffer[0][13] = data[841]; buffer[0][14] = data[842]; buffer[0][15] = data[843]; buffer[0][16] = data[844]; buffer[0][17] = data[845]; buffer[0][18] = data[846]; buffer[0][19] = data[847]; buffer[0][20] = data[848]; buffer[0][21] = data[849]; 
buffer[0][22] = data[850]; buffer[0][23] = data[851]; buffer[0][24] = data[852]; buffer[0][25] = data[853]; buffer[0][26] = data[854]; buffer[0][27] = data[855]; buffer[0][28] = data[856]; buffer[0][29] = data[857]; buffer[0][30] = data[858]; buffer[0][31] = data[859]; buffer[0][32] = data[860]; buffer[0][33] = data[861]; buffer[0][34] = data[862]; buffer[0][35] = data[863]; + + } + if (partition == 24) { + buffer[0][0] = data[864]; buffer[0][1] = data[865]; buffer[0][2] = data[866]; buffer[0][3] = data[867]; buffer[0][4] = data[868]; buffer[0][5] = data[869]; buffer[0][6] = data[870]; buffer[0][7] = data[871]; buffer[0][8] = data[872]; buffer[0][9] = data[873]; buffer[0][10] = data[874]; buffer[0][11] = data[875]; buffer[0][12] = data[876]; buffer[0][13] = data[877]; buffer[0][14] = data[878]; buffer[0][15] = data[879]; buffer[0][16] = data[880]; buffer[0][17] = data[881]; buffer[0][18] = data[882]; buffer[0][19] = data[883]; buffer[0][20] = data[884]; buffer[0][21] = data[885]; buffer[0][22] = data[886]; buffer[0][23] = data[887]; buffer[0][24] = data[888]; buffer[0][25] = data[889]; buffer[0][26] = data[890]; buffer[0][27] = data[891]; buffer[0][28] = data[892]; buffer[0][29] = data[893]; buffer[0][30] = data[894]; buffer[0][31] = data[895]; buffer[0][32] = data[896]; buffer[0][33] = data[897]; buffer[0][34] = data[898]; buffer[0][35] = data[899]; + + } + if (partition == 25) { + buffer[0][0] = data[900]; buffer[0][1] = data[901]; buffer[0][2] = data[902]; buffer[0][3] = data[903]; buffer[0][4] = data[904]; buffer[0][5] = data[905]; buffer[0][6] = data[906]; buffer[0][7] = data[907]; buffer[0][8] = data[908]; buffer[0][9] = data[909]; buffer[0][10] = data[910]; buffer[0][11] = data[911]; buffer[0][12] = data[912]; buffer[0][13] = data[913]; buffer[0][14] = data[914]; buffer[0][15] = data[915]; buffer[0][16] = data[916]; buffer[0][17] = data[917]; buffer[0][18] = data[918]; buffer[0][19] = data[919]; buffer[0][20] = data[920]; buffer[0][21] = data[921]; buffer[0][22] = data[922]; buffer[0][23] = data[923]; buffer[0][24] = data[924]; buffer[0][25] = data[925]; buffer[0][26] = data[926]; buffer[0][27] = data[927]; buffer[0][28] = data[928]; buffer[0][29] = data[929]; buffer[0][30] = data[930]; buffer[0][31] = data[931]; buffer[0][32] = data[932]; buffer[0][33] = data[933]; buffer[0][34] = data[934]; buffer[0][35] = data[935]; + + } + if (partition == 26) { + buffer[0][0] = data[936]; buffer[0][1] = data[937]; buffer[0][2] = data[938]; buffer[0][3] = data[939]; buffer[0][4] = data[940]; buffer[0][5] = data[941]; buffer[0][6] = data[942]; buffer[0][7] = data[943]; buffer[0][8] = data[944]; buffer[0][9] = data[945]; buffer[0][10] = data[946]; buffer[0][11] = data[947]; buffer[0][12] = data[948]; buffer[0][13] = data[949]; buffer[0][14] = data[950]; buffer[0][15] = data[951]; buffer[0][16] = data[952]; buffer[0][17] = data[953]; buffer[0][18] = data[954]; buffer[0][19] = data[955]; buffer[0][20] = data[956]; buffer[0][21] = data[957]; buffer[0][22] = data[958]; buffer[0][23] = data[959]; buffer[0][24] = data[960]; buffer[0][25] = data[961]; buffer[0][26] = data[962]; buffer[0][27] = data[963]; buffer[0][28] = data[964]; buffer[0][29] = data[965]; buffer[0][30] = data[966]; buffer[0][31] = data[967]; buffer[0][32] = data[968]; buffer[0][33] = data[969]; buffer[0][34] = data[970]; buffer[0][35] = data[971]; + + } + if (partition == 27) { + buffer[0][0] = data[972]; buffer[0][1] = data[973]; buffer[0][2] = data[974]; buffer[0][3] = data[975]; buffer[0][4] = data[976]; buffer[0][5] = data[977]; 
buffer[0][6] = data[978]; buffer[0][7] = data[979]; buffer[0][8] = data[980]; buffer[0][9] = data[981]; buffer[0][10] = data[982]; buffer[0][11] = data[983]; buffer[0][12] = data[984]; buffer[0][13] = data[985]; buffer[0][14] = data[986]; buffer[0][15] = data[987]; buffer[0][16] = data[988]; buffer[0][17] = data[989]; buffer[0][18] = data[990]; buffer[0][19] = data[991]; buffer[0][20] = data[992]; buffer[0][21] = data[993]; buffer[0][22] = data[994]; buffer[0][23] = data[995]; buffer[0][24] = data[996]; buffer[0][25] = data[997]; buffer[0][26] = data[998]; buffer[0][27] = data[999]; buffer[0][28] = data[1000]; buffer[0][29] = data[1001]; buffer[0][30] = data[1002]; buffer[0][31] = data[1003]; buffer[0][32] = data[1004]; buffer[0][33] = data[1005]; buffer[0][34] = data[1006]; buffer[0][35] = data[1007]; + + } + if (partition == 28) { + buffer[0][0] = data[1008]; buffer[0][1] = data[1009]; buffer[0][2] = data[1010]; buffer[0][3] = data[1011]; buffer[0][4] = data[1012]; buffer[0][5] = data[1013]; buffer[0][6] = data[1014]; buffer[0][7] = data[1015]; buffer[0][8] = data[1016]; buffer[0][9] = data[1017]; buffer[0][10] = data[1018]; buffer[0][11] = data[1019]; buffer[0][12] = data[1020]; buffer[0][13] = data[1021]; buffer[0][14] = data[1022]; buffer[0][15] = data[1023]; buffer[0][16] = data[1024]; buffer[0][17] = data[1025]; buffer[0][18] = data[1026]; buffer[0][19] = data[1027]; buffer[0][20] = data[1028]; buffer[0][21] = data[1029]; buffer[0][22] = data[1030]; buffer[0][23] = data[1031]; buffer[0][24] = data[1032]; buffer[0][25] = data[1033]; buffer[0][26] = data[1034]; buffer[0][27] = data[1035]; buffer[0][28] = data[1036]; buffer[0][29] = data[1037]; buffer[0][30] = data[1038]; buffer[0][31] = data[1039]; buffer[0][32] = data[1040]; buffer[0][33] = data[1041]; buffer[0][34] = data[1042]; buffer[0][35] = data[1043]; + + } + if (partition == 29) { + buffer[0][0] = data[1044]; buffer[0][1] = data[1045]; buffer[0][2] = data[1046]; buffer[0][3] = data[1047]; buffer[0][4] = data[1048]; buffer[0][5] = data[1049]; buffer[0][6] = data[1050]; buffer[0][7] = data[1051]; buffer[0][8] = data[1052]; buffer[0][9] = data[1053]; buffer[0][10] = data[1054]; buffer[0][11] = data[1055]; buffer[0][12] = data[1056]; buffer[0][13] = data[1057]; buffer[0][14] = data[1058]; buffer[0][15] = data[1059]; buffer[0][16] = data[1060]; buffer[0][17] = data[1061]; buffer[0][18] = data[1062]; buffer[0][19] = data[1063]; buffer[0][20] = data[1064]; buffer[0][21] = data[1065]; buffer[0][22] = data[1066]; buffer[0][23] = data[1067]; buffer[0][24] = data[1068]; buffer[0][25] = data[1069]; buffer[0][26] = data[1070]; buffer[0][27] = data[1071]; buffer[0][28] = data[1072]; buffer[0][29] = data[1073]; buffer[0][30] = data[1074]; buffer[0][31] = data[1075]; buffer[0][32] = data[1076]; buffer[0][33] = data[1077]; buffer[0][34] = data[1078]; buffer[0][35] = data[1079]; + + } + if (partition == 30) { + buffer[0][0] = data[1080]; buffer[0][1] = data[1081]; buffer[0][2] = data[1082]; buffer[0][3] = data[1083]; buffer[0][4] = data[1084]; buffer[0][5] = data[1085]; buffer[0][6] = data[1086]; buffer[0][7] = data[1087]; buffer[0][8] = data[1088]; buffer[0][9] = data[1089]; buffer[0][10] = data[1090]; buffer[0][11] = data[1091]; buffer[0][12] = data[1092]; buffer[0][13] = data[1093]; buffer[0][14] = data[1094]; buffer[0][15] = data[1095]; buffer[0][16] = data[1096]; buffer[0][17] = data[1097]; buffer[0][18] = data[1098]; buffer[0][19] = data[1099]; buffer[0][20] = data[1100]; buffer[0][21] = data[1101]; buffer[0][22] = data[1102]; 
buffer[0][23] = data[1103]; buffer[0][24] = data[1104]; buffer[0][25] = data[1105]; buffer[0][26] = data[1106]; buffer[0][27] = data[1107]; buffer[0][28] = data[1108]; buffer[0][29] = data[1109]; buffer[0][30] = data[1110]; buffer[0][31] = data[1111]; buffer[0][32] = data[1112]; buffer[0][33] = data[1113]; buffer[0][34] = data[1114]; buffer[0][35] = data[1115]; + + } + if (partition == 31) { + buffer[0][0] = data[1116]; buffer[0][1] = data[1117]; buffer[0][2] = data[1118]; buffer[0][3] = data[1119]; buffer[0][4] = data[1120]; buffer[0][5] = data[1121]; buffer[0][6] = data[1122]; buffer[0][7] = data[1123]; buffer[0][8] = data[1124]; buffer[0][9] = data[1125]; buffer[0][10] = data[1126]; buffer[0][11] = data[1127]; buffer[0][12] = data[1128]; buffer[0][13] = data[1129]; buffer[0][14] = data[1130]; buffer[0][15] = data[1131]; buffer[0][16] = data[1132]; buffer[0][17] = data[1133]; buffer[0][18] = data[1134]; buffer[0][19] = data[1135]; buffer[0][20] = data[1136]; buffer[0][21] = data[1137]; buffer[0][22] = data[1138]; buffer[0][23] = data[1139]; buffer[0][24] = data[1140]; buffer[0][25] = data[1141]; buffer[0][26] = data[1142]; buffer[0][27] = data[1143]; buffer[0][28] = data[1144]; buffer[0][29] = data[1145]; buffer[0][30] = data[1146]; buffer[0][31] = data[1147]; buffer[0][32] = data[1148]; buffer[0][33] = data[1149]; buffer[0][34] = data[1150]; buffer[0][35] = data[1151]; + + } + if (partition == 32) { + buffer[0][0] = data[1152]; buffer[0][1] = data[1153]; buffer[0][2] = data[1154]; buffer[0][3] = data[1155]; buffer[0][4] = data[1156]; buffer[0][5] = data[1157]; buffer[0][6] = data[1158]; buffer[0][7] = data[1159]; buffer[0][8] = data[1160]; buffer[0][9] = data[1161]; buffer[0][10] = data[1162]; buffer[0][11] = data[1163]; buffer[0][12] = data[1164]; buffer[0][13] = data[1165]; buffer[0][14] = data[1166]; buffer[0][15] = data[1167]; buffer[0][16] = data[1168]; buffer[0][17] = data[1169]; buffer[0][18] = data[1170]; buffer[0][19] = data[1171]; buffer[0][20] = data[1172]; buffer[0][21] = data[1173]; buffer[0][22] = data[1174]; buffer[0][23] = data[1175]; buffer[0][24] = data[1176]; buffer[0][25] = data[1177]; buffer[0][26] = data[1178]; buffer[0][27] = data[1179]; buffer[0][28] = data[1180]; buffer[0][29] = data[1181]; buffer[0][30] = data[1182]; buffer[0][31] = data[1183]; buffer[0][32] = data[1184]; buffer[0][33] = data[1185]; buffer[0][34] = data[1186]; buffer[0][35] = data[1187]; + + } + if (partition == 33) { + buffer[0][0] = data[1188]; buffer[0][1] = data[1189]; buffer[0][2] = data[1190]; buffer[0][3] = data[1191]; buffer[0][4] = data[1192]; buffer[0][5] = data[1193]; buffer[0][6] = data[1194]; buffer[0][7] = data[1195]; buffer[0][8] = data[1196]; buffer[0][9] = data[1197]; buffer[0][10] = data[1198]; buffer[0][11] = data[1199]; buffer[0][12] = data[1200]; buffer[0][13] = data[1201]; buffer[0][14] = data[1202]; buffer[0][15] = data[1203]; buffer[0][16] = data[1204]; buffer[0][17] = data[1205]; buffer[0][18] = data[1206]; buffer[0][19] = data[1207]; buffer[0][20] = data[1208]; buffer[0][21] = data[1209]; buffer[0][22] = data[1210]; buffer[0][23] = data[1211]; buffer[0][24] = data[1212]; buffer[0][25] = data[1213]; buffer[0][26] = data[1214]; buffer[0][27] = data[1215]; buffer[0][28] = data[1216]; buffer[0][29] = data[1217]; buffer[0][30] = data[1218]; buffer[0][31] = data[1219]; buffer[0][32] = data[1220]; buffer[0][33] = data[1221]; buffer[0][34] = data[1222]; buffer[0][35] = data[1223]; + + } + if (partition == 34) { + buffer[0][0] = data[1224]; buffer[0][1] = data[1225]; 
buffer[0][2] = data[1226]; buffer[0][3] = data[1227]; buffer[0][4] = data[1228]; buffer[0][5] = data[1229]; buffer[0][6] = data[1230]; buffer[0][7] = data[1231]; buffer[0][8] = data[1232]; buffer[0][9] = data[1233]; buffer[0][10] = data[1234]; buffer[0][11] = data[1235]; buffer[0][12] = data[1236]; buffer[0][13] = data[1237]; buffer[0][14] = data[1238]; buffer[0][15] = data[1239]; buffer[0][16] = data[1240]; buffer[0][17] = data[1241]; buffer[0][18] = data[1242]; buffer[0][19] = data[1243]; buffer[0][20] = data[1244]; buffer[0][21] = data[1245]; buffer[0][22] = data[1246]; buffer[0][23] = data[1247]; buffer[0][24] = data[1248]; buffer[0][25] = data[1249]; buffer[0][26] = data[1250]; buffer[0][27] = data[1251]; buffer[0][28] = data[1252]; buffer[0][29] = data[1253]; buffer[0][30] = data[1254]; buffer[0][31] = data[1255]; buffer[0][32] = data[1256]; buffer[0][33] = data[1257]; buffer[0][34] = data[1258]; buffer[0][35] = data[1259]; + + } + if (partition == 35) { + buffer[0][0] = data[1260]; buffer[0][1] = data[1261]; buffer[0][2] = data[1262]; buffer[0][3] = data[1263]; buffer[0][4] = data[1264]; buffer[0][5] = data[1265]; buffer[0][6] = data[1266]; buffer[0][7] = data[1267]; buffer[0][8] = data[1268]; buffer[0][9] = data[1269]; buffer[0][10] = data[1270]; buffer[0][11] = data[1271]; buffer[0][12] = data[1272]; buffer[0][13] = data[1273]; buffer[0][14] = data[1274]; buffer[0][15] = data[1275]; buffer[0][16] = data[1276]; buffer[0][17] = data[1277]; buffer[0][18] = data[1278]; buffer[0][19] = data[1279]; buffer[0][20] = data[1280]; buffer[0][21] = data[1281]; buffer[0][22] = data[1282]; buffer[0][23] = data[1283]; buffer[0][24] = data[1284]; buffer[0][25] = data[1285]; buffer[0][26] = data[1286]; buffer[0][27] = data[1287]; buffer[0][28] = data[1288]; buffer[0][29] = data[1289]; buffer[0][30] = data[1290]; buffer[0][31] = data[1291]; buffer[0][32] = data[1292]; buffer[0][33] = data[1293]; buffer[0][34] = data[1294]; buffer[0][35] = data[1295]; + + } + if (partition == 36) { + buffer[0][0] = data[1296]; buffer[0][1] = data[1297]; buffer[0][2] = data[1298]; buffer[0][3] = data[1299]; buffer[0][4] = data[1300]; buffer[0][5] = data[1301]; buffer[0][6] = data[1302]; buffer[0][7] = data[1303]; buffer[0][8] = data[1304]; buffer[0][9] = data[1305]; buffer[0][10] = data[1306]; buffer[0][11] = data[1307]; buffer[0][12] = data[1308]; buffer[0][13] = data[1309]; buffer[0][14] = data[1310]; buffer[0][15] = data[1311]; buffer[0][16] = data[1312]; buffer[0][17] = data[1313]; buffer[0][18] = data[1314]; buffer[0][19] = data[1315]; buffer[0][20] = data[1316]; buffer[0][21] = data[1317]; buffer[0][22] = data[1318]; buffer[0][23] = data[1319]; buffer[0][24] = data[1320]; buffer[0][25] = data[1321]; buffer[0][26] = data[1322]; buffer[0][27] = data[1323]; buffer[0][28] = data[1324]; buffer[0][29] = data[1325]; buffer[0][30] = data[1326]; buffer[0][31] = data[1327]; buffer[0][32] = data[1328]; buffer[0][33] = data[1329]; buffer[0][34] = data[1330]; buffer[0][35] = data[1331]; + + } + if (partition == 37) { + buffer[0][0] = data[1332]; buffer[0][1] = data[1333]; buffer[0][2] = data[1334]; buffer[0][3] = data[1335]; buffer[0][4] = data[1336]; buffer[0][5] = data[1337]; buffer[0][6] = data[1338]; buffer[0][7] = data[1339]; buffer[0][8] = data[1340]; buffer[0][9] = data[1341]; buffer[0][10] = data[1342]; buffer[0][11] = data[1343]; buffer[0][12] = data[1344]; buffer[0][13] = data[1345]; buffer[0][14] = data[1346]; buffer[0][15] = data[1347]; buffer[0][16] = data[1348]; buffer[0][17] = data[1349]; buffer[0][18] = 
data[1350]; buffer[0][19] = data[1351]; buffer[0][20] = data[1352]; buffer[0][21] = data[1353]; buffer[0][22] = data[1354]; buffer[0][23] = data[1355]; buffer[0][24] = data[1356]; buffer[0][25] = data[1357]; buffer[0][26] = data[1358]; buffer[0][27] = data[1359]; buffer[0][28] = data[1360]; buffer[0][29] = data[1361]; buffer[0][30] = data[1362]; buffer[0][31] = data[1363]; buffer[0][32] = data[1364]; buffer[0][33] = data[1365]; buffer[0][34] = data[1366]; buffer[0][35] = data[1367]; + + } + if (partition == 38) { + buffer[0][0] = data[1368]; buffer[0][1] = data[1369]; buffer[0][2] = data[1370]; buffer[0][3] = data[1371]; buffer[0][4] = data[1372]; buffer[0][5] = data[1373]; buffer[0][6] = data[1374]; buffer[0][7] = data[1375]; buffer[0][8] = data[1376]; buffer[0][9] = data[1377]; buffer[0][10] = data[1378]; buffer[0][11] = data[1379]; buffer[0][12] = data[1380]; buffer[0][13] = data[1381]; buffer[0][14] = data[1382]; buffer[0][15] = data[1383]; buffer[0][16] = data[1384]; buffer[0][17] = data[1385]; buffer[0][18] = data[1386]; buffer[0][19] = data[1387]; buffer[0][20] = data[1388]; buffer[0][21] = data[1389]; buffer[0][22] = data[1390]; buffer[0][23] = data[1391]; buffer[0][24] = data[1392]; buffer[0][25] = data[1393]; buffer[0][26] = data[1394]; buffer[0][27] = data[1395]; buffer[0][28] = data[1396]; buffer[0][29] = data[1397]; buffer[0][30] = data[1398]; buffer[0][31] = data[1399]; buffer[0][32] = data[1400]; buffer[0][33] = data[1401]; buffer[0][34] = data[1402]; buffer[0][35] = data[1403]; + + } + if (partition == 39) { + buffer[0][0] = data[1404]; buffer[0][1] = data[1405]; buffer[0][2] = data[1406]; buffer[0][3] = data[1407]; buffer[0][4] = data[1408]; buffer[0][5] = data[1409]; buffer[0][6] = data[1410]; buffer[0][7] = data[1411]; buffer[0][8] = data[1412]; buffer[0][9] = data[1413]; buffer[0][10] = data[1414]; buffer[0][11] = data[1415]; buffer[0][12] = data[1416]; buffer[0][13] = data[1417]; buffer[0][14] = data[1418]; buffer[0][15] = data[1419]; buffer[0][16] = data[1420]; buffer[0][17] = data[1421]; buffer[0][18] = data[1422]; buffer[0][19] = data[1423]; buffer[0][20] = data[1424]; buffer[0][21] = data[1425]; buffer[0][22] = data[1426]; buffer[0][23] = data[1427]; buffer[0][24] = data[1428]; buffer[0][25] = data[1429]; buffer[0][26] = data[1430]; buffer[0][27] = data[1431]; buffer[0][28] = data[1432]; buffer[0][29] = data[1433]; buffer[0][30] = data[1434]; buffer[0][31] = data[1435]; buffer[0][32] = data[1436]; buffer[0][33] = data[1437]; buffer[0][34] = data[1438]; buffer[0][35] = data[1439]; + + } + if (partition == 40) { + buffer[0][0] = data[1440]; buffer[0][1] = data[1441]; buffer[0][2] = data[1442]; buffer[0][3] = data[1443]; buffer[0][4] = data[1444]; buffer[0][5] = data[1445]; buffer[0][6] = data[1446]; buffer[0][7] = data[1447]; buffer[0][8] = data[1448]; buffer[0][9] = data[1449]; buffer[0][10] = data[1450]; buffer[0][11] = data[1451]; buffer[0][12] = data[1452]; buffer[0][13] = data[1453]; buffer[0][14] = data[1454]; buffer[0][15] = data[1455]; buffer[0][16] = data[1456]; buffer[0][17] = data[1457]; buffer[0][18] = data[1458]; buffer[0][19] = data[1459]; buffer[0][20] = data[1460]; buffer[0][21] = data[1461]; buffer[0][22] = data[1462]; buffer[0][23] = data[1463]; buffer[0][24] = data[1464]; buffer[0][25] = data[1465]; buffer[0][26] = data[1466]; buffer[0][27] = data[1467]; buffer[0][28] = data[1468]; buffer[0][29] = data[1469]; buffer[0][30] = data[1470]; buffer[0][31] = data[1471]; buffer[0][32] = data[1472]; buffer[0][33] = data[1473]; buffer[0][34] = data[1474]; 
buffer[0][35] = data[1475]; + + } + if (partition == 41) { + buffer[0][0] = data[1476]; buffer[0][1] = data[1477]; buffer[0][2] = data[1478]; buffer[0][3] = data[1479]; buffer[0][4] = data[1480]; buffer[0][5] = data[1481]; buffer[0][6] = data[1482]; buffer[0][7] = data[1483]; buffer[0][8] = data[1484]; buffer[0][9] = data[1485]; buffer[0][10] = data[1486]; buffer[0][11] = data[1487]; buffer[0][12] = data[1488]; buffer[0][13] = data[1489]; buffer[0][14] = data[1490]; buffer[0][15] = data[1491]; buffer[0][16] = data[1492]; buffer[0][17] = data[1493]; buffer[0][18] = data[1494]; buffer[0][19] = data[1495]; buffer[0][20] = data[1496]; buffer[0][21] = data[1497]; buffer[0][22] = data[1498]; buffer[0][23] = data[1499]; buffer[0][24] = data[1500]; buffer[0][25] = data[1501]; buffer[0][26] = data[1502]; buffer[0][27] = data[1503]; buffer[0][28] = data[1504]; buffer[0][29] = data[1505]; buffer[0][30] = data[1506]; buffer[0][31] = data[1507]; buffer[0][32] = data[1508]; buffer[0][33] = data[1509]; buffer[0][34] = data[1510]; buffer[0][35] = data[1511]; + + } + if (partition == 42) { + buffer[0][0] = data[1512]; buffer[0][1] = data[1513]; buffer[0][2] = data[1514]; buffer[0][3] = data[1515]; buffer[0][4] = data[1516]; buffer[0][5] = data[1517]; buffer[0][6] = data[1518]; buffer[0][7] = data[1519]; buffer[0][8] = data[1520]; buffer[0][9] = data[1521]; buffer[0][10] = data[1522]; buffer[0][11] = data[1523]; buffer[0][12] = data[1524]; buffer[0][13] = data[1525]; buffer[0][14] = data[1526]; buffer[0][15] = data[1527]; buffer[0][16] = data[1528]; buffer[0][17] = data[1529]; buffer[0][18] = data[1530]; buffer[0][19] = data[1531]; buffer[0][20] = data[1532]; buffer[0][21] = data[1533]; buffer[0][22] = data[1534]; buffer[0][23] = data[1535]; buffer[0][24] = data[1536]; buffer[0][25] = data[1537]; buffer[0][26] = data[1538]; buffer[0][27] = data[1539]; buffer[0][28] = data[1540]; buffer[0][29] = data[1541]; buffer[0][30] = data[1542]; buffer[0][31] = data[1543]; buffer[0][32] = data[1544]; buffer[0][33] = data[1545]; buffer[0][34] = data[1546]; buffer[0][35] = data[1547]; + + } + if (partition == 43) { + buffer[0][0] = data[1548]; buffer[0][1] = data[1549]; buffer[0][2] = data[1550]; buffer[0][3] = data[1551]; buffer[0][4] = data[1552]; buffer[0][5] = data[1553]; buffer[0][6] = data[1554]; buffer[0][7] = data[1555]; buffer[0][8] = data[1556]; buffer[0][9] = data[1557]; buffer[0][10] = data[1558]; buffer[0][11] = data[1559]; buffer[0][12] = data[1560]; buffer[0][13] = data[1561]; buffer[0][14] = data[1562]; buffer[0][15] = data[1563]; buffer[0][16] = data[1564]; buffer[0][17] = data[1565]; buffer[0][18] = data[1566]; buffer[0][19] = data[1567]; buffer[0][20] = data[1568]; buffer[0][21] = data[1569]; buffer[0][22] = data[1570]; buffer[0][23] = data[1571]; buffer[0][24] = data[1572]; buffer[0][25] = data[1573]; buffer[0][26] = data[1574]; buffer[0][27] = data[1575]; buffer[0][28] = data[1576]; buffer[0][29] = data[1577]; buffer[0][30] = data[1578]; buffer[0][31] = data[1579]; buffer[0][32] = data[1580]; buffer[0][33] = data[1581]; buffer[0][34] = data[1582]; buffer[0][35] = data[1583]; + + } + if (partition == 44) { + buffer[0][0] = data[1584]; buffer[0][1] = data[1585]; buffer[0][2] = data[1586]; buffer[0][3] = data[1587]; buffer[0][4] = data[1588]; buffer[0][5] = data[1589]; buffer[0][6] = data[1590]; buffer[0][7] = data[1591]; buffer[0][8] = data[1592]; buffer[0][9] = data[1593]; buffer[0][10] = data[1594]; buffer[0][11] = data[1595]; buffer[0][12] = data[1596]; buffer[0][13] = data[1597]; buffer[0][14] = 
data[1598]; buffer[0][15] = data[1599]; buffer[0][16] = data[1600]; buffer[0][17] = data[1601]; buffer[0][18] = data[1602]; buffer[0][19] = data[1603]; buffer[0][20] = data[1604]; buffer[0][21] = data[1605]; buffer[0][22] = data[1606]; buffer[0][23] = data[1607]; buffer[0][24] = data[1608]; buffer[0][25] = data[1609]; buffer[0][26] = data[1610]; buffer[0][27] = data[1611]; buffer[0][28] = data[1612]; buffer[0][29] = data[1613]; buffer[0][30] = data[1614]; buffer[0][31] = data[1615]; buffer[0][32] = data[1616]; buffer[0][33] = data[1617]; buffer[0][34] = data[1618]; buffer[0][35] = data[1619]; + + } + if (partition == 45) { + buffer[0][0] = data[1620]; buffer[0][1] = data[1621]; buffer[0][2] = data[1622]; buffer[0][3] = data[1623]; buffer[0][4] = data[1624]; buffer[0][5] = data[1625]; buffer[0][6] = data[1626]; buffer[0][7] = data[1627]; buffer[0][8] = data[1628]; buffer[0][9] = data[1629]; buffer[0][10] = data[1630]; buffer[0][11] = data[1631]; buffer[0][12] = data[1632]; buffer[0][13] = data[1633]; buffer[0][14] = data[1634]; buffer[0][15] = data[1635]; buffer[0][16] = data[1636]; buffer[0][17] = data[1637]; buffer[0][18] = data[1638]; buffer[0][19] = data[1639]; buffer[0][20] = data[1640]; buffer[0][21] = data[1641]; buffer[0][22] = data[1642]; buffer[0][23] = data[1643]; buffer[0][24] = data[1644]; buffer[0][25] = data[1645]; buffer[0][26] = data[1646]; buffer[0][27] = data[1647]; buffer[0][28] = data[1648]; buffer[0][29] = data[1649]; buffer[0][30] = data[1650]; buffer[0][31] = data[1651]; buffer[0][32] = data[1652]; buffer[0][33] = data[1653]; buffer[0][34] = data[1654]; buffer[0][35] = data[1655]; + + } + if (partition == 46) { + buffer[0][0] = data[1656]; buffer[0][1] = data[1657]; buffer[0][2] = data[1658]; buffer[0][3] = data[1659]; buffer[0][4] = data[1660]; buffer[0][5] = data[1661]; buffer[0][6] = data[1662]; buffer[0][7] = data[1663]; buffer[0][8] = data[1664]; buffer[0][9] = data[1665]; buffer[0][10] = data[1666]; buffer[0][11] = data[1667]; buffer[0][12] = data[1668]; buffer[0][13] = data[1669]; buffer[0][14] = data[1670]; buffer[0][15] = data[1671]; buffer[0][16] = data[1672]; buffer[0][17] = data[1673]; buffer[0][18] = data[1674]; buffer[0][19] = data[1675]; buffer[0][20] = data[1676]; buffer[0][21] = data[1677]; buffer[0][22] = data[1678]; buffer[0][23] = data[1679]; buffer[0][24] = data[1680]; buffer[0][25] = data[1681]; buffer[0][26] = data[1682]; buffer[0][27] = data[1683]; buffer[0][28] = data[1684]; buffer[0][29] = data[1685]; buffer[0][30] = data[1686]; buffer[0][31] = data[1687]; buffer[0][32] = data[1688]; buffer[0][33] = data[1689]; buffer[0][34] = data[1690]; buffer[0][35] = data[1691]; + + } + if (partition == 47) { + buffer[0][0] = data[1692]; buffer[0][1] = data[1693]; buffer[0][2] = data[1694]; buffer[0][3] = data[1695]; buffer[0][4] = data[1696]; buffer[0][5] = data[1697]; buffer[0][6] = data[1698]; buffer[0][7] = data[1699]; buffer[0][8] = data[1700]; buffer[0][9] = data[1701]; buffer[0][10] = data[1702]; buffer[0][11] = data[1703]; buffer[0][12] = data[1704]; buffer[0][13] = data[1705]; buffer[0][14] = data[1706]; buffer[0][15] = data[1707]; buffer[0][16] = data[1708]; buffer[0][17] = data[1709]; buffer[0][18] = data[1710]; buffer[0][19] = data[1711]; buffer[0][20] = data[1712]; buffer[0][21] = data[1713]; buffer[0][22] = data[1714]; buffer[0][23] = data[1715]; buffer[0][24] = data[1716]; buffer[0][25] = data[1717]; buffer[0][26] = data[1718]; buffer[0][27] = data[1719]; buffer[0][28] = data[1720]; buffer[0][29] = data[1721]; buffer[0][30] = data[1722]; 
buffer[0][31] = data[1723]; buffer[0][32] = data[1724]; buffer[0][33] = data[1725]; buffer[0][34] = data[1726]; buffer[0][35] = data[1727]; + + } + if (partition == 48) { + buffer[0][0] = data[1728]; buffer[0][1] = data[1729]; buffer[0][2] = data[1730]; buffer[0][3] = data[1731]; buffer[0][4] = data[1732]; buffer[0][5] = data[1733]; buffer[0][6] = data[1734]; buffer[0][7] = data[1735]; buffer[0][8] = data[1736]; buffer[0][9] = data[1737]; buffer[0][10] = data[1738]; buffer[0][11] = data[1739]; buffer[0][12] = data[1740]; buffer[0][13] = data[1741]; buffer[0][14] = data[1742]; buffer[0][15] = data[1743]; buffer[0][16] = data[1744]; buffer[0][17] = data[1745]; buffer[0][18] = data[1746]; buffer[0][19] = data[1747]; buffer[0][20] = data[1748]; buffer[0][21] = data[1749]; buffer[0][22] = data[1750]; buffer[0][23] = data[1751]; buffer[0][24] = data[1752]; buffer[0][25] = data[1753]; buffer[0][26] = data[1754]; buffer[0][27] = data[1755]; buffer[0][28] = data[1756]; buffer[0][29] = data[1757]; buffer[0][30] = data[1758]; buffer[0][31] = data[1759]; buffer[0][32] = data[1760]; buffer[0][33] = data[1761]; buffer[0][34] = data[1762]; buffer[0][35] = data[1763]; + + } + if (partition == 49) { + buffer[0][0] = data[1764]; buffer[0][1] = data[1765]; buffer[0][2] = data[1766]; buffer[0][3] = data[1767]; buffer[0][4] = data[1768]; buffer[0][5] = data[1769]; buffer[0][6] = data[1770]; buffer[0][7] = data[1771]; buffer[0][8] = data[1772]; buffer[0][9] = data[1773]; buffer[0][10] = data[1774]; buffer[0][11] = data[1775]; buffer[0][12] = data[1776]; buffer[0][13] = data[1777]; buffer[0][14] = data[1778]; buffer[0][15] = data[1779]; buffer[0][16] = data[1780]; buffer[0][17] = data[1781]; buffer[0][18] = data[1782]; buffer[0][19] = data[1783]; buffer[0][20] = data[1784]; buffer[0][21] = data[1785]; buffer[0][22] = data[1786]; buffer[0][23] = data[1787]; buffer[0][24] = data[1788]; buffer[0][25] = data[1789]; buffer[0][26] = data[1790]; buffer[0][27] = data[1791]; buffer[0][28] = data[1792]; buffer[0][29] = data[1793]; buffer[0][30] = data[1794]; buffer[0][31] = data[1795]; buffer[0][32] = data[1796]; buffer[0][33] = data[1797]; buffer[0][34] = data[1798]; buffer[0][35] = data[1799]; + + } + if (partition == 50) { + buffer[0][0] = data[1800]; buffer[0][1] = data[1801]; buffer[0][2] = data[1802]; buffer[0][3] = data[1803]; buffer[0][4] = data[1804]; buffer[0][5] = data[1805]; buffer[0][6] = data[1806]; buffer[0][7] = data[1807]; buffer[0][8] = data[1808]; buffer[0][9] = data[1809]; buffer[0][10] = data[1810]; buffer[0][11] = data[1811]; buffer[0][12] = data[1812]; buffer[0][13] = data[1813]; buffer[0][14] = data[1814]; buffer[0][15] = data[1815]; buffer[0][16] = data[1816]; buffer[0][17] = data[1817]; buffer[0][18] = data[1818]; buffer[0][19] = data[1819]; buffer[0][20] = data[1820]; buffer[0][21] = data[1821]; buffer[0][22] = data[1822]; buffer[0][23] = data[1823]; buffer[0][24] = data[1824]; buffer[0][25] = data[1825]; buffer[0][26] = data[1826]; buffer[0][27] = data[1827]; buffer[0][28] = data[1828]; buffer[0][29] = data[1829]; buffer[0][30] = data[1830]; buffer[0][31] = data[1831]; buffer[0][32] = data[1832]; buffer[0][33] = data[1833]; buffer[0][34] = data[1834]; buffer[0][35] = data[1835]; + + } + if (partition == 51) { + buffer[0][0] = data[1836]; buffer[0][1] = data[1837]; buffer[0][2] = data[1838]; buffer[0][3] = data[1839]; buffer[0][4] = data[1840]; buffer[0][5] = data[1841]; buffer[0][6] = data[1842]; buffer[0][7] = data[1843]; buffer[0][8] = data[1844]; buffer[0][9] = data[1845]; buffer[0][10] = 
data[1846]; buffer[0][11] = data[1847]; buffer[0][12] = data[1848]; buffer[0][13] = data[1849]; buffer[0][14] = data[1850]; buffer[0][15] = data[1851]; buffer[0][16] = data[1852]; buffer[0][17] = data[1853]; buffer[0][18] = data[1854]; buffer[0][19] = data[1855]; buffer[0][20] = data[1856]; buffer[0][21] = data[1857]; buffer[0][22] = data[1858]; buffer[0][23] = data[1859]; buffer[0][24] = data[1860]; buffer[0][25] = data[1861]; buffer[0][26] = data[1862]; buffer[0][27] = data[1863]; buffer[0][28] = data[1864]; buffer[0][29] = data[1865]; buffer[0][30] = data[1866]; buffer[0][31] = data[1867]; buffer[0][32] = data[1868]; buffer[0][33] = data[1869]; buffer[0][34] = data[1870]; buffer[0][35] = data[1871]; + + } + if (partition == 52) { + buffer[0][0] = data[1872]; buffer[0][1] = data[1873]; buffer[0][2] = data[1874]; buffer[0][3] = data[1875]; buffer[0][4] = data[1876]; buffer[0][5] = data[1877]; buffer[0][6] = data[1878]; buffer[0][7] = data[1879]; buffer[0][8] = data[1880]; buffer[0][9] = data[1881]; buffer[0][10] = data[1882]; buffer[0][11] = data[1883]; buffer[0][12] = data[1884]; buffer[0][13] = data[1885]; buffer[0][14] = data[1886]; buffer[0][15] = data[1887]; buffer[0][16] = data[1888]; buffer[0][17] = data[1889]; buffer[0][18] = data[1890]; buffer[0][19] = data[1891]; buffer[0][20] = data[1892]; buffer[0][21] = data[1893]; buffer[0][22] = data[1894]; buffer[0][23] = data[1895]; buffer[0][24] = data[1896]; buffer[0][25] = data[1897]; buffer[0][26] = data[1898]; buffer[0][27] = data[1899]; buffer[0][28] = data[1900]; buffer[0][29] = data[1901]; buffer[0][30] = data[1902]; buffer[0][31] = data[1903]; buffer[0][32] = data[1904]; buffer[0][33] = data[1905]; buffer[0][34] = data[1906]; buffer[0][35] = data[1907]; + + } + if (partition == 53) { + buffer[0][0] = data[1908]; buffer[0][1] = data[1909]; buffer[0][2] = data[1910]; buffer[0][3] = data[1911]; buffer[0][4] = data[1912]; buffer[0][5] = data[1913]; buffer[0][6] = data[1914]; buffer[0][7] = data[1915]; buffer[0][8] = data[1916]; buffer[0][9] = data[1917]; buffer[0][10] = data[1918]; buffer[0][11] = data[1919]; buffer[0][12] = data[1920]; buffer[0][13] = data[1921]; buffer[0][14] = data[1922]; buffer[0][15] = data[1923]; buffer[0][16] = data[1924]; buffer[0][17] = data[1925]; buffer[0][18] = data[1926]; buffer[0][19] = data[1927]; buffer[0][20] = data[1928]; buffer[0][21] = data[1929]; buffer[0][22] = data[1930]; buffer[0][23] = data[1931]; buffer[0][24] = data[1932]; buffer[0][25] = data[1933]; buffer[0][26] = data[1934]; buffer[0][27] = data[1935]; buffer[0][28] = data[1936]; buffer[0][29] = data[1937]; buffer[0][30] = data[1938]; buffer[0][31] = data[1939]; buffer[0][32] = data[1940]; buffer[0][33] = data[1941]; buffer[0][34] = data[1942]; buffer[0][35] = data[1943]; + + } + if (partition == 54) { + buffer[0][0] = data[1944]; buffer[0][1] = data[1945]; buffer[0][2] = data[1946]; buffer[0][3] = data[1947]; buffer[0][4] = data[1948]; buffer[0][5] = data[1949]; buffer[0][6] = data[1950]; buffer[0][7] = data[1951]; buffer[0][8] = data[1952]; buffer[0][9] = data[1953]; buffer[0][10] = data[1954]; buffer[0][11] = data[1955]; buffer[0][12] = data[1956]; buffer[0][13] = data[1957]; buffer[0][14] = data[1958]; buffer[0][15] = data[1959]; buffer[0][16] = data[1960]; buffer[0][17] = data[1961]; buffer[0][18] = data[1962]; buffer[0][19] = data[1963]; buffer[0][20] = data[1964]; buffer[0][21] = data[1965]; buffer[0][22] = data[1966]; buffer[0][23] = data[1967]; buffer[0][24] = data[1968]; buffer[0][25] = data[1969]; buffer[0][26] = data[1970]; 
buffer[0][27] = data[1971]; buffer[0][28] = data[1972]; buffer[0][29] = data[1973]; buffer[0][30] = data[1974]; buffer[0][31] = data[1975]; buffer[0][32] = data[1976]; buffer[0][33] = data[1977]; buffer[0][34] = data[1978]; buffer[0][35] = data[1979]; + + } + if (partition == 55) { + buffer[0][0] = data[1980]; buffer[0][1] = data[1981]; buffer[0][2] = data[1982]; buffer[0][3] = data[1983]; buffer[0][4] = data[1984]; buffer[0][5] = data[1985]; buffer[0][6] = data[1986]; buffer[0][7] = data[1987]; buffer[0][8] = data[1988]; buffer[0][9] = data[1989]; buffer[0][10] = data[1990]; buffer[0][11] = data[1991]; buffer[0][12] = data[1992]; buffer[0][13] = data[1993]; buffer[0][14] = data[1994]; buffer[0][15] = data[1995]; buffer[0][16] = data[1996]; buffer[0][17] = data[1997]; buffer[0][18] = data[1998]; buffer[0][19] = data[1999]; buffer[0][20] = data[2000]; buffer[0][21] = data[2001]; buffer[0][22] = data[2002]; buffer[0][23] = data[2003]; buffer[0][24] = data[2004]; buffer[0][25] = data[2005]; buffer[0][26] = data[2006]; buffer[0][27] = data[2007]; buffer[0][28] = data[2008]; buffer[0][29] = data[2009]; buffer[0][30] = data[2010]; buffer[0][31] = data[2011]; buffer[0][32] = data[2012]; buffer[0][33] = data[2013]; buffer[0][34] = data[2014]; buffer[0][35] = data[2015]; + + } + if (partition == 56) { + buffer[0][0] = data[2016]; buffer[0][1] = data[2017]; buffer[0][2] = data[2018]; buffer[0][3] = data[2019]; buffer[0][4] = data[2020]; buffer[0][5] = data[2021]; buffer[0][6] = data[2022]; buffer[0][7] = data[2023]; buffer[0][8] = data[2024]; buffer[0][9] = data[2025]; buffer[0][10] = data[2026]; buffer[0][11] = data[2027]; buffer[0][12] = data[2028]; buffer[0][13] = data[2029]; buffer[0][14] = data[2030]; buffer[0][15] = data[2031]; buffer[0][16] = data[2032]; buffer[0][17] = data[2033]; buffer[0][18] = data[2034]; buffer[0][19] = data[2035]; buffer[0][20] = data[2036]; buffer[0][21] = data[2037]; buffer[0][22] = data[2038]; buffer[0][23] = data[2039]; buffer[0][24] = data[2040]; buffer[0][25] = data[2041]; buffer[0][26] = data[2042]; buffer[0][27] = data[2043]; buffer[0][28] = data[2044]; buffer[0][29] = data[2045]; buffer[0][30] = data[2046]; buffer[0][31] = data[2047]; buffer[0][32] = data[2048]; buffer[0][33] = data[2049]; buffer[0][34] = data[2050]; buffer[0][35] = data[2051]; + + } + if (partition == 57) { + buffer[0][0] = data[2052]; buffer[0][1] = data[2053]; buffer[0][2] = data[2054]; buffer[0][3] = data[2055]; buffer[0][4] = data[2056]; buffer[0][5] = data[2057]; buffer[0][6] = data[2058]; buffer[0][7] = data[2059]; buffer[0][8] = data[2060]; buffer[0][9] = data[2061]; buffer[0][10] = data[2062]; buffer[0][11] = data[2063]; buffer[0][12] = data[2064]; buffer[0][13] = data[2065]; buffer[0][14] = data[2066]; buffer[0][15] = data[2067]; buffer[0][16] = data[2068]; buffer[0][17] = data[2069]; buffer[0][18] = data[2070]; buffer[0][19] = data[2071]; buffer[0][20] = data[2072]; buffer[0][21] = data[2073]; buffer[0][22] = data[2074]; buffer[0][23] = data[2075]; buffer[0][24] = data[2076]; buffer[0][25] = data[2077]; buffer[0][26] = data[2078]; buffer[0][27] = data[2079]; buffer[0][28] = data[2080]; buffer[0][29] = data[2081]; buffer[0][30] = data[2082]; buffer[0][31] = data[2083]; buffer[0][32] = data[2084]; buffer[0][33] = data[2085]; buffer[0][34] = data[2086]; buffer[0][35] = data[2087]; + + } + if (partition == 58) { + buffer[0][0] = data[2088]; buffer[0][1] = data[2089]; buffer[0][2] = data[2090]; buffer[0][3] = data[2091]; buffer[0][4] = data[2092]; buffer[0][5] = data[2093]; buffer[0][6] = 
data[2094]; buffer[0][7] = data[2095]; buffer[0][8] = data[2096]; buffer[0][9] = data[2097]; buffer[0][10] = data[2098]; buffer[0][11] = data[2099]; buffer[0][12] = data[2100]; buffer[0][13] = data[2101]; buffer[0][14] = data[2102]; buffer[0][15] = data[2103]; buffer[0][16] = data[2104]; buffer[0][17] = data[2105]; buffer[0][18] = data[2106]; buffer[0][19] = data[2107]; buffer[0][20] = data[2108]; buffer[0][21] = data[2109]; buffer[0][22] = data[2110]; buffer[0][23] = data[2111]; buffer[0][24] = data[2112]; buffer[0][25] = data[2113]; buffer[0][26] = data[2114]; buffer[0][27] = data[2115]; buffer[0][28] = data[2116]; buffer[0][29] = data[2117]; buffer[0][30] = data[2118]; buffer[0][31] = data[2119]; buffer[0][32] = data[2120]; buffer[0][33] = data[2121]; buffer[0][34] = data[2122]; buffer[0][35] = data[2123]; + + } + if (partition == 59) { + buffer[0][0] = data[2124]; buffer[0][1] = data[2125]; buffer[0][2] = data[2126]; buffer[0][3] = data[2127]; buffer[0][4] = data[2128]; buffer[0][5] = data[2129]; buffer[0][6] = data[2130]; buffer[0][7] = data[2131]; buffer[0][8] = data[2132]; buffer[0][9] = data[2133]; buffer[0][10] = data[2134]; buffer[0][11] = data[2135]; buffer[0][12] = data[2136]; buffer[0][13] = data[2137]; buffer[0][14] = data[2138]; buffer[0][15] = data[2139]; buffer[0][16] = data[2140]; buffer[0][17] = data[2141]; buffer[0][18] = data[2142]; buffer[0][19] = data[2143]; buffer[0][20] = data[2144]; buffer[0][21] = data[2145]; buffer[0][22] = data[2146]; buffer[0][23] = data[2147]; buffer[0][24] = data[2148]; buffer[0][25] = data[2149]; buffer[0][26] = data[2150]; buffer[0][27] = data[2151]; buffer[0][28] = data[2152]; buffer[0][29] = data[2153]; buffer[0][30] = data[2154]; buffer[0][31] = data[2155]; buffer[0][32] = data[2156]; buffer[0][33] = data[2157]; buffer[0][34] = data[2158]; buffer[0][35] = data[2159]; + + } + if (partition == 60) { + buffer[0][0] = data[2160]; buffer[0][1] = data[2161]; buffer[0][2] = data[2162]; buffer[0][3] = data[2163]; buffer[0][4] = data[2164]; buffer[0][5] = data[2165]; buffer[0][6] = data[2166]; buffer[0][7] = data[2167]; buffer[0][8] = data[2168]; buffer[0][9] = data[2169]; buffer[0][10] = data[2170]; buffer[0][11] = data[2171]; buffer[0][12] = data[2172]; buffer[0][13] = data[2173]; buffer[0][14] = data[2174]; buffer[0][15] = data[2175]; buffer[0][16] = data[2176]; buffer[0][17] = data[2177]; buffer[0][18] = data[2178]; buffer[0][19] = data[2179]; buffer[0][20] = data[2180]; buffer[0][21] = data[2181]; buffer[0][22] = data[2182]; buffer[0][23] = data[2183]; buffer[0][24] = data[2184]; buffer[0][25] = data[2185]; buffer[0][26] = data[2186]; buffer[0][27] = data[2187]; buffer[0][28] = data[2188]; buffer[0][29] = data[2189]; buffer[0][30] = data[2190]; buffer[0][31] = data[2191]; buffer[0][32] = data[2192]; buffer[0][33] = data[2193]; buffer[0][34] = data[2194]; buffer[0][35] = data[2195]; + + } + if (partition == 61) { + buffer[0][0] = data[2196]; buffer[0][1] = data[2197]; buffer[0][2] = data[2198]; buffer[0][3] = data[2199]; buffer[0][4] = data[2200]; buffer[0][5] = data[2201]; buffer[0][6] = data[2202]; buffer[0][7] = data[2203]; buffer[0][8] = data[2204]; buffer[0][9] = data[2205]; buffer[0][10] = data[2206]; buffer[0][11] = data[2207]; buffer[0][12] = data[2208]; buffer[0][13] = data[2209]; buffer[0][14] = data[2210]; buffer[0][15] = data[2211]; buffer[0][16] = data[2212]; buffer[0][17] = data[2213]; buffer[0][18] = data[2214]; buffer[0][19] = data[2215]; buffer[0][20] = data[2216]; buffer[0][21] = data[2217]; buffer[0][22] = data[2218]; 
buffer[0][23] = data[2219]; buffer[0][24] = data[2220]; buffer[0][25] = data[2221]; buffer[0][26] = data[2222]; buffer[0][27] = data[2223]; buffer[0][28] = data[2224]; buffer[0][29] = data[2225]; buffer[0][30] = data[2226]; buffer[0][31] = data[2227]; buffer[0][32] = data[2228]; buffer[0][33] = data[2229]; buffer[0][34] = data[2230]; buffer[0][35] = data[2231]; + + } + if (partition == 62) { + buffer[0][0] = data[2232]; buffer[0][1] = data[2233]; buffer[0][2] = data[2234]; buffer[0][3] = data[2235]; buffer[0][4] = data[2236]; buffer[0][5] = data[2237]; buffer[0][6] = data[2238]; buffer[0][7] = data[2239]; buffer[0][8] = data[2240]; buffer[0][9] = data[2241]; buffer[0][10] = data[2242]; buffer[0][11] = data[2243]; buffer[0][12] = data[2244]; buffer[0][13] = data[2245]; buffer[0][14] = data[2246]; buffer[0][15] = data[2247]; buffer[0][16] = data[2248]; buffer[0][17] = data[2249]; buffer[0][18] = data[2250]; buffer[0][19] = data[2251]; buffer[0][20] = data[2252]; buffer[0][21] = data[2253]; buffer[0][22] = data[2254]; buffer[0][23] = data[2255]; buffer[0][24] = data[2256]; buffer[0][25] = data[2257]; buffer[0][26] = data[2258]; buffer[0][27] = data[2259]; buffer[0][28] = data[2260]; buffer[0][29] = data[2261]; buffer[0][30] = data[2262]; buffer[0][31] = data[2263]; buffer[0][32] = data[2264]; buffer[0][33] = data[2265]; buffer[0][34] = data[2266]; buffer[0][35] = data[2267]; + + } + if (partition == 63) { + buffer[0][0] = data[2268]; buffer[0][1] = data[2269]; buffer[0][2] = data[2270]; buffer[0][3] = data[2271]; buffer[0][4] = data[2272]; buffer[0][5] = data[2273]; buffer[0][6] = data[2274]; buffer[0][7] = data[2275]; buffer[0][8] = data[2276]; buffer[0][9] = data[2277]; buffer[0][10] = data[2278]; buffer[0][11] = data[2279]; buffer[0][12] = data[2280]; buffer[0][13] = data[2281]; buffer[0][14] = data[2282]; buffer[0][15] = data[2283]; buffer[0][16] = data[2284]; buffer[0][17] = data[2285]; buffer[0][18] = data[2286]; buffer[0][19] = data[2287]; buffer[0][20] = data[2288]; buffer[0][21] = data[2289]; buffer[0][22] = data[2290]; buffer[0][23] = data[2291]; buffer[0][24] = data[2292]; buffer[0][25] = data[2293]; buffer[0][26] = data[2294]; buffer[0][27] = data[2295]; buffer[0][28] = data[2296]; buffer[0][29] = data[2297]; buffer[0][30] = data[2298]; buffer[0][31] = data[2299]; buffer[0][32] = data[2300]; buffer[0][33] = data[2301]; buffer[0][34] = data[2302]; buffer[0][35] = data[2303]; + + } + if (partition == 64) { + buffer[0][0] = data[2304]; buffer[0][1] = data[2305]; buffer[0][2] = data[2306]; buffer[0][3] = data[2307]; buffer[0][4] = data[2308]; buffer[0][5] = data[2309]; buffer[0][6] = data[2310]; buffer[0][7] = data[2311]; buffer[0][8] = data[2312]; buffer[0][9] = data[2313]; buffer[0][10] = data[2314]; buffer[0][11] = data[2315]; buffer[0][12] = data[2316]; buffer[0][13] = data[2317]; buffer[0][14] = data[2318]; buffer[0][15] = data[2319]; buffer[0][16] = data[2320]; buffer[0][17] = data[2321]; buffer[0][18] = data[2322]; buffer[0][19] = data[2323]; buffer[0][20] = data[2324]; buffer[0][21] = data[2325]; buffer[0][22] = data[2326]; buffer[0][23] = data[2327]; buffer[0][24] = data[2328]; buffer[0][25] = data[2329]; buffer[0][26] = data[2330]; buffer[0][27] = data[2331]; buffer[0][28] = data[2332]; buffer[0][29] = data[2333]; buffer[0][30] = data[2334]; buffer[0][31] = data[2335]; buffer[0][32] = data[2336]; buffer[0][33] = data[2337]; buffer[0][34] = data[2338]; buffer[0][35] = data[2339]; + + } + if (partition == 65) { + buffer[0][0] = data[2340]; buffer[0][1] = data[2341]; 
buffer[0][2] = data[2342]; buffer[0][3] = data[2343]; buffer[0][4] = data[2344]; buffer[0][5] = data[2345]; buffer[0][6] = data[2346]; buffer[0][7] = data[2347]; buffer[0][8] = data[2348]; buffer[0][9] = data[2349]; buffer[0][10] = data[2350]; buffer[0][11] = data[2351]; buffer[0][12] = data[2352]; buffer[0][13] = data[2353]; buffer[0][14] = data[2354]; buffer[0][15] = data[2355]; buffer[0][16] = data[2356]; buffer[0][17] = data[2357]; buffer[0][18] = data[2358]; buffer[0][19] = data[2359]; buffer[0][20] = data[2360]; buffer[0][21] = data[2361]; buffer[0][22] = data[2362]; buffer[0][23] = data[2363]; buffer[0][24] = data[2364]; buffer[0][25] = data[2365]; buffer[0][26] = data[2366]; buffer[0][27] = data[2367]; buffer[0][28] = data[2368]; buffer[0][29] = data[2369]; buffer[0][30] = data[2370]; buffer[0][31] = data[2371]; buffer[0][32] = data[2372]; buffer[0][33] = data[2373]; buffer[0][34] = data[2374]; buffer[0][35] = data[2375]; + + } + if (partition == 66) { + buffer[0][0] = data[2376]; buffer[0][1] = data[2377]; buffer[0][2] = data[2378]; buffer[0][3] = data[2379]; buffer[0][4] = data[2380]; buffer[0][5] = data[2381]; buffer[0][6] = data[2382]; buffer[0][7] = data[2383]; buffer[0][8] = data[2384]; buffer[0][9] = data[2385]; buffer[0][10] = data[2386]; buffer[0][11] = data[2387]; buffer[0][12] = data[2388]; buffer[0][13] = data[2389]; buffer[0][14] = data[2390]; buffer[0][15] = data[2391]; buffer[0][16] = data[2392]; buffer[0][17] = data[2393]; buffer[0][18] = data[2394]; buffer[0][19] = data[2395]; buffer[0][20] = data[2396]; buffer[0][21] = data[2397]; buffer[0][22] = data[2398]; buffer[0][23] = data[2399]; buffer[0][24] = data[2400]; buffer[0][25] = data[2401]; buffer[0][26] = data[2402]; buffer[0][27] = data[2403]; buffer[0][28] = data[2404]; buffer[0][29] = data[2405]; buffer[0][30] = data[2406]; buffer[0][31] = data[2407]; buffer[0][32] = data[2408]; buffer[0][33] = data[2409]; buffer[0][34] = data[2410]; buffer[0][35] = data[2411]; + + } + if (partition == 67) { + buffer[0][0] = data[2412]; buffer[0][1] = data[2413]; buffer[0][2] = data[2414]; buffer[0][3] = data[2415]; buffer[0][4] = data[2416]; buffer[0][5] = data[2417]; buffer[0][6] = data[2418]; buffer[0][7] = data[2419]; buffer[0][8] = data[2420]; buffer[0][9] = data[2421]; buffer[0][10] = data[2422]; buffer[0][11] = data[2423]; buffer[0][12] = data[2424]; buffer[0][13] = data[2425]; buffer[0][14] = data[2426]; buffer[0][15] = data[2427]; buffer[0][16] = data[2428]; buffer[0][17] = data[2429]; buffer[0][18] = data[2430]; buffer[0][19] = data[2431]; buffer[0][20] = data[2432]; buffer[0][21] = data[2433]; buffer[0][22] = data[2434]; buffer[0][23] = data[2435]; buffer[0][24] = data[2436]; buffer[0][25] = data[2437]; buffer[0][26] = data[2438]; buffer[0][27] = data[2439]; buffer[0][28] = data[2440]; buffer[0][29] = data[2441]; buffer[0][30] = data[2442]; buffer[0][31] = data[2443]; buffer[0][32] = data[2444]; buffer[0][33] = data[2445]; buffer[0][34] = data[2446]; buffer[0][35] = data[2447]; + + } + if (partition == 68) { + buffer[0][0] = data[2448]; buffer[0][1] = data[2449]; buffer[0][2] = data[2450]; buffer[0][3] = data[2451]; buffer[0][4] = data[2452]; buffer[0][5] = data[2453]; buffer[0][6] = data[2454]; buffer[0][7] = data[2455]; buffer[0][8] = data[2456]; buffer[0][9] = data[2457]; buffer[0][10] = data[2458]; buffer[0][11] = data[2459]; buffer[0][12] = data[2460]; buffer[0][13] = data[2461]; buffer[0][14] = data[2462]; buffer[0][15] = data[2463]; buffer[0][16] = data[2464]; buffer[0][17] = data[2465]; buffer[0][18] = 
data[2466]; buffer[0][19] = data[2467]; buffer[0][20] = data[2468]; buffer[0][21] = data[2469]; buffer[0][22] = data[2470]; buffer[0][23] = data[2471]; buffer[0][24] = data[2472]; buffer[0][25] = data[2473]; buffer[0][26] = data[2474]; buffer[0][27] = data[2475]; buffer[0][28] = data[2476]; buffer[0][29] = data[2477]; buffer[0][30] = data[2478]; buffer[0][31] = data[2479]; buffer[0][32] = data[2480]; buffer[0][33] = data[2481]; buffer[0][34] = data[2482]; buffer[0][35] = data[2483]; + + } + if (partition == 69) { + buffer[0][0] = data[2484]; buffer[0][1] = data[2485]; buffer[0][2] = data[2486]; buffer[0][3] = data[2487]; buffer[0][4] = data[2488]; buffer[0][5] = data[2489]; buffer[0][6] = data[2490]; buffer[0][7] = data[2491]; buffer[0][8] = data[2492]; buffer[0][9] = data[2493]; buffer[0][10] = data[2494]; buffer[0][11] = data[2495]; buffer[0][12] = data[2496]; buffer[0][13] = data[2497]; buffer[0][14] = data[2498]; buffer[0][15] = data[2499]; buffer[0][16] = data[2500]; buffer[0][17] = data[2501]; buffer[0][18] = data[2502]; buffer[0][19] = data[2503]; buffer[0][20] = data[2504]; buffer[0][21] = data[2505]; buffer[0][22] = data[2506]; buffer[0][23] = data[2507]; buffer[0][24] = data[2508]; buffer[0][25] = data[2509]; buffer[0][26] = data[2510]; buffer[0][27] = data[2511]; buffer[0][28] = data[2512]; buffer[0][29] = data[2513]; buffer[0][30] = data[2514]; buffer[0][31] = data[2515]; buffer[0][32] = data[2516]; buffer[0][33] = data[2517]; buffer[0][34] = data[2518]; buffer[0][35] = data[2519]; + + } + if (partition == 70) { + buffer[0][0] = data[2520]; buffer[0][1] = data[2521]; buffer[0][2] = data[2522]; buffer[0][3] = data[2523]; buffer[0][4] = data[2524]; buffer[0][5] = data[2525]; buffer[0][6] = data[2526]; buffer[0][7] = data[2527]; buffer[0][8] = data[2528]; buffer[0][9] = data[2529]; buffer[0][10] = data[2530]; buffer[0][11] = data[2531]; buffer[0][12] = data[2532]; buffer[0][13] = data[2533]; buffer[0][14] = data[2534]; buffer[0][15] = data[2535]; buffer[0][16] = data[2536]; buffer[0][17] = data[2537]; buffer[0][18] = data[2538]; buffer[0][19] = data[2539]; buffer[0][20] = data[2540]; buffer[0][21] = data[2541]; buffer[0][22] = data[2542]; buffer[0][23] = data[2543]; buffer[0][24] = data[2544]; buffer[0][25] = data[2545]; buffer[0][26] = data[2546]; buffer[0][27] = data[2547]; buffer[0][28] = data[2548]; buffer[0][29] = data[2549]; buffer[0][30] = data[2550]; buffer[0][31] = data[2551]; buffer[0][32] = data[2552]; buffer[0][33] = data[2553]; buffer[0][34] = data[2554]; buffer[0][35] = data[2555]; + + } + if (partition == 71) { + buffer[0][0] = data[2556]; buffer[0][1] = data[2557]; buffer[0][2] = data[2558]; buffer[0][3] = data[2559]; buffer[0][4] = data[2560]; buffer[0][5] = data[2561]; buffer[0][6] = data[2562]; buffer[0][7] = data[2563]; buffer[0][8] = data[2564]; buffer[0][9] = data[2565]; buffer[0][10] = data[2566]; buffer[0][11] = data[2567]; buffer[0][12] = data[2568]; buffer[0][13] = data[2569]; buffer[0][14] = data[2570]; buffer[0][15] = data[2571]; buffer[0][16] = data[2572]; buffer[0][17] = data[2573]; buffer[0][18] = data[2574]; buffer[0][19] = data[2575]; buffer[0][20] = data[2576]; buffer[0][21] = data[2577]; buffer[0][22] = data[2578]; buffer[0][23] = data[2579]; buffer[0][24] = data[2580]; buffer[0][25] = data[2581]; buffer[0][26] = data[2582]; buffer[0][27] = data[2583]; buffer[0][28] = data[2584]; buffer[0][29] = data[2585]; buffer[0][30] = data[2586]; buffer[0][31] = data[2587]; buffer[0][32] = data[2588]; buffer[0][33] = data[2589]; buffer[0][34] = data[2590]; 
buffer[0][35] = data[2591]; + + } + if (partition == 72) { + buffer[0][0] = data[2592]; buffer[0][1] = data[2593]; buffer[0][2] = data[2594]; buffer[0][3] = data[2595]; buffer[0][4] = data[2596]; buffer[0][5] = data[2597]; buffer[0][6] = data[2598]; buffer[0][7] = data[2599]; buffer[0][8] = data[2600]; buffer[0][9] = data[2601]; buffer[0][10] = data[2602]; buffer[0][11] = data[2603]; buffer[0][12] = data[2604]; buffer[0][13] = data[2605]; buffer[0][14] = data[2606]; buffer[0][15] = data[2607]; buffer[0][16] = data[2608]; buffer[0][17] = data[2609]; buffer[0][18] = data[2610]; buffer[0][19] = data[2611]; buffer[0][20] = data[2612]; buffer[0][21] = data[2613]; buffer[0][22] = data[2614]; buffer[0][23] = data[2615]; buffer[0][24] = data[2616]; buffer[0][25] = data[2617]; buffer[0][26] = data[2618]; buffer[0][27] = data[2619]; buffer[0][28] = data[2620]; buffer[0][29] = data[2621]; buffer[0][30] = data[2622]; buffer[0][31] = data[2623]; buffer[0][32] = data[2624]; buffer[0][33] = data[2625]; buffer[0][34] = data[2626]; buffer[0][35] = data[2627]; + + } + if (partition == 73) { + buffer[0][0] = data[2628]; buffer[0][1] = data[2629]; buffer[0][2] = data[2630]; buffer[0][3] = data[2631]; buffer[0][4] = data[2632]; buffer[0][5] = data[2633]; buffer[0][6] = data[2634]; buffer[0][7] = data[2635]; buffer[0][8] = data[2636]; buffer[0][9] = data[2637]; buffer[0][10] = data[2638]; buffer[0][11] = data[2639]; buffer[0][12] = data[2640]; buffer[0][13] = data[2641]; buffer[0][14] = data[2642]; buffer[0][15] = data[2643]; buffer[0][16] = data[2644]; buffer[0][17] = data[2645]; buffer[0][18] = data[2646]; buffer[0][19] = data[2647]; buffer[0][20] = data[2648]; buffer[0][21] = data[2649]; buffer[0][22] = data[2650]; buffer[0][23] = data[2651]; buffer[0][24] = data[2652]; buffer[0][25] = data[2653]; buffer[0][26] = data[2654]; buffer[0][27] = data[2655]; buffer[0][28] = data[2656]; buffer[0][29] = data[2657]; buffer[0][30] = data[2658]; buffer[0][31] = data[2659]; buffer[0][32] = data[2660]; buffer[0][33] = data[2661]; buffer[0][34] = data[2662]; buffer[0][35] = data[2663]; + + } + if (partition == 74) { + buffer[0][0] = data[2664]; buffer[0][1] = data[2665]; buffer[0][2] = data[2666]; buffer[0][3] = data[2667]; buffer[0][4] = data[2668]; buffer[0][5] = data[2669]; buffer[0][6] = data[2670]; buffer[0][7] = data[2671]; buffer[0][8] = data[2672]; buffer[0][9] = data[2673]; buffer[0][10] = data[2674]; buffer[0][11] = data[2675]; buffer[0][12] = data[2676]; buffer[0][13] = data[2677]; buffer[0][14] = data[2678]; buffer[0][15] = data[2679]; buffer[0][16] = data[2680]; buffer[0][17] = data[2681]; buffer[0][18] = data[2682]; buffer[0][19] = data[2683]; buffer[0][20] = data[2684]; buffer[0][21] = data[2685]; buffer[0][22] = data[2686]; buffer[0][23] = data[2687]; buffer[0][24] = data[2688]; buffer[0][25] = data[2689]; buffer[0][26] = data[2690]; buffer[0][27] = data[2691]; buffer[0][28] = data[2692]; buffer[0][29] = data[2693]; buffer[0][30] = data[2694]; buffer[0][31] = data[2695]; buffer[0][32] = data[2696]; buffer[0][33] = data[2697]; buffer[0][34] = data[2698]; buffer[0][35] = data[2699]; + + } + if (partition == 75) { + buffer[0][0] = data[2700]; buffer[0][1] = data[2701]; buffer[0][2] = data[2702]; buffer[0][3] = data[2703]; buffer[0][4] = data[2704]; buffer[0][5] = data[2705]; buffer[0][6] = data[2706]; buffer[0][7] = data[2707]; buffer[0][8] = data[2708]; buffer[0][9] = data[2709]; buffer[0][10] = data[2710]; buffer[0][11] = data[2711]; buffer[0][12] = data[2712]; buffer[0][13] = data[2713]; buffer[0][14] = 
data[2714]; buffer[0][15] = data[2715]; buffer[0][16] = data[2716]; buffer[0][17] = data[2717]; buffer[0][18] = data[2718]; buffer[0][19] = data[2719]; buffer[0][20] = data[2720]; buffer[0][21] = data[2721]; buffer[0][22] = data[2722]; buffer[0][23] = data[2723]; buffer[0][24] = data[2724]; buffer[0][25] = data[2725]; buffer[0][26] = data[2726]; buffer[0][27] = data[2727]; buffer[0][28] = data[2728]; buffer[0][29] = data[2729]; buffer[0][30] = data[2730]; buffer[0][31] = data[2731]; buffer[0][32] = data[2732]; buffer[0][33] = data[2733]; buffer[0][34] = data[2734]; buffer[0][35] = data[2735]; + + } + if (partition == 76) { + buffer[0][0] = data[2736]; buffer[0][1] = data[2737]; buffer[0][2] = data[2738]; buffer[0][3] = data[2739]; buffer[0][4] = data[2740]; buffer[0][5] = data[2741]; buffer[0][6] = data[2742]; buffer[0][7] = data[2743]; buffer[0][8] = data[2744]; buffer[0][9] = data[2745]; buffer[0][10] = data[2746]; buffer[0][11] = data[2747]; buffer[0][12] = data[2748]; buffer[0][13] = data[2749]; buffer[0][14] = data[2750]; buffer[0][15] = data[2751]; buffer[0][16] = data[2752]; buffer[0][17] = data[2753]; buffer[0][18] = data[2754]; buffer[0][19] = data[2755]; buffer[0][20] = data[2756]; buffer[0][21] = data[2757]; buffer[0][22] = data[2758]; buffer[0][23] = data[2759]; buffer[0][24] = data[2760]; buffer[0][25] = data[2761]; buffer[0][26] = data[2762]; buffer[0][27] = data[2763]; buffer[0][28] = data[2764]; buffer[0][29] = data[2765]; buffer[0][30] = data[2766]; buffer[0][31] = data[2767]; buffer[0][32] = data[2768]; buffer[0][33] = data[2769]; buffer[0][34] = data[2770]; buffer[0][35] = data[2771]; + + } + if (partition == 77) { + buffer[0][0] = data[2772]; buffer[0][1] = data[2773]; buffer[0][2] = data[2774]; buffer[0][3] = data[2775]; buffer[0][4] = data[2776]; buffer[0][5] = data[2777]; buffer[0][6] = data[2778]; buffer[0][7] = data[2779]; buffer[0][8] = data[2780]; buffer[0][9] = data[2781]; buffer[0][10] = data[2782]; buffer[0][11] = data[2783]; buffer[0][12] = data[2784]; buffer[0][13] = data[2785]; buffer[0][14] = data[2786]; buffer[0][15] = data[2787]; buffer[0][16] = data[2788]; buffer[0][17] = data[2789]; buffer[0][18] = data[2790]; buffer[0][19] = data[2791]; buffer[0][20] = data[2792]; buffer[0][21] = data[2793]; buffer[0][22] = data[2794]; buffer[0][23] = data[2795]; buffer[0][24] = data[2796]; buffer[0][25] = data[2797]; buffer[0][26] = data[2798]; buffer[0][27] = data[2799]; buffer[0][28] = data[2800]; buffer[0][29] = data[2801]; buffer[0][30] = data[2802]; buffer[0][31] = data[2803]; buffer[0][32] = data[2804]; buffer[0][33] = data[2805]; buffer[0][34] = data[2806]; buffer[0][35] = data[2807]; + + } + if (partition == 78) { + buffer[0][0] = data[2808]; buffer[0][1] = data[2809]; buffer[0][2] = data[2810]; buffer[0][3] = data[2811]; buffer[0][4] = data[2812]; buffer[0][5] = data[2813]; buffer[0][6] = data[2814]; buffer[0][7] = data[2815]; buffer[0][8] = data[2816]; buffer[0][9] = data[2817]; buffer[0][10] = data[2818]; buffer[0][11] = data[2819]; buffer[0][12] = data[2820]; buffer[0][13] = data[2821]; buffer[0][14] = data[2822]; buffer[0][15] = data[2823]; buffer[0][16] = data[2824]; buffer[0][17] = data[2825]; buffer[0][18] = data[2826]; buffer[0][19] = data[2827]; buffer[0][20] = data[2828]; buffer[0][21] = data[2829]; buffer[0][22] = data[2830]; buffer[0][23] = data[2831]; buffer[0][24] = data[2832]; buffer[0][25] = data[2833]; buffer[0][26] = data[2834]; buffer[0][27] = data[2835]; buffer[0][28] = data[2836]; buffer[0][29] = data[2837]; buffer[0][30] = data[2838]; 
buffer[0][31] = data[2839]; buffer[0][32] = data[2840]; buffer[0][33] = data[2841]; buffer[0][34] = data[2842]; buffer[0][35] = data[2843]; + + } + if (partition == 79) { + buffer[0][0] = data[2844]; buffer[0][1] = data[2845]; buffer[0][2] = data[2846]; buffer[0][3] = data[2847]; buffer[0][4] = data[2848]; buffer[0][5] = data[2849]; buffer[0][6] = data[2850]; buffer[0][7] = data[2851]; buffer[0][8] = data[2852]; buffer[0][9] = data[2853]; buffer[0][10] = data[2854]; buffer[0][11] = data[2855]; buffer[0][12] = data[2856]; buffer[0][13] = data[2857]; buffer[0][14] = data[2858]; buffer[0][15] = data[2859]; buffer[0][16] = data[2860]; buffer[0][17] = data[2861]; buffer[0][18] = data[2862]; buffer[0][19] = data[2863]; buffer[0][20] = data[2864]; buffer[0][21] = data[2865]; buffer[0][22] = data[2866]; buffer[0][23] = data[2867]; buffer[0][24] = data[2868]; buffer[0][25] = data[2869]; buffer[0][26] = data[2870]; buffer[0][27] = data[2871]; buffer[0][28] = data[2872]; buffer[0][29] = data[2873]; buffer[0][30] = data[2874]; buffer[0][31] = data[2875]; buffer[0][32] = data[2876]; buffer[0][33] = data[2877]; buffer[0][34] = data[2878]; buffer[0][35] = data[2879]; + + } + if (partition == 80) { + buffer[0][0] = data[2880]; buffer[0][1] = data[2881]; buffer[0][2] = data[2882]; buffer[0][3] = data[2883]; buffer[0][4] = data[2884]; buffer[0][5] = data[2885]; buffer[0][6] = data[2886]; buffer[0][7] = data[2887]; buffer[0][8] = data[2888]; buffer[0][9] = data[2889]; buffer[0][10] = data[2890]; buffer[0][11] = data[2891]; buffer[0][12] = data[2892]; buffer[0][13] = data[2893]; buffer[0][14] = data[2894]; buffer[0][15] = data[2895]; buffer[0][16] = data[2896]; buffer[0][17] = data[2897]; buffer[0][18] = data[2898]; buffer[0][19] = data[2899]; buffer[0][20] = data[2900]; buffer[0][21] = data[2901]; buffer[0][22] = data[2902]; buffer[0][23] = data[2903]; buffer[0][24] = data[2904]; buffer[0][25] = data[2905]; buffer[0][26] = data[2906]; buffer[0][27] = data[2907]; buffer[0][28] = data[2908]; buffer[0][29] = data[2909]; buffer[0][30] = data[2910]; buffer[0][31] = data[2911]; buffer[0][32] = data[2912]; buffer[0][33] = data[2913]; buffer[0][34] = data[2914]; buffer[0][35] = data[2915]; + + } + if (partition == 81) { + buffer[0][0] = data[2916]; buffer[0][1] = data[2917]; buffer[0][2] = data[2918]; buffer[0][3] = data[2919]; buffer[0][4] = data[2920]; buffer[0][5] = data[2921]; buffer[0][6] = data[2922]; buffer[0][7] = data[2923]; buffer[0][8] = data[2924]; buffer[0][9] = data[2925]; buffer[0][10] = data[2926]; buffer[0][11] = data[2927]; buffer[0][12] = data[2928]; buffer[0][13] = data[2929]; buffer[0][14] = data[2930]; buffer[0][15] = data[2931]; buffer[0][16] = data[2932]; buffer[0][17] = data[2933]; buffer[0][18] = data[2934]; buffer[0][19] = data[2935]; buffer[0][20] = data[2936]; buffer[0][21] = data[2937]; buffer[0][22] = data[2938]; buffer[0][23] = data[2939]; buffer[0][24] = data[2940]; buffer[0][25] = data[2941]; buffer[0][26] = data[2942]; buffer[0][27] = data[2943]; buffer[0][28] = data[2944]; buffer[0][29] = data[2945]; buffer[0][30] = data[2946]; buffer[0][31] = data[2947]; buffer[0][32] = data[2948]; buffer[0][33] = data[2949]; buffer[0][34] = data[2950]; buffer[0][35] = data[2951]; + + } + if (partition == 82) { + buffer[0][0] = data[2952]; buffer[0][1] = data[2953]; buffer[0][2] = data[2954]; buffer[0][3] = data[2955]; buffer[0][4] = data[2956]; buffer[0][5] = data[2957]; buffer[0][6] = data[2958]; buffer[0][7] = data[2959]; buffer[0][8] = data[2960]; buffer[0][9] = data[2961]; buffer[0][10] = 
data[2962]; buffer[0][11] = data[2963]; buffer[0][12] = data[2964]; buffer[0][13] = data[2965]; buffer[0][14] = data[2966]; buffer[0][15] = data[2967]; buffer[0][16] = data[2968]; buffer[0][17] = data[2969]; buffer[0][18] = data[2970]; buffer[0][19] = data[2971]; buffer[0][20] = data[2972]; buffer[0][21] = data[2973]; buffer[0][22] = data[2974]; buffer[0][23] = data[2975]; buffer[0][24] = data[2976]; buffer[0][25] = data[2977]; buffer[0][26] = data[2978]; buffer[0][27] = data[2979]; buffer[0][28] = data[2980]; buffer[0][29] = data[2981]; buffer[0][30] = data[2982]; buffer[0][31] = data[2983]; buffer[0][32] = data[2984]; buffer[0][33] = data[2985]; buffer[0][34] = data[2986]; buffer[0][35] = data[2987]; + + } + if (partition == 83) { + buffer[0][0] = data[2988]; buffer[0][1] = data[2989]; buffer[0][2] = data[2990]; buffer[0][3] = data[2991]; buffer[0][4] = data[2992]; buffer[0][5] = data[2993]; buffer[0][6] = data[2994]; buffer[0][7] = data[2995]; buffer[0][8] = data[2996]; buffer[0][9] = data[2997]; buffer[0][10] = data[2998]; buffer[0][11] = data[2999]; buffer[0][12] = data[3000]; buffer[0][13] = data[3001]; buffer[0][14] = data[3002]; buffer[0][15] = data[3003]; buffer[0][16] = data[3004]; buffer[0][17] = data[3005]; buffer[0][18] = data[3006]; buffer[0][19] = data[3007]; buffer[0][20] = data[3008]; buffer[0][21] = data[3009]; buffer[0][22] = data[3010]; buffer[0][23] = data[3011]; buffer[0][24] = data[3012]; buffer[0][25] = data[3013]; buffer[0][26] = data[3014]; buffer[0][27] = data[3015]; buffer[0][28] = data[3016]; buffer[0][29] = data[3017]; buffer[0][30] = data[3018]; buffer[0][31] = data[3019]; buffer[0][32] = data[3020]; buffer[0][33] = data[3021]; buffer[0][34] = data[3022]; buffer[0][35] = data[3023]; + + } + if (partition == 84) { + buffer[0][0] = data[3024]; buffer[0][1] = data[3025]; buffer[0][2] = data[3026]; buffer[0][3] = data[3027]; buffer[0][4] = data[3028]; buffer[0][5] = data[3029]; buffer[0][6] = data[3030]; buffer[0][7] = data[3031]; buffer[0][8] = data[3032]; buffer[0][9] = data[3033]; buffer[0][10] = data[3034]; buffer[0][11] = data[3035]; buffer[0][12] = data[3036]; buffer[0][13] = data[3037]; buffer[0][14] = data[3038]; buffer[0][15] = data[3039]; buffer[0][16] = data[3040]; buffer[0][17] = data[3041]; buffer[0][18] = data[3042]; buffer[0][19] = data[3043]; buffer[0][20] = data[3044]; buffer[0][21] = data[3045]; buffer[0][22] = data[3046]; buffer[0][23] = data[3047]; buffer[0][24] = data[3048]; buffer[0][25] = data[3049]; buffer[0][26] = data[3050]; buffer[0][27] = data[3051]; buffer[0][28] = data[3052]; buffer[0][29] = data[3053]; buffer[0][30] = data[3054]; buffer[0][31] = data[3055]; buffer[0][32] = data[3056]; buffer[0][33] = data[3057]; buffer[0][34] = data[3058]; buffer[0][35] = data[3059]; + + } + if (partition == 85) { + buffer[0][0] = data[3060]; buffer[0][1] = data[3061]; buffer[0][2] = data[3062]; buffer[0][3] = data[3063]; buffer[0][4] = data[3064]; buffer[0][5] = data[3065]; buffer[0][6] = data[3066]; buffer[0][7] = data[3067]; buffer[0][8] = data[3068]; buffer[0][9] = data[3069]; buffer[0][10] = data[3070]; buffer[0][11] = data[3071]; buffer[0][12] = data[3072]; buffer[0][13] = data[3073]; buffer[0][14] = data[3074]; buffer[0][15] = data[3075]; buffer[0][16] = data[3076]; buffer[0][17] = data[3077]; buffer[0][18] = data[3078]; buffer[0][19] = data[3079]; buffer[0][20] = data[3080]; buffer[0][21] = data[3081]; buffer[0][22] = data[3082]; buffer[0][23] = data[3083]; buffer[0][24] = data[3084]; buffer[0][25] = data[3085]; buffer[0][26] = data[3086]; 
buffer[0][27] = data[3087]; buffer[0][28] = data[3088]; buffer[0][29] = data[3089]; buffer[0][30] = data[3090]; buffer[0][31] = data[3091]; buffer[0][32] = data[3092]; buffer[0][33] = data[3093]; buffer[0][34] = data[3094]; buffer[0][35] = data[3095]; + + } + if (partition == 86) { + buffer[0][0] = data[3096]; buffer[0][1] = data[3097]; buffer[0][2] = data[3098]; buffer[0][3] = data[3099]; buffer[0][4] = data[3100]; buffer[0][5] = data[3101]; buffer[0][6] = data[3102]; buffer[0][7] = data[3103]; buffer[0][8] = data[3104]; buffer[0][9] = data[3105]; buffer[0][10] = data[3106]; buffer[0][11] = data[3107]; buffer[0][12] = data[3108]; buffer[0][13] = data[3109]; buffer[0][14] = data[3110]; buffer[0][15] = data[3111]; buffer[0][16] = data[3112]; buffer[0][17] = data[3113]; buffer[0][18] = data[3114]; buffer[0][19] = data[3115]; buffer[0][20] = data[3116]; buffer[0][21] = data[3117]; buffer[0][22] = data[3118]; buffer[0][23] = data[3119]; buffer[0][24] = data[3120]; buffer[0][25] = data[3121]; buffer[0][26] = data[3122]; buffer[0][27] = data[3123]; buffer[0][28] = data[3124]; buffer[0][29] = data[3125]; buffer[0][30] = data[3126]; buffer[0][31] = data[3127]; buffer[0][32] = data[3128]; buffer[0][33] = data[3129]; buffer[0][34] = data[3130]; buffer[0][35] = data[3131]; + + } + if (partition == 87) { + buffer[0][0] = data[3132]; buffer[0][1] = data[3133]; buffer[0][2] = data[3134]; buffer[0][3] = data[3135]; buffer[0][4] = data[3136]; buffer[0][5] = data[3137]; buffer[0][6] = data[3138]; buffer[0][7] = data[3139]; buffer[0][8] = data[3140]; buffer[0][9] = data[3141]; buffer[0][10] = data[3142]; buffer[0][11] = data[3143]; buffer[0][12] = data[3144]; buffer[0][13] = data[3145]; buffer[0][14] = data[3146]; buffer[0][15] = data[3147]; buffer[0][16] = data[3148]; buffer[0][17] = data[3149]; buffer[0][18] = data[3150]; buffer[0][19] = data[3151]; buffer[0][20] = data[3152]; buffer[0][21] = data[3153]; buffer[0][22] = data[3154]; buffer[0][23] = data[3155]; buffer[0][24] = data[3156]; buffer[0][25] = data[3157]; buffer[0][26] = data[3158]; buffer[0][27] = data[3159]; buffer[0][28] = data[3160]; buffer[0][29] = data[3161]; buffer[0][30] = data[3162]; buffer[0][31] = data[3163]; buffer[0][32] = data[3164]; buffer[0][33] = data[3165]; buffer[0][34] = data[3166]; buffer[0][35] = data[3167]; + + } + if (partition == 88) { + buffer[0][0] = data[3168]; buffer[0][1] = data[3169]; buffer[0][2] = data[3170]; buffer[0][3] = data[3171]; buffer[0][4] = data[3172]; buffer[0][5] = data[3173]; buffer[0][6] = data[3174]; buffer[0][7] = data[3175]; buffer[0][8] = data[3176]; buffer[0][9] = data[3177]; buffer[0][10] = data[3178]; buffer[0][11] = data[3179]; buffer[0][12] = data[3180]; buffer[0][13] = data[3181]; buffer[0][14] = data[3182]; buffer[0][15] = data[3183]; buffer[0][16] = data[3184]; buffer[0][17] = data[3185]; buffer[0][18] = data[3186]; buffer[0][19] = data[3187]; buffer[0][20] = data[3188]; buffer[0][21] = data[3189]; buffer[0][22] = data[3190]; buffer[0][23] = data[3191]; buffer[0][24] = data[3192]; buffer[0][25] = data[3193]; buffer[0][26] = data[3194]; buffer[0][27] = data[3195]; buffer[0][28] = data[3196]; buffer[0][29] = data[3197]; buffer[0][30] = data[3198]; buffer[0][31] = data[3199]; buffer[0][32] = data[3200]; buffer[0][33] = data[3201]; buffer[0][34] = data[3202]; buffer[0][35] = data[3203]; + + } + if (partition == 89) { + buffer[0][0] = data[3204]; buffer[0][1] = data[3205]; buffer[0][2] = data[3206]; buffer[0][3] = data[3207]; buffer[0][4] = data[3208]; buffer[0][5] = data[3209]; buffer[0][6] = 
data[3210]; buffer[0][7] = data[3211]; buffer[0][8] = data[3212]; buffer[0][9] = data[3213]; buffer[0][10] = data[3214]; buffer[0][11] = data[3215]; buffer[0][12] = data[3216]; buffer[0][13] = data[3217]; buffer[0][14] = data[3218]; buffer[0][15] = data[3219]; buffer[0][16] = data[3220]; buffer[0][17] = data[3221]; buffer[0][18] = data[3222]; buffer[0][19] = data[3223]; buffer[0][20] = data[3224]; buffer[0][21] = data[3225]; buffer[0][22] = data[3226]; buffer[0][23] = data[3227]; buffer[0][24] = data[3228]; buffer[0][25] = data[3229]; buffer[0][26] = data[3230]; buffer[0][27] = data[3231]; buffer[0][28] = data[3232]; buffer[0][29] = data[3233]; buffer[0][30] = data[3234]; buffer[0][31] = data[3235]; buffer[0][32] = data[3236]; buffer[0][33] = data[3237]; buffer[0][34] = data[3238]; buffer[0][35] = data[3239]; + + } + if (partition == 90) { + buffer[0][0] = data[3240]; buffer[0][1] = data[3241]; buffer[0][2] = data[3242]; buffer[0][3] = data[3243]; buffer[0][4] = data[3244]; buffer[0][5] = data[3245]; buffer[0][6] = data[3246]; buffer[0][7] = data[3247]; buffer[0][8] = data[3248]; buffer[0][9] = data[3249]; buffer[0][10] = data[3250]; buffer[0][11] = data[3251]; buffer[0][12] = data[3252]; buffer[0][13] = data[3253]; buffer[0][14] = data[3254]; buffer[0][15] = data[3255]; buffer[0][16] = data[3256]; buffer[0][17] = data[3257]; buffer[0][18] = data[3258]; buffer[0][19] = data[3259]; buffer[0][20] = data[3260]; buffer[0][21] = data[3261]; buffer[0][22] = data[3262]; buffer[0][23] = data[3263]; buffer[0][24] = data[3264]; buffer[0][25] = data[3265]; buffer[0][26] = data[3266]; buffer[0][27] = data[3267]; buffer[0][28] = data[3268]; buffer[0][29] = data[3269]; buffer[0][30] = data[3270]; buffer[0][31] = data[3271]; buffer[0][32] = data[3272]; buffer[0][33] = data[3273]; buffer[0][34] = data[3274]; buffer[0][35] = data[3275]; + + } + if (partition == 91) { + buffer[0][0] = data[3276]; buffer[0][1] = data[3277]; buffer[0][2] = data[3278]; buffer[0][3] = data[3279]; buffer[0][4] = data[3280]; buffer[0][5] = data[3281]; buffer[0][6] = data[3282]; buffer[0][7] = data[3283]; buffer[0][8] = data[3284]; buffer[0][9] = data[3285]; buffer[0][10] = data[3286]; buffer[0][11] = data[3287]; buffer[0][12] = data[3288]; buffer[0][13] = data[3289]; buffer[0][14] = data[3290]; buffer[0][15] = data[3291]; buffer[0][16] = data[3292]; buffer[0][17] = data[3293]; buffer[0][18] = data[3294]; buffer[0][19] = data[3295]; buffer[0][20] = data[3296]; buffer[0][21] = data[3297]; buffer[0][22] = data[3298]; buffer[0][23] = data[3299]; buffer[0][24] = data[3300]; buffer[0][25] = data[3301]; buffer[0][26] = data[3302]; buffer[0][27] = data[3303]; buffer[0][28] = data[3304]; buffer[0][29] = data[3305]; buffer[0][30] = data[3306]; buffer[0][31] = data[3307]; buffer[0][32] = data[3308]; buffer[0][33] = data[3309]; buffer[0][34] = data[3310]; buffer[0][35] = data[3311]; + + } + if (partition == 92) { + buffer[0][0] = data[3312]; buffer[0][1] = data[3313]; buffer[0][2] = data[3314]; buffer[0][3] = data[3315]; buffer[0][4] = data[3316]; buffer[0][5] = data[3317]; buffer[0][6] = data[3318]; buffer[0][7] = data[3319]; buffer[0][8] = data[3320]; buffer[0][9] = data[3321]; buffer[0][10] = data[3322]; buffer[0][11] = data[3323]; buffer[0][12] = data[3324]; buffer[0][13] = data[3325]; buffer[0][14] = data[3326]; buffer[0][15] = data[3327]; buffer[0][16] = data[3328]; buffer[0][17] = data[3329]; buffer[0][18] = data[3330]; buffer[0][19] = data[3331]; buffer[0][20] = data[3332]; buffer[0][21] = data[3333]; buffer[0][22] = data[3334]; 
buffer[0][23] = data[3335]; buffer[0][24] = data[3336]; buffer[0][25] = data[3337]; buffer[0][26] = data[3338]; buffer[0][27] = data[3339]; buffer[0][28] = data[3340]; buffer[0][29] = data[3341]; buffer[0][30] = data[3342]; buffer[0][31] = data[3343]; buffer[0][32] = data[3344]; buffer[0][33] = data[3345]; buffer[0][34] = data[3346]; buffer[0][35] = data[3347]; + + } + if (partition == 93) { + buffer[0][0] = data[3348]; buffer[0][1] = data[3349]; buffer[0][2] = data[3350]; buffer[0][3] = data[3351]; buffer[0][4] = data[3352]; buffer[0][5] = data[3353]; buffer[0][6] = data[3354]; buffer[0][7] = data[3355]; buffer[0][8] = data[3356]; buffer[0][9] = data[3357]; buffer[0][10] = data[3358]; buffer[0][11] = data[3359]; buffer[0][12] = data[3360]; buffer[0][13] = data[3361]; buffer[0][14] = data[3362]; buffer[0][15] = data[3363]; buffer[0][16] = data[3364]; buffer[0][17] = data[3365]; buffer[0][18] = data[3366]; buffer[0][19] = data[3367]; buffer[0][20] = data[3368]; buffer[0][21] = data[3369]; buffer[0][22] = data[3370]; buffer[0][23] = data[3371]; buffer[0][24] = data[3372]; buffer[0][25] = data[3373]; buffer[0][26] = data[3374]; buffer[0][27] = data[3375]; buffer[0][28] = data[3376]; buffer[0][29] = data[3377]; buffer[0][30] = data[3378]; buffer[0][31] = data[3379]; buffer[0][32] = data[3380]; buffer[0][33] = data[3381]; buffer[0][34] = data[3382]; buffer[0][35] = data[3383]; + + } + if (partition == 94) { + buffer[0][0] = data[3384]; buffer[0][1] = data[3385]; buffer[0][2] = data[3386]; buffer[0][3] = data[3387]; buffer[0][4] = data[3388]; buffer[0][5] = data[3389]; buffer[0][6] = data[3390]; buffer[0][7] = data[3391]; buffer[0][8] = data[3392]; buffer[0][9] = data[3393]; buffer[0][10] = data[3394]; buffer[0][11] = data[3395]; buffer[0][12] = data[3396]; buffer[0][13] = data[3397]; buffer[0][14] = data[3398]; buffer[0][15] = data[3399]; buffer[0][16] = data[3400]; buffer[0][17] = data[3401]; buffer[0][18] = data[3402]; buffer[0][19] = data[3403]; buffer[0][20] = data[3404]; buffer[0][21] = data[3405]; buffer[0][22] = data[3406]; buffer[0][23] = data[3407]; buffer[0][24] = data[3408]; buffer[0][25] = data[3409]; buffer[0][26] = data[3410]; buffer[0][27] = data[3411]; buffer[0][28] = data[3412]; buffer[0][29] = data[3413]; buffer[0][30] = data[3414]; buffer[0][31] = data[3415]; buffer[0][32] = data[3416]; buffer[0][33] = data[3417]; buffer[0][34] = data[3418]; buffer[0][35] = data[3419]; + + } + if (partition == 95) { + buffer[0][0] = data[3420]; buffer[0][1] = data[3421]; buffer[0][2] = data[3422]; buffer[0][3] = data[3423]; buffer[0][4] = data[3424]; buffer[0][5] = data[3425]; buffer[0][6] = data[3426]; buffer[0][7] = data[3427]; buffer[0][8] = data[3428]; buffer[0][9] = data[3429]; buffer[0][10] = data[3430]; buffer[0][11] = data[3431]; buffer[0][12] = data[3432]; buffer[0][13] = data[3433]; buffer[0][14] = data[3434]; buffer[0][15] = data[3435]; buffer[0][16] = data[3436]; buffer[0][17] = data[3437]; buffer[0][18] = data[3438]; buffer[0][19] = data[3439]; buffer[0][20] = data[3440]; buffer[0][21] = data[3441]; buffer[0][22] = data[3442]; buffer[0][23] = data[3443]; buffer[0][24] = data[3444]; buffer[0][25] = data[3445]; buffer[0][26] = data[3446]; buffer[0][27] = data[3447]; buffer[0][28] = data[3448]; buffer[0][29] = data[3449]; buffer[0][30] = data[3450]; buffer[0][31] = data[3451]; buffer[0][32] = data[3452]; buffer[0][33] = data[3453]; buffer[0][34] = data[3454]; buffer[0][35] = data[3455]; + + } + if (partition == 96) { + buffer[0][0] = data[3456]; buffer[0][1] = data[3457]; 
buffer[0][2] = data[3458]; buffer[0][3] = data[3459]; buffer[0][4] = data[3460]; buffer[0][5] = data[3461]; buffer[0][6] = data[3462]; buffer[0][7] = data[3463]; buffer[0][8] = data[3464]; buffer[0][9] = data[3465]; buffer[0][10] = data[3466]; buffer[0][11] = data[3467]; buffer[0][12] = data[3468]; buffer[0][13] = data[3469]; buffer[0][14] = data[3470]; buffer[0][15] = data[3471]; buffer[0][16] = data[3472]; buffer[0][17] = data[3473]; buffer[0][18] = data[3474]; buffer[0][19] = data[3475]; buffer[0][20] = data[3476]; buffer[0][21] = data[3477]; buffer[0][22] = data[3478]; buffer[0][23] = data[3479]; buffer[0][24] = data[3480]; buffer[0][25] = data[3481]; buffer[0][26] = data[3482]; buffer[0][27] = data[3483]; buffer[0][28] = data[3484]; buffer[0][29] = data[3485]; buffer[0][30] = data[3486]; buffer[0][31] = data[3487]; buffer[0][32] = data[3488]; buffer[0][33] = data[3489]; buffer[0][34] = data[3490]; buffer[0][35] = data[3491]; + + } + if (partition == 97) { + buffer[0][0] = data[3492]; buffer[0][1] = data[3493]; buffer[0][2] = data[3494]; buffer[0][3] = data[3495]; buffer[0][4] = data[3496]; buffer[0][5] = data[3497]; buffer[0][6] = data[3498]; buffer[0][7] = data[3499]; buffer[0][8] = data[3500]; buffer[0][9] = data[3501]; buffer[0][10] = data[3502]; buffer[0][11] = data[3503]; buffer[0][12] = data[3504]; buffer[0][13] = data[3505]; buffer[0][14] = data[3506]; buffer[0][15] = data[3507]; buffer[0][16] = data[3508]; buffer[0][17] = data[3509]; buffer[0][18] = data[3510]; buffer[0][19] = data[3511]; buffer[0][20] = data[3512]; buffer[0][21] = data[3513]; buffer[0][22] = data[3514]; buffer[0][23] = data[3515]; buffer[0][24] = data[3516]; buffer[0][25] = data[3517]; buffer[0][26] = data[3518]; buffer[0][27] = data[3519]; buffer[0][28] = data[3520]; buffer[0][29] = data[3521]; buffer[0][30] = data[3522]; buffer[0][31] = data[3523]; buffer[0][32] = data[3524]; buffer[0][33] = data[3525]; buffer[0][34] = data[3526]; buffer[0][35] = data[3527]; + + } + if (partition == 98) { + buffer[0][0] = data[3528]; buffer[0][1] = data[3529]; buffer[0][2] = data[3530]; buffer[0][3] = data[3531]; buffer[0][4] = data[3532]; buffer[0][5] = data[3533]; buffer[0][6] = data[3534]; buffer[0][7] = data[3535]; buffer[0][8] = data[3536]; buffer[0][9] = data[3537]; buffer[0][10] = data[3538]; buffer[0][11] = data[3539]; buffer[0][12] = data[3540]; buffer[0][13] = data[3541]; buffer[0][14] = data[3542]; buffer[0][15] = data[3543]; buffer[0][16] = data[3544]; buffer[0][17] = data[3545]; buffer[0][18] = data[3546]; buffer[0][19] = data[3547]; buffer[0][20] = data[3548]; buffer[0][21] = data[3549]; buffer[0][22] = data[3550]; buffer[0][23] = data[3551]; buffer[0][24] = data[3552]; buffer[0][25] = data[3553]; buffer[0][26] = data[3554]; buffer[0][27] = data[3555]; buffer[0][28] = data[3556]; buffer[0][29] = data[3557]; buffer[0][30] = data[3558]; buffer[0][31] = data[3559]; buffer[0][32] = data[3560]; buffer[0][33] = data[3561]; buffer[0][34] = data[3562]; buffer[0][35] = data[3563]; + + } + if (partition == 99) { + buffer[0][0] = data[3564]; buffer[0][1] = data[3565]; buffer[0][2] = data[3566]; buffer[0][3] = data[3567]; buffer[0][4] = data[3568]; buffer[0][5] = data[3569]; buffer[0][6] = data[3570]; buffer[0][7] = data[3571]; buffer[0][8] = data[3572]; buffer[0][9] = data[3573]; buffer[0][10] = data[3574]; buffer[0][11] = data[3575]; buffer[0][12] = data[3576]; buffer[0][13] = data[3577]; buffer[0][14] = data[3578]; buffer[0][15] = data[3579]; buffer[0][16] = data[3580]; buffer[0][17] = data[3581]; buffer[0][18] = 
data[3582]; buffer[0][19] = data[3583]; buffer[0][20] = data[3584]; buffer[0][21] = data[3585]; buffer[0][22] = data[3586]; buffer[0][23] = data[3587]; buffer[0][24] = data[3588]; buffer[0][25] = data[3589]; buffer[0][26] = data[3590]; buffer[0][27] = data[3591]; buffer[0][28] = data[3592]; buffer[0][29] = data[3593]; buffer[0][30] = data[3594]; buffer[0][31] = data[3595]; buffer[0][32] = data[3596]; buffer[0][33] = data[3597]; buffer[0][34] = data[3598]; buffer[0][35] = data[3599];
+
+        }
+    }
+};
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_common.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_common.h
new file mode 100644
index 00000000..e942a1dc
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_common.h
@@ -0,0 +1,76 @@
+#ifndef NNET_COMMON_H_
+#define NNET_COMMON_H_
+
+#include "ap_fixed.h"
+
+// This is a substitute for "ceil(n/(float)d)".
+#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
+#define MIN(n, d) (n > d ? d : n)
+#define MAX(n, d) (n > d ? n : d)
+
+#define STRINGIFY(x) #x
+#define EXPAND_STRING(x) STRINGIFY(x)
+
+#ifndef __VITIS_HLS__
+#define DATA_PACK_TXT HLS DATA_PACK variable =
+#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable
+#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable)))
+#else
+#define PRAGMA_DATA_PACK(variable)
+#endif
+
+namespace nnet {
+
+// Common type definitions
+enum io_type { io_parallel = 0, io_stream };
+enum strategy { latency, resource };
+enum class conv_implementation { linebuffer = 0, encoded = 1, pointwise = 2 };
+
+/* ---
+ * Balanced tree reduce implementation.
+ * For use in scenarios where Vivado cannot expression balance.
+ * Reduces an array of inputs to a single value using the template binary operator 'Op',
+ * for example summing all elements with Op_add, or finding the maximum with Op_max.
+ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
+ * before applying and accumulate the result over the rolled dimension.
+ * --- */
+template <class T, int N, class Op> T reduce(const T *x, Op op) {
+    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
+    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
+    if (N == 1) {
+        return x[0];
+    }
+    if (N == 2) {
+        return op(x[0], x[1]);
+    }
+    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
+}
+
+template <class T> class Op_add {
+  public:
+    T operator()(T a, T b) { return a + b; }
+};
+
+template <class T> class Op_and {
+  public:
+    T operator()(T a, T b) { return a && b; }
+};
+
+template <class T> class Op_or {
+  public:
+    T operator()(T a, T b) { return a || b; }
+};
+
+template <class T> class Op_max {
+  public:
+    T operator()(T a, T b) { return a >= b ? a : b; }
+};
+
+template <class T> class Op_min {
+  public:
+    T operator()(T a, T b) { return a <= b ? a : b; }
+};
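+// [Editor's illustrative sketch, not part of the generated header.] reduce() builds a
+// balanced binary tree, so summing a fully partitioned 8-element array costs 3 adder
+// levels instead of a 7-deep accumulation chain. Types and sizes here are hypothetical:
+//
+//     ap_fixed<32, 16> partial[8];
+//     #pragma HLS ARRAY_PARTITION variable=partial complete
+//     Op_add<ap_fixed<32, 16>> op_add;
+//     ap_fixed<32, 16> total = reduce<ap_fixed<32, 16>, 8, Op_add<ap_fixed<32, 16>>>(partial, op_add);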
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d.h
new file mode 100644
index 00000000..0f2e89ac
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d.h
@@ -0,0 +1,76 @@
+#ifndef NNET_CONV1D_H_
+#define NNET_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_latency.h"
+#include "nnet_conv1d_resource.h"
+#include <cstdlib>
+
+namespace nnet {
+
+struct conv1d_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+
+    // Convolutional parameters
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const unsigned in_width = 10;
+    static const unsigned n_chan = 0;
+    static const unsigned filt_width = 1;
+    static const unsigned kernel_size = filt_width;
+    static const unsigned n_filt = 1;
+    static const unsigned stride_width = 1;
+    static const unsigned dilation = 1;
+    static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1
+
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0; // not used yet
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    #pragma HLS INLINE region
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+
+    #pragma HLS INLINE region
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        if (CONFIG_T::implementation == conv_implementation::pointwise) {
+            // Use pointwise unrolled implementation
+            if (CONFIG_T::reuse_factor > 1 && CONFIG_T::reuse_factor <= 120) {
+                pointwise_conv_1d_latency_cl_split_by_rf<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+            } else {
+                assert(CONFIG_T::reuse_factor == 1);
+                pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+            }
+        } else {
+            // Use standard unrolled implementation
+            conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        }
+    } else {
+        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+}
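+// [Editor's note, illustrative only.] The dispatch above is resolved at compile time by
+// the generated layer config; hypothetical values sketching the split-by-rf path:
+//
+//     struct config_pw : nnet::conv1d_config {
+//         static const unsigned reuse_factor = 4; // 1 < rf <= 120 selects the _split_by_rf variant
+//         static const nnet::conv_implementation implementation = nnet::conv_implementation::pointwise;
+//     };
+//
+// With reuse_factor == 1 the plain unrolled pointwise_conv_1d_latency_cl is called instead.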
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_latency.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_latency.h
new file mode 100644
index 00000000..aabc8698
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_latency.h
@@ -0,0 +1,439 @@
+#ifndef NNET_CONV1D_LATENCY_H_
+#define NNET_CONV1D_LATENCY_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include <cstdlib>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                        typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan;
+    constexpr unsigned mult_n_out = CONFIG_T::n_filt;
+
+    data_T data_buf[CONFIG_T::n_pixels][mult_n_in];
+    #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0
+
+    typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=mult complete
+
+    typename CONFIG_T::accum_t acc[mult_n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    #pragma HLS ARRAY_PARTITION variable=weights complete
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    // Limit multipliers to control parallelization
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit
+
+PartitionLoop:
+    for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind
+
+        CONFIG_T::template fill_buffer<data_T, CONFIG_T>::fill_buffer(data, data_buf, i_part);
+
+    PixelLoop:
+        for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) {
+            #pragma HLS UNROLL
+
+            data_T cache;
+
+            // Do the matrix-multiply
+        Product1:
+            for (int i_in = 0; i_in < mult_n_in; i_in++) {
+                #pragma HLS UNROLL
+                cache = data_buf[i_pxl][i_in];
+            Product2:
+                for (int i_out = 0; i_out < mult_n_out; i_out++) {
+                    #pragma HLS UNROLL
+                    mult[i_in * mult_n_out + i_out] =
+                        CONFIG_T::mult_config::template product<data_T, typename CONFIG_T::mult_config::weight_t>::product(
+                            cache, weights[i_in * mult_n_out + i_out]);
+                }
+            }
+
+            // Initialize accumulator with input biases
+        ResetAccum:
+            for (int i_acc = 0; i_acc < mult_n_out; i_acc++) {
+                #pragma HLS UNROLL
+                acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc];
+            }
+
+            // Accumulate multiplication result
+        Accum1:
+            for (int i_in = 0; i_in < mult_n_in; i_in++) {
+                #pragma HLS UNROLL
+            Accum2:
+                for (int i_out = 0; i_out < mult_n_out; i_out++) {
+                    #pragma HLS UNROLL
+                    acc[i_out] += mult[i_in * mult_n_out + i_out];
+                }
+            }
+
+            // Cast to "res_t" type
+        Result:
+            for (int i_res = 0; i_res < mult_n_out; i_res++) {
+                #pragma HLS UNROLL
+                *(res++) = cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[i_res]);
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
+                                  typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                                  typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+
+    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
+    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
+
+    #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
+    #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=weights,biases
+
+    // Parallel mode
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    #pragma HLS ARRAY_PARTITION variable=weights complete dim=0
+    #pragma HLS ARRAY_PARTITION variable=biases complete dim=0
+
+    // Limit multipliers to control parallelization
+    int multiplier_limit =
+        ceil((float(CONFIG_T::out_width) / float(CONFIG_T::reuse_factor) * CONFIG_T::n_filt * CONFIG_T::n_chan) /
+             float(CONFIG_T::reuse_factor));
+#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit
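+    // [Editor's worked example with hypothetical shapes.] For out_width = 100, n_filt = 2,
+    // n_chan = 8 and reuse_factor = 2, each call handles out_width / rf = 50 outputs, i.e.
+    // 50 * 2 * 8 = 800 products, and the bound is ceil(800 / 2) = 400 multiplier instances:
+    // with PIPELINE II = reuse_factor, each multiplier is time-shared over 2 cycles.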
+
+// Convolve, saving all multiplication results to accumulate later
+ConvOut:
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    ConvFilt:
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+        ConvChan:
+            for (int cc = 0; cc < CONFIG_T::n_chan; cc++) {
+                #pragma HLS UNROLL
+                int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc;
+                int index_weight = cc * CONFIG_T::n_filt + ff;
+                int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc;
+
+                if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left ||
+                    (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
+                    mult[index_mult] = 0;
+                } else {
+                    mult[index_mult] = data[index_data] * weights[index_weight];
+                }
+            } // end channel loop
+        } // end filter loop
+    } // end output loop
+
+    // Initialize accumulator with input biases
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+            #pragma HLS UNROLL
+            acc[ii][ff] = biases[ff];
+        }
+    }
+
+// Accumulate multiplication result
+AccumOut:
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    AccumFilt:
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+            // Do "dot product" sum within filter and sum over channels
+        AccumChan:
+            for (int cc = 0; cc < CONFIG_T::n_chan; cc++) {
+                int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc;
+                acc[ii][ff] += mult[index_mult];
+            } // end channel loop
+        } // end filter loop
+    } // end output loop
+
+    // Cast to "res_t" type
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+        for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+            #pragma HLS UNROLL
+            res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]);
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_latency_cl_split_by_rf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                                              res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                                              typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                                              typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+    data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
+    #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+    res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];
+    #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+RFInputLoop:
+    for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {
+        #pragma HLS UNROLL
+    InnerInputLoop:
+        for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {
+            #pragma HLS UNROLL
+            data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];
+        }
+    }
+
+    pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[0], res_tmp[0], weights, biases);
+    pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[1], res_tmp[1], weights, biases);
+    if (CONFIG_T::reuse_factor > 2)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[2], res_tmp[2], weights, biases);
+    if (CONFIG_T::reuse_factor > 3)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[3], res_tmp[3], weights, biases);
+    if (CONFIG_T::reuse_factor > 4)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[4], res_tmp[4], weights, biases);
+    if (CONFIG_T::reuse_factor > 5)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[5], res_tmp[5], weights, biases);
+    if (CONFIG_T::reuse_factor > 6)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[6], res_tmp[6], weights, biases);
+    if (CONFIG_T::reuse_factor > 7)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[7], res_tmp[7], weights, biases);
+    if (CONFIG_T::reuse_factor > 8)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[8], res_tmp[8], weights, biases);
+    if (CONFIG_T::reuse_factor > 9)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[9], res_tmp[9], weights, biases);
+    if (CONFIG_T::reuse_factor > 10)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[10], res_tmp[10], weights, biases);
+    if (CONFIG_T::reuse_factor > 11)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[11], res_tmp[11], weights, biases);
+    if (CONFIG_T::reuse_factor > 12)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[12], res_tmp[12], weights, biases);
+    if (CONFIG_T::reuse_factor > 13)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[13], res_tmp[13], weights, biases);
+    if (CONFIG_T::reuse_factor > 14)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[14], res_tmp[14], weights, biases);
+    if (CONFIG_T::reuse_factor > 15)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[15], res_tmp[15], weights, biases);
+    if (CONFIG_T::reuse_factor > 16)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[16], res_tmp[16], weights, biases);
+    if (CONFIG_T::reuse_factor > 17)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[17], res_tmp[17], weights, biases);
+    if (CONFIG_T::reuse_factor > 18)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[18], res_tmp[18], weights, biases);
+    if (CONFIG_T::reuse_factor > 19)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[19], res_tmp[19], weights, biases);
+    if (CONFIG_T::reuse_factor > 20)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[20], res_tmp[20], weights, biases);
+    if (CONFIG_T::reuse_factor > 21)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[21], res_tmp[21], weights, biases);
+    if (CONFIG_T::reuse_factor > 22)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[22], res_tmp[22], weights, biases);
+    if (CONFIG_T::reuse_factor > 23)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[23], res_tmp[23], weights, biases);
+    if (CONFIG_T::reuse_factor > 24)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[24], res_tmp[24], weights, biases);
+    if (CONFIG_T::reuse_factor > 25)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[25], res_tmp[25], weights, biases);
+    if (CONFIG_T::reuse_factor > 26)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[26], res_tmp[26], weights, biases);
+    if (CONFIG_T::reuse_factor > 27)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[27], res_tmp[27], weights, biases);
+    if (CONFIG_T::reuse_factor > 28)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[28], res_tmp[28], weights, biases);
+    if (CONFIG_T::reuse_factor > 29)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[29], res_tmp[29], weights, biases);
+    if (CONFIG_T::reuse_factor > 30)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[30], res_tmp[30], weights, biases);
+    if (CONFIG_T::reuse_factor > 31)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[31], res_tmp[31], weights, biases);
+    if (CONFIG_T::reuse_factor > 32)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[32], res_tmp[32], weights, biases);
+    if (CONFIG_T::reuse_factor > 33)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[33], res_tmp[33], weights, biases);
+    if (CONFIG_T::reuse_factor > 34)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[34], res_tmp[34], weights, biases);
+    if (CONFIG_T::reuse_factor > 35)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[35], res_tmp[35], weights, biases);
+    if (CONFIG_T::reuse_factor > 36)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[36], res_tmp[36], weights, biases);
+    if (CONFIG_T::reuse_factor > 37)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[37], res_tmp[37], weights, biases);
+    if (CONFIG_T::reuse_factor > 38)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[38], res_tmp[38], weights, biases);
+    if (CONFIG_T::reuse_factor > 39)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[39], res_tmp[39], weights, biases);
+    if (CONFIG_T::reuse_factor > 40)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[40], res_tmp[40], weights, biases);
+    if (CONFIG_T::reuse_factor > 41)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[41], res_tmp[41], weights, biases);
+    if (CONFIG_T::reuse_factor > 42)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[42], res_tmp[42], weights, biases);
+    if (CONFIG_T::reuse_factor > 43)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[43], res_tmp[43], weights, biases);
+    if (CONFIG_T::reuse_factor > 44)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[44], res_tmp[44], weights, biases);
+    if (CONFIG_T::reuse_factor > 45)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[45], res_tmp[45], weights, biases);
+    if (CONFIG_T::reuse_factor > 46)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[46], res_tmp[46], weights, biases);
+    if (CONFIG_T::reuse_factor > 47)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[47], res_tmp[47], weights, biases);
+    if (CONFIG_T::reuse_factor > 48)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[48], res_tmp[48], weights, biases);
+    if (CONFIG_T::reuse_factor > 49)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[49], res_tmp[49], weights, biases);
+    if (CONFIG_T::reuse_factor > 50)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[50], res_tmp[50], weights, biases);
+    if (CONFIG_T::reuse_factor > 51)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[51], res_tmp[51], weights, biases);
+    if (CONFIG_T::reuse_factor > 52)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[52], res_tmp[52], weights, biases);
+    if (CONFIG_T::reuse_factor > 53)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[53], res_tmp[53], weights, biases);
+    if (CONFIG_T::reuse_factor > 54)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[54], res_tmp[54], weights, biases);
+    if (CONFIG_T::reuse_factor > 55)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[55], res_tmp[55], weights, biases);
+    if (CONFIG_T::reuse_factor > 56)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[56], res_tmp[56], weights, biases);
+    if (CONFIG_T::reuse_factor > 57)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[57], res_tmp[57], weights, biases);
+    if (CONFIG_T::reuse_factor > 58)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[58], res_tmp[58], weights, biases);
+    if (CONFIG_T::reuse_factor > 59)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[59], res_tmp[59], weights, biases);
+    if (CONFIG_T::reuse_factor > 60)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[60], res_tmp[60], weights, biases);
+    if (CONFIG_T::reuse_factor > 61)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[61], res_tmp[61], weights, biases);
+    if (CONFIG_T::reuse_factor > 62)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[62], res_tmp[62], weights, biases);
+    if (CONFIG_T::reuse_factor > 63)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[63], res_tmp[63], weights, biases);
+    if (CONFIG_T::reuse_factor > 64)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[64], res_tmp[64], weights, biases);
+    if (CONFIG_T::reuse_factor > 65)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[65], res_tmp[65], weights, biases);
+    if (CONFIG_T::reuse_factor > 66)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[66], res_tmp[66], weights, biases);
+    if (CONFIG_T::reuse_factor > 67)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[67], res_tmp[67], weights, biases);
+    if (CONFIG_T::reuse_factor > 68)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[68], res_tmp[68], weights, biases);
+    if (CONFIG_T::reuse_factor > 69)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[69], res_tmp[69], weights, biases);
+    if (CONFIG_T::reuse_factor > 70)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[70], res_tmp[70], weights, biases);
+    if (CONFIG_T::reuse_factor > 71)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[71], res_tmp[71], weights, biases);
+    if (CONFIG_T::reuse_factor > 72)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[72], res_tmp[72], weights, biases);
+    if (CONFIG_T::reuse_factor > 73)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[73], res_tmp[73], weights, biases);
+    if (CONFIG_T::reuse_factor > 74)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[74], res_tmp[74], weights, biases);
+    if (CONFIG_T::reuse_factor > 75)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[75], res_tmp[75], weights, biases);
+    if (CONFIG_T::reuse_factor > 76)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[76], res_tmp[76], weights, biases);
+    if (CONFIG_T::reuse_factor > 77)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[77], res_tmp[77], weights, biases);
+    if (CONFIG_T::reuse_factor > 78)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[78], res_tmp[78], weights, biases);
+    if (CONFIG_T::reuse_factor > 79)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[79], res_tmp[79], weights, biases);
+    if (CONFIG_T::reuse_factor > 80)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[80], res_tmp[80], weights, biases);
+    if (CONFIG_T::reuse_factor > 81)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[81], res_tmp[81], weights, biases);
+    if (CONFIG_T::reuse_factor > 82)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[82], res_tmp[82], weights, biases);
+    if (CONFIG_T::reuse_factor > 83)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[83], res_tmp[83], weights, biases);
+    if (CONFIG_T::reuse_factor > 84)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[84], res_tmp[84], weights, biases);
+    if (CONFIG_T::reuse_factor > 85)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[85], res_tmp[85], weights, biases);
+    if (CONFIG_T::reuse_factor > 86)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[86], res_tmp[86], weights, biases);
+    if (CONFIG_T::reuse_factor > 87)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[87], res_tmp[87], weights, biases);
+    if (CONFIG_T::reuse_factor > 88)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[88], res_tmp[88], weights, biases);
+    if (CONFIG_T::reuse_factor > 89)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[89], res_tmp[89], weights, biases);
+    if (CONFIG_T::reuse_factor > 90)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[90], res_tmp[90], weights, biases);
+    if (CONFIG_T::reuse_factor > 91)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[91], res_tmp[91], weights, biases);
+    if (CONFIG_T::reuse_factor > 92)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[92], res_tmp[92], weights, biases);
+    if (CONFIG_T::reuse_factor > 93)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[93], res_tmp[93], weights, biases);
+    if (CONFIG_T::reuse_factor > 94)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[94], res_tmp[94], weights, biases);
+    if (CONFIG_T::reuse_factor > 95)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[95], res_tmp[95], weights, biases);
+    if (CONFIG_T::reuse_factor > 96)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[96], res_tmp[96], weights, biases);
+    if (CONFIG_T::reuse_factor > 97)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[97], res_tmp[97], weights, biases);
+    if (CONFIG_T::reuse_factor > 98)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[98], res_tmp[98], weights, biases);
+    if (CONFIG_T::reuse_factor > 99)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[99], res_tmp[99], weights, biases);
+    if (CONFIG_T::reuse_factor > 100)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[100], res_tmp[100], weights, biases);
+    if (CONFIG_T::reuse_factor > 101)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[101], res_tmp[101], weights, biases);
+    if (CONFIG_T::reuse_factor > 102)
+        pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[102], res_tmp[102], weights, biases);
biases); + if (CONFIG_T::reuse_factor > 103) + pointwise_conv_1d_latency_cl(data_tmp[103], res_tmp[103], weights, biases); + if (CONFIG_T::reuse_factor > 104) + pointwise_conv_1d_latency_cl(data_tmp[104], res_tmp[104], weights, biases); + if (CONFIG_T::reuse_factor > 105) + pointwise_conv_1d_latency_cl(data_tmp[105], res_tmp[105], weights, biases); + if (CONFIG_T::reuse_factor > 106) + pointwise_conv_1d_latency_cl(data_tmp[106], res_tmp[106], weights, biases); + if (CONFIG_T::reuse_factor > 107) + pointwise_conv_1d_latency_cl(data_tmp[107], res_tmp[107], weights, biases); + if (CONFIG_T::reuse_factor > 108) + pointwise_conv_1d_latency_cl(data_tmp[108], res_tmp[108], weights, biases); + if (CONFIG_T::reuse_factor > 109) + pointwise_conv_1d_latency_cl(data_tmp[109], res_tmp[109], weights, biases); + if (CONFIG_T::reuse_factor > 110) + pointwise_conv_1d_latency_cl(data_tmp[110], res_tmp[110], weights, biases); + if (CONFIG_T::reuse_factor > 111) + pointwise_conv_1d_latency_cl(data_tmp[111], res_tmp[111], weights, biases); + if (CONFIG_T::reuse_factor > 112) + pointwise_conv_1d_latency_cl(data_tmp[112], res_tmp[112], weights, biases); + if (CONFIG_T::reuse_factor > 113) + pointwise_conv_1d_latency_cl(data_tmp[113], res_tmp[113], weights, biases); + if (CONFIG_T::reuse_factor > 114) + pointwise_conv_1d_latency_cl(data_tmp[114], res_tmp[114], weights, biases); + if (CONFIG_T::reuse_factor > 115) + pointwise_conv_1d_latency_cl(data_tmp[115], res_tmp[115], weights, biases); + if (CONFIG_T::reuse_factor > 116) + pointwise_conv_1d_latency_cl(data_tmp[116], res_tmp[116], weights, biases); + if (CONFIG_T::reuse_factor > 117) + pointwise_conv_1d_latency_cl(data_tmp[117], res_tmp[117], weights, biases); + if (CONFIG_T::reuse_factor > 118) + pointwise_conv_1d_latency_cl(data_tmp[118], res_tmp[118], weights, biases); + if (CONFIG_T::reuse_factor > 119) + pointwise_conv_1d_latency_cl(data_tmp[119], res_tmp[119], weights, biases); + +RFOutputLoop: + for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) { + #pragma HLS UNROLL + InnerOutputLoop: + for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) { + #pragma HLS UNROLL + res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_resource.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 00000000..6e70158a --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,103 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && 
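+ // The guard admits reuse factors for which block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, RF) splits evenly across the n_filt outputs, or RF >= mult_n_in. Illustrative sizes (not this model's): mult_n_in = 10, mult_n_out = 16 -> RF = 2 gives block_factor = 80, 80 % 16 == 0 (allowed); RF = 3 gives block_factor = 54, 54 % 16 != 0 and 3 < 10 (rejected):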
+ "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_width * CONFIG_T::n_chan) && + "This function is correct only for RF <= FILT_WIDTH * N_CHAN"); + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + +PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_w = i_rf; + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product( + data_buf[i_pxl][i_in], weights[i_w])); + } + + // Increment i_w + i_w += CONFIG_T::reuse_factor; + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 00000000..b23c330c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,89 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS UNROLL + unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = 
CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 00000000..71a88f44 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,75 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 
0; // not used yet +}; + +template +void conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + #pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_latency.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_latency.h new file mode 100644 index 00000000..5114af78 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_latency.h @@ -0,0 +1,89 @@ +#ifndef NNET_CONV2D_LATENCY_H_ +#define NNET_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + Product2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template 
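+ // the product functor (nnet_mult.h) selects the multiply implementation for the data/weight type pair, e.g. a true multiplier for fixed-point weights or shift/negate variants for binary, ternary, and power-of-two weights: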
product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_resource.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 00000000..eb7e18e4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,105 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + constexpr unsigned block_factor = DIV_ROUNDUP(mult_n_in * mult_n_out, CONFIG_T::reuse_factor); + + constexpr unsigned multscale = block_factor / mult_n_out; + + assert((block_factor % mult_n_out == 0 || CONFIG_T::reuse_factor >= mult_n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor <= CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan) && + "This function is correct only for RF <= FILT_HEIGHT * FILT_WIDTH * N_CHAN"); + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_pixels][mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + +PartitionLoop: + for (unsigned i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS UNROLL // We don't want this loop unrolled + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelInitAccumLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + InitAccumLoop: + for (unsigned i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_pxl][i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + } + + ReuseLoop: + for (unsigned i_rf = 0; i_rf < CONFIG_T::reuse_factor; i_rf++) { + #pragma HLS PIPELINE II=1 rewind + + unsigned i_w = i_rf; + unsigned i_in = i_rf; + unsigned i_out = 0; + unsigned i_acc = 0; + + MultLoop: + for (unsigned i_blk = 0; i_blk < block_factor; i_blk++) { + #pragma HLS UNROLL + + PixelMultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + 
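+ // Index walk per reuse iteration: i_w and i_in both advance by reuse_factor; i_in wraps back to i_rf once it passes mult_n_in, and i_out steps to the next output every multscale blocks, so each accumulator collects its complete dot product across the reuse loop.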
#pragma HLS UNROLL + + acc[i_pxl][i_out] += static_cast( + CONFIG_T::mult_config::template product::product( + data_buf[i_pxl][i_in], weights[i_w])); + } + + // Increment i_w + i_w += CONFIG_T::reuse_factor; + // Increment i_in + i_in += CONFIG_T::reuse_factor; + if (i_in >= mult_n_in) { + i_in = i_rf; + } + // Increment i_out + if (i_acc + 1 >= multscale) { + i_acc = 0; + i_out++; + } else { + i_acc++; + } + } + } + + PixelResultLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + // Cast to "res_t" type + ResultLoop: + for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_pxl][i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 00000000..8a4fb6be --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,112 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ap_shift_reg.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_2d(const unsigned h_idx, const unsigned w_idx, + ap_uint *pixel_idx) { + const unsigned sh_idx = CONFIG_T::template scale_index_height::scale_index(h_idx); + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS UNROLL + + unsigned sw_idx = CONFIG_T::template scale_index_width::scale_index(wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sh_idx * CONFIG_T::min_width + sw_idx]; + } +} + +template +void conv_2d_encoded_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + hls::stream data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } + } +} + +// Line Buffer +template +void conv_2d_buffer_cl( + hls::stream &data, hls::stream &res, + typename 
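+ // Line-buffer strategy: pixels stream in once and the previous filt_height - 1 rows are held in ap_shift_reg line buffers, so a full filt_height x filt_width window is available at every output position.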
CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv_stream.h new file mode 100644 index 00000000..b763938c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_conv_stream.h @@ -0,0 +1,394 @@ +#ifndef NNET_CONV_STREAM_H_ +#define NNET_CONV_STREAM_H_ + +#include "ap_shift_reg.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +// ************************************************* +// Encoded Implementation (Vlad's) +// ************************************************* +template unsigned scale_index_K_gte_S(const unsigned idx) { + #pragma HLS INLINE + + if (idx < K - S) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + constexpr unsigned sW = (DIV_ROUNDUP(K, S) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= K - S) { + return sW - r; + } + + return K - S + (idx - (K - S)) % S; +} + +template unsigned scale_index_K_lt_S(const unsigned idx) { + #pragma HLS INLINE + + if (idx < S - K) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + constexpr unsigned sW = (DIV_ROUNDUP(S, K) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= S - K) { + return sW - r; + } + + return S - K + (idx - (S - K)) % S; +} + +template class scale_index_regular { + public: + static unsigned scale_index(const unsigned idx) { + #pragma HLS INLINE + + if (K >= S) { + return scale_index_K_gte_S(idx); + } else { + return scale_index_K_lt_S(idx); + } + } +}; + +template class scale_index_unscaled { + public: + static unsigned scale_index(const unsigned idx) { + #pragma HLS INLINE + return 
idx; + } +}; + +template +void mult_buffer(hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, hls::stream &res_stream, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = data complete + typename res_T::value_type res[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = res complete + +InitData: + for (int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) { + #pragma HLS UNROLL + data[id] = data_window[id].read(); + } + + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + data, res, weights, biases); + } else { + dense_resource( + data, res, weights, biases); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { + #pragma HLS UNROLL + if (res_T::size / CONFIG_T::n_filt == 1) { + res_pack[jj] = res[jj]; + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + jj] = res[jj]; + } + } + + if (res_T::size / CONFIG_T::n_filt == 1) { + res_stream.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res_stream.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } +} + +template +void compute_output_encoded(const data_T &in_elem, + hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + hls::stream &res, res_T &res_pack, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt], ap_uint *pixel_idx) { + #pragma HLS INLINE + +MultLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS PIPELINE II = CONFIG_T::reuse_factor + CopyDataFilt: + for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) { + #pragma HLS UNROLL + CopyDataChan: + for (unsigned c = 0; c < CONFIG_T::n_chan; c++) { + #pragma HLS UNROLL + if (pixel_idx[p][f]) + data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]); + } + } + if (pixel_idx[p][CONFIG_T::kernel_size - 1]) { + mult_buffer(data_window, res_pack, res, outputs_ready, weights, biases); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void kernel_shift_1d(const data_T &in_elem, + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { + #pragma HLS inline + + // Shift kernel_window by one step to the left (manual shift operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + #pragma HLS PIPELINE II = 1 + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + #pragma HLS UNROLL + // Shift every element in kernel_window to the left + kernel_window[i_iw * CONFIG_T::n_chan + i_ic] = kernel_window[(i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushChannel: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + #pragma HLS UNROLL + kernel_window[lastheight + i_ic] = in_elem[i_ic]; + } +} + +template +void kernel_shift_2d( + typename 
data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { + #pragma HLS inline + + // Shift kernel_window by one step to the left (manual shift operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + #pragma HLS PIPELINE II = 1 + KernelShiftHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift every element in kernel_window to the left + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_iw * CONFIG_T::n_chan + i_ic] = + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + (i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushHeight: + for (int i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + #pragma HLS UNROLL + KernelPushChannel: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + kernel_window[lastheight + i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_ic] = shift_buffer[i_ih][i_ic]; + } + } +} + +template +void shift_line_buffer( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]) { + + #pragma HLS PIPELINE + + // Temporary buffer for popped (shifted) elements + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = shift_buffer complete dim = 0 + +UpdateBuffer: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + #pragma HLS UNROLL + + // Insert pixel(s) at end of shift buffer + shift_buffer[CONFIG_T::filt_height - 1][i_ic] = in_elem[i_ic]; + } + +LineBufferDataIn: + for (int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift the shift buffer into the line buffer + LineBufferShift: + for (unsigned i_ih = 1; i_ih < CONFIG_T::filt_height; i_ih++) { + #pragma HLS UNROLL + typename data_T::value_type pop_elem = line_buffer[i_ih - 1][i_ic].shift( + shift_buffer[CONFIG_T::filt_height - i_ih][i_ic]); // Shift the line buffer, return the popped pixel + shift_buffer[CONFIG_T::filt_height - i_ih - 1][i_ic] = + pop_elem; // Popped element placed back into shift_buffer, one row up. 
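+ // Net effect: shift_buffer holds the current column's vertical slice (oldest row at index 0, the newly read pixel at the bottom), which kernel_shift_2d then pushes into the right-most kernel column.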
+ } + } + kernel_shift_2d(shift_buffer, kernel_window); +} + +template +void compute_output_buffer_2d( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE OFF + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // Counters + static int pX = 0; // Pixel X + static int pY = 0; // Pixel Y + + static int sX = 0; // Stride X + static int sY = 0; // Stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + + // Dense multiply + // #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
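+ // when the stride counter reaches filt_width - 1 a full window has been seen, so it rewinds by stride_width - 1; otherwise it increments (with stride 1 it stays at the threshold, emitting a window every pixel):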
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +// Conv 1D compute output +template +void compute_output_buffer_1d( + const data_T &in_elem, hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + + // Counters + static int pX = 0; // pixel counter + static int sX = 0; // stride counter + + static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = res_out complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + + // Dense multiply + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 00000000..c5155d84 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,49 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
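+ // A concrete layer would specialize this struct; a minimal sketch with hypothetical names and sizes (fc1_config, input_t, result_t, in, out, w1, b1 are illustrative, not from this project): + //     struct fc1_config : nnet::dense_config { + //         static const unsigned n_in = 16; + //         static const unsigned n_out = 32; + //         static const unsigned reuse_factor = 4; + //         typedef ap_fixed<32, 16> accum_t; + //         typedef ap_fixed<32, 16> weight_t; + //         typedef ap_fixed<32, 16> bias_t; + //     }; + //     nnet::dense<input_t, result_t, fc1_config>(in, out, w1, b1);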
+ // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS inline + if (CONFIG_T::strategy == nnet::latency) { + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_compressed.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 00000000..029b7480 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,90 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + #pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else + #pragma HLS data_pack variable=weights struct_level +#endif + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_latency.h 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_latency.h new file mode 100644 index 00000000..02802c45 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_latency.h @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_PARTITION variable=mult complete + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +// Accumulate multiplication result +Accum1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_resource.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_resource.h new file mode 100644 index 00000000..88de9472 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,263 @@ +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = 
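+ // number of weights consumed per reuse-loop iteration; for RF <= n_in this equals multiplier_limit, i.e. the multiplier count: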
DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + 
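+ // With RF % N_IN == 0, weight w = ir + rufactor * im feeds output outidx[ir] + im * outscale, where outidx[ir] = ir / nin was precomputed above to avoid the costly in-loop outstep++.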
#pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = 
cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_stream.h new file mode 100644 index 00000000..ad3a972e --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,68 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_types.h" +#include +#include + +namespace nnet { + +template +void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +template +void dense(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + typename data_T::value_type data[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=res complete + +DataPrepare: + for (int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_in / data_T::size > 1) { + #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + + dense_wrapper(data, res, weights, biases); + +ResWrite: + for (unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + if (CONFIG_T::n_out / res_T::size > 1) { + #pragma HLS PIPELINE + } + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed.h new file mode 100644 index 00000000..dfc77afa --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed.h @@ -0,0 +1,45 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // Layer Sizes + static const unsigned 
n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // This can save a few cycles, but it will create a large multiplexer due to + // non-constant access pattern, so let's leave it out + //#pragma HLS ARRAY_PARTITION variable=embeddings complete + +InputSequence: + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + res[j * CONFIG_T::n_out + i] = embeddings[data[j] * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed_stream.h new file mode 100644 index 00000000..79ae9bc1 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,33 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +template +void embedding(hls::stream &data, hls::stream &res, + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + data_T in_data = data.read(); + +InputSequence: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + res.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_garnet.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_garnet.h new file mode 100644 index 00000000..1fcd5545 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_garnet.h @@ -0,0 +1,816 @@ +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "hls_math.h" +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ap_ufixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. 
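+ // the float specialization spans distances [-32, 32) in steps of 64 / table_size and stores exp(-d^2) per bin; get_edge_weight below clamps and quantizes a distance to the matching bin: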
/ table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { + edge_weights_table[iw] = std::exp(-v * v); + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) / step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + #pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // #pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + #pragma HLS INLINE + #pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + #pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + #pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + #pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + 
weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + #pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + #pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + #pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + #pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const nvtx) { + #pragma HLS ARRAY_PARTITION variable=output_biases complete + #pragma HLS UNROLL region + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + #pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + #pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter { + typedef res_T res_t; + + res_T *resref; + + 
OutputResSetter(res_T *r) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + #pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + #pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + #pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::distance_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + #pragma HLS INLINE + #pragma HLS UNROLL region + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + #pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename res_setter_T::res_t acc = 
CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T data[prev_layer_t::n_out_features]; + #pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); + + SingleVertexDataGetter 
data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * 
CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; 
++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_helpers.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_helpers.h new file mode 100644 index 00000000..b8c2a48d --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_helpers.h @@ -0,0 +1,382 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "hls_stream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + std::istringstream(token) >> w[i]; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + 
std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::stream &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::stream &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround 
with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::stream &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + 
result.write(res_pack); + } + out << std::endl; +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +template int read_file_1D(const char *filename, dataType data[nrows]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii] = newval; + } else { + return -2; + } + } + fclose(fp); + return 0; +} + +template +int read_file_2D(const char *filename, dataType data[nrows][ncols]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + for (int jj = 0; jj < ncols; jj++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii][jj] = newval; + } else { + return -2; + } + } + } + fclose(fp); + return 0; +} + +template void change_type(hls::stream &in, hls::stream &out) { + in_T datareg; + hls::stream input_trunc; + for (int ii = 0; ii < N_IN; ii++) { + out << (out_T)in.read(); + } +} + +template void hls_stream_debug(hls::stream &data, hls::stream &res) { + data_T datareg; + for (int ii = 0; ii < N_IN; ii++) { + datareg = data.read(); + std::cout << "[" << ii << "]: " << datareg << std::endl; + res << datareg; + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image.h new file mode 100644 index 00000000..eeb45481 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image.h @@ -0,0 +1,41 @@ +#ifndef NNET_IMAGE_H_ +#define NNET_IMAGE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct resize_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned n_chan = 10; + static const unsigned new_height = 10; + static const unsigned new_width = 10; +}; + +template +void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { + int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; + int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; + int x2, y2; + + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::new_height; i++) { + for (int j = 0; j < CONFIG_T::new_width; j++) { + x2 = ((j * x_ratio) >> 16); + y2 = ((i * y_ratio) >> 16); + for (int k = 0; k < CONFIG_T::n_chan; k++) { + resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] = + image[(y2 * CONFIG_T::width * CONFIG_T::n_chan) + x2 * CONFIG_T::n_chan + k]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image_stream.h 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image_stream.h new file mode 100644 index 00000000..a23a93db --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_image_stream.h @@ -0,0 +1,66 @@ +#ifndef NNET_IMAGE_STREAM_H_ +#define NNET_IMAGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { + +template void resize_nearest(hls::stream &image, hls::stream &resized) { + assert(CONFIG_T::new_height % CONFIG_T::height == 0); + assert(CONFIG_T::new_width % CONFIG_T::width == 0); + constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height; + constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width; + +ImageHeight: + for (unsigned h = 0; h < CONFIG_T::height; h++) { + #pragma HLS PIPELINE + + data_T data_in_row[CONFIG_T::width]; + + ImageWidth: + for (unsigned i = 0; i < CONFIG_T::width; i++) { + #pragma HLS UNROLL + + data_T in_data = image.read(); + + ImageChan: + for (unsigned j = 0; j < CONFIG_T::n_chan; j++) { + #pragma HLS UNROLL + + data_in_row[i][j] = in_data[j]; + } + } + + ResizeHeight: + for (unsigned i = 0; i < ratio_height; i++) { + #pragma HLS UNROLL + + ImageWidth2: + for (unsigned l = 0; l < CONFIG_T::width; l++) { + #pragma HLS UNROLL + + ResizeWidth: + for (unsigned j = 0; j < ratio_width; j++) { + #pragma HLS UNROLL + + data_T out_data; + PRAGMA_DATA_PACK(out_data) + + ResizeChan: + for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { + #pragma HLS UNROLL + + out_data[k] = data_in_row[l][k]; + } + + resized.write(out_data); + } + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_math.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_math.h new file mode 100644 index 00000000..c021d8eb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_math.h @@ -0,0 +1,178 @@ +#ifndef NNET_MATH_H_ +#define NNET_MATH_H_ + +#include "hls_math.h" + +namespace nnet { + +// This header defines the functions that return type different from the input +// For example, hls::sin(x) returns ap_fixed +// By ensuring we return the same type we can avoid casting issues in expressions + +template T sin(T x) { return (T)hls::sin(x); }; + +template T cos(T x) { return (T)hls::cos(x); }; + +template T asin(T x) { return (T)hls::asin(x); }; + +template T acos(T x) { return (T)hls::acos(x); }; + +template T atan(T x) { return (T)hls::atan(x); }; + +template T atan2(T x, T y) { return (T)hls::atan2(x, y); }; + +template void init_sincos_table(T table[1 << (W - I - 3)][2]) { + unsigned int NTE = 1 << (W - I - 3); // No of table entries + double step = M_PI / (4 * NTE); // Interval between angles + double y = 0; + // double scaled_angle = 0; + + for (unsigned int i = 0; i < NTE; i++) { + table[i][0] = std::cos(y); + table[i][1] = std::sin(y); + y += step; + // scaled_angle = y/(2*M_PI); + // printf("cos(%f) = %23.22f, sin(%f) = %23.22f index = %d, scaled angle = %13.12f \n", y, cos(y), y, sin(y), i, + // scaled_angle); + } +} + +template void sincos_lut(const T &input, T output[2]) { + + #pragma HLS INLINE + + // This implementation is based on ac_sincos_lut.h from AC math library + + static bool flag = true; + if (flag && T::width - T::iwidth > 12) { +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", 
LINE : " << __LINE__ << std::endl; + std::cout << "Warning: The output of sincos_lut will not be accurate" << std::endl; +#endif + flag = false; + } + // Datatype for lookup table entries + typedef ap_ufixed luttype; + // Datatype for posinput which is used to handle negative inputs + typedef ap_ufixed posinputtype; + + typedef ap_uint<9> lutindextype; // 9 bits required for indexing into 512 entry table + typedef ap_uint<3> octanttype; // 3 bits required for octant value range of 0 thru 7 + T outputtemp[2]; + lutindextype luTdex = 0; + posinputtype posinput = input; + + // Initialize the lookup table +#ifdef __SYNTHESIS__ + bool initialized = false; + luttype sincos[512][2]; +#else + static bool initialized = false; + static luttype sincos[512][2]; +#endif + if (!initialized) { + init_sincos_table(sincos); + initialized = true; + } + + // Leaving this commented out makes the table to to BRAM + //#pragma HLS ARRAY_PARTITION variable=sincos complete dim=0 + + typedef ap_uint lutindextype1; + // Extracting (MSB-3:LSB) bits of scaled input to determine the lookup table index + lutindextype1 luTdex1 = posinput.range(AP_MAX(T::width - T::iwidth - 3, 1), 0); // Extracting the lookup table index + + if (T::width - T::iwidth >= 4 && T::width - T::iwidth <= 12) { + luTdex(8, 12 - (T::width - T::iwidth)) = luTdex1; // stride + } + // Approximation for the scaled inputs whose number of bits are greater than 12 + else if (T::width - T::iwidth > 12) { + // Lookup table index for the scaled inputs whose number of bits are greater than 12 + luTdex = luTdex1 / (1 << (AP_MAX(T::width - T::iwidth - 12, 0))); + if ((luTdex1 % (1 << (AP_MAX(T::width - T::iwidth - 12, 0)))) > (1 << (AP_MAX(T::width - T::iwidth - 13, 0)))) { + luTdex = luTdex + 1; + } + typedef ap_ufixed + datatype; + datatype x = (datatype)luTdex1; + x = x >> AP_MAX(T::width - T::iwidth - 12, 0); + if (x > 511.5) { + luTdex = 511; + } + if (luTdex1 <= 1 << (AP_MAX(T::width - T::iwidth - 13, 0)) && luTdex1 != 0) { + luTdex = 1; + } + } + + if (T::width - T::iwidth >= 3) { + // Getting the octant 0-7 by extracting the first 3 bits from MSB side of scaled input where + // octant 0 corresponds to [0-PI/4), + // octant 1 corresponds to [PI/4-2PI/4), + // octant 2 corresponds to [2PI/4-3PI/4) and so on + // octanttype octant = posinput.template slc<3>(T::width-T::iwidth-3); + octanttype octant = posinput(T::width - T::iwidth - 1, T::width - T::iwidth - 3); + luTdex = (octant[0] == 1) ? (lutindextype)(512 - luTdex) : (lutindextype)(luTdex); + // imaginary part is sine + outputtemp[1] = ((octant == 0) | (octant == 3)) ? (T)sincos[luTdex][1] + : ((octant == 2) | (octant == 1)) ? (T)sincos[luTdex][0] + : ((octant == 7) | (octant == 4)) ? (T)-sincos[luTdex][1] + : (T)-sincos[luTdex][0]; + // real part is cosine + outputtemp[0] = ((octant == 6) | (octant == 1)) ? (T)sincos[luTdex][1] + : ((octant == 3) | (octant == 4)) ? (T)-sincos[luTdex][0] + : ((octant == 2) | (octant == 5)) ? (T)-sincos[luTdex][1] + : (T)sincos[luTdex][0]; + // Below two are the cases when the output corresponds to + or - (0 or 1) for which there is no entry in the lookup + // table + output[1] = ((posinput == 0.125) | (posinput == 0.375)) ? T(0.7071067811865475244008) + : ((posinput == 0.625) | (posinput == 0.875)) ? T(-0.7071067811865475244008) + : outputtemp[1]; + output[0] = ((posinput == 0.125) | (posinput == 0.875)) ? T(0.7071067811865475244008) + : ((posinput == 0.375) | (posinput == 0.625)) ? 
T(-0.7071067811865475244008) + : outputtemp[0]; + } + + if (T::width - T::iwidth <= 2) { + output[1] = (posinput == 0) ? (T)0 + : (posinput == 0.25) ? (T)1 + : (posinput == 0.5) ? (T)0 + : (posinput == 0.75) ? (T)-1 + : outputtemp[1]; + output[0] = (posinput == 0) ? (T)1 + : (posinput == 0.25) ? (T)0 + : (posinput == 0.5) ? (T)-1 + : (posinput == 0.75) ? (T)0 + : outputtemp[0]; + } + +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl; + std::cout << "============AP_FIXED SINCOS======================" << std::endl; + std::cout << "positive input is = " << posinput << std::endl; + std::cout << "lut index is = " << luTdex << std::endl; + std::cout << "sin value is = " << output[1] << std::endl; + std::cout << "cos value is = " << output[0] << std::endl; + std::cout << "=================================================" << std::endl; +#endif +} + +template T sin_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ap_ufixed<16, 0>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[1]; +} + +template T cos_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ap_ufixed<16, 0>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[0]; +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge.h new file mode 100644 index 00000000..083c3185 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge.h @@ -0,0 +1,257 @@ +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + typedef float accum_t; + // Product function to use + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] + data2[ii]; + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] - data2[ii]; + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem*2], res_T res[CONFIG_T::n_elem*2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii*2] = data1[ii] * data2[ii*2]; + res[ii*2+1] = data1[ii] * data2[ii*2+1]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + 
res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + #pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + #pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * 
CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git 
a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge_stream.h new file mode 100644 index 00000000..a57ec78e --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,370 @@ +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AddPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template +void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] > in_data2[j]) ? 
in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 
0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + #pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < input1_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input1_T::size)] = in_data1[j]; + } + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + #pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < input2_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; + } + } + res.write(out_data); +} +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_mult.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 00000000..00d1c6d1 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,116 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. 
+ * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + #pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + #pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + #pragma HLS INLINE + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + #pragma HLS INLINE + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 00000000..e48a2fb4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,145 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int j = 0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < 
CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 00000000..9df5d540 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,85 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +#include + +namespace nnet { + +template void fill_zero(hls::stream &res) { + #pragma HLS INLINE + res_T res_part; + for (int c = 0; c < CONFIG_T::n_chan; c++) { + #pragma HLS UNROLL + res_part[c] = 0; + } + res.write(res_part); +} + +template void fill_data(hls::stream &data, hls::stream &res) { + #pragma HLS INLINE + data_T data_part = data.read(); + res_T res_part; + for (int c = 0; c < CONFIG_T::n_chan; c++) { + #pragma HLS UNROLL + res_part[c] = data_part[c]; + } + 
+
+template <class data_T, class res_T, typename CONFIG_T>
+void zeropad1d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res) {
+PadLeft:
+    for (int i = 0; i < CONFIG_T::pad_left; i++) {
+        fill_zero<res_T, CONFIG_T>(res);
+    }
+
+CopyMain:
+    for (int i = 0; i < CONFIG_T::in_width; i++) {
+        fill_data<data_T, res_T, CONFIG_T>(data, res);
+    }
+
+PadRight:
+    for (int i = 0; i < CONFIG_T::pad_right; i++) {
+        fill_zero<res_T, CONFIG_T>(res);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void zeropad2d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res) {
+
+PadTop:
+    for (int i = 0; i < CONFIG_T::pad_top; i++) {
+    PadTopWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    }
+
+PadMain:
+    for (int i = 0; i < CONFIG_T::in_height; i++) {
+    PadLeft:
+        for (int j = 0; j < CONFIG_T::pad_left; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    CopyMain:
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            fill_data<data_T, res_T, CONFIG_T>(data, res);
+        }
+    PadRight:
+        for (int j = 0; j < CONFIG_T::pad_right; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    }
+
+PadBottom:
+    for (int i = 0; i < CONFIG_T::pad_bottom; i++) {
+    PadBottomWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_T, CONFIG_T>(res);
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling.h
new file mode 100644
index 00000000..12ac8fe3
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling.h
@@ -0,0 +1,373 @@
+#ifndef NNET_POOLING_H_
+#define NNET_POOLING_H_
+
+#include "nnet_helpers.h"
+#include <iostream>
+
+namespace nnet {
+
+// Return the maximum value from an array
+template <class T, int N> T max(T x[N]) {
+    T y = x[0];
+    for (int i = 1; i < N; i++) {
+        y = x[i] > y ? x[i] : y;
+    }
+    return y;
+}
+
+template <int W, int N> ap_int<W> avg(ap_int<W> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_int<W + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_int<W> y = tmp;
+    return y;
+}
+
+template <int W, int N> ap_uint<W> avg(ap_uint<W> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_uint<W + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_uint<W> y = tmp;
+    return y;
+}
+
+template <int W, int I, int N> ap_fixed<W, I> avg(ap_fixed<W, I> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_fixed<W + ceillog2(N), I + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_fixed<W, I> y = tmp;
+    return y;
+}
+
+template <int W, int I, int N> ap_ufixed<W, I> avg(ap_ufixed<W, I> (&x)[N]) {
+    // Use a wider accumulator than the input to avoid overflow
+    ap_ufixed<W + ceillog2(N), I + ceillog2(N)> tmp = 0;
+    for (int i = 0; i < N; i++) {
+        tmp += x[i];
+    }
+    tmp /= N;
+    // Now cast back to original type
+    ap_ufixed<W, I> y = tmp;
+    return y;
+}
+
+// Return the mean value of an array
+template <class T, int N> T avg(T (&x)[N]) {
+    T y = 0;
+    for (int i = 0; i < N; i++) {
+        y += x[i];
+    }
+    y /= N;
+    return y;
+}
+
+// Enumeration for pooling operation (max, avg, l2norm pooling)
+enum Pool_Op { Max, Average }; // L2Norm };
+template <class T, int N, Pool_Op op> T pool_op(T (&x)[N]) {
+    switch (op) {
+    case Max:
+        return max<T, N>(x);
+    case Average:
+        return avg(x);
+        // case L2Norm: return l2norm<T, N>(x);
+    }
+}
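[Editor's aside — not part of the patch] A minimal usage sketch of the primitives above. avg() deliberately accumulates in a type widened by ceillog2(N) bits, so the running sum of N inputs cannot overflow before the divide; the window values here are invented for illustration.

    ap_fixed<16, 6> window[4] = {1.0, 2.0, 3.0, 6.0};
    ap_fixed<16, 6> largest = nnet::pool_op<ap_fixed<16, 6>, 4, nnet::Max>(window);  // 6.0
    ap_fixed<16, 6> mean = nnet::pool_op<ap_fixed<16, 6>, 4, nnet::Average>(window); // (1+2+3+6)/4 = 3.0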
+template <class T, Pool_Op op> T pad_val() {
+    /*---
+     *- In Tensorflow, pooling ignores the value in the padded cells
+     *- For Avg pooling, return 0 (the divisor is modified to the
+     *- area overlapping the unpadded image).
+     *- For max pooling, return the most negative value for the type.
+     *- TODO this is not really generic, it assumes fixed point or integer T
+    ---*/
+    switch (op) {
+    case Max: {
+        T x = 0;
+        x[x.width - 1] = 1;
+        return x;
+        break;
+    }
+    case Average:
+        return 0;
+    }
+}
+
+struct pooling1d_config {
+    // IO size
+    static const unsigned n_in = 10;
+    static const unsigned pool_width = 2;
+    static const unsigned stride_width = 2;
+    static const unsigned n_out = (n_in - pool_width) / stride_width + 1;
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const bool count_pad = false;
+    // Pooling function
+    static const Pool_Op pool_op = Max;
+};
+
+template <typename CONFIG_T> constexpr int pool_op_limit_1d() {
+    return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor;
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) {
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    // TODO partition the arrays according to the reuse factor
+    const int limit = pool_op_limit_1d<CONFIG_T>();
+    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+    // Add any necessary padding
+    unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right;
+    if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) {
+        padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width);
+    }
+
+    for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+        // Loop over input image x in steps of stride
+        for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) {
+            data_T pool[CONFIG_T::pool_width];
+            #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+            // Keep track of number of pixels in image vs padding region
+            unsigned img_overlap = 0;
+            // Loop over pool window x
+            for (int jj = 0; jj < CONFIG_T::stride_width; jj++) {
+                if (ii + jj < CONFIG_T::pad_left || ii + jj >= (padded_width - CONFIG_T::pad_right)) {
+                    // Add padding
+                    pool[jj] = pad_val<data_T, CONFIG_T::pool_op>();
+                    if (CONFIG_T::count_pad)
+                        img_overlap++;
+                } else {
+                    pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff];
+                    img_overlap++;
+                }
+            }
+            // do the pooling
+            // TODO in the case of average pooling, need to reduce width to area of pool window
+            // not overlapping padding region
+            res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] =
+                pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
+            // If the pool op is Average, the zero-padding needs to be removed from the results
+            if (CONFIG_T::pool_op == Average) {
+                data_T rescale = static_cast<data_T>(CONFIG_T::pool_width) / img_overlap;
+                res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale;
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) {
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+
+    // TODO partition the arrays according to the reuse factor
+    const int limit = pool_op_limit_1d<CONFIG_T>();
+    #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit
+
+    for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+        data_T pool[CONFIG_T::n_in];
+        #pragma HLS ARRAY_PARTITION variable=pool complete dim=0
+        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
+            pool[jj] = data[jj * CONFIG_T::n_filt + ff];
+        }
+        // do the pooling
+        res[ff] = pool_op<data_T, CONFIG_T::n_in, CONFIG_T::pool_op>(pool);
+    }
+}
+
+struct pooling2d_config {
+    // IO size
+    static const unsigned in_height = 10;
+    static const unsigned in_width = 10;
+    static const unsigned n_filt = 4;
+    static const unsigned stride_height
= 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * 
CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION instances=pool_op limit=limit function + +FiltLoop: + for (int filt = 0; filt < CONFIG_T::n_filt; 
filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 00000000..13d5979a --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,609 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "ap_shift_reg.h" +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "nnet_pooling.h" +#include "utils/x_hls_utils.h" + +namespace nnet { + +// ************************************************* +// Max/average pooling +// ************************************************* + +template T reduce_pool(T x[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template void init_pool_table(unsigned table[TABLE_SIZE]) { + for (unsigned ii = 0; ii < TABLE_SIZE; ii++) { + table[ii] = ii % POOL_SIZE; + } +} + +template +void compute_pool_encoded_2d( + const unsigned h_idx, const unsigned w_idx, const data_T &in_elem, + hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt], + hls::stream &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest H without unused pixels on the right + constexpr unsigned nH = + ((CONFIG_T::in_height - CONFIG_T::pool_height) / CONFIG_T::stride_height) * CONFIG_T::stride_height + + CONFIG_T::pool_height; + // Scaled H that behaves like original H + constexpr unsigned sH = + (DIV_ROUNDUP(CONFIG_T::pool_height, CONFIG_T::stride_height) - 1) * CONFIG_T::stride_height + CONFIG_T::pool_height; + // Nearest W without unused pixels on the right + constexpr unsigned nW = ((CONFIG_T::in_width - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_height[CONFIG_T::in_height]; + unsigned pool_table_width[CONFIG_T::in_width]; +#else + static bool initialized = false; + static unsigned pool_table_height[CONFIG_T::in_height]; + static unsigned pool_table_width[CONFIG_T::in_width]; +#endif + if (!initialized) { + init_pool_table(pool_table_height); + init_pool_table(pool_table_width); + initialized = true; + } + + #pragma HLS INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + #pragma HLS ARRAY_PARTITION variable=pool_table_height complete + #pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned sh_idx = pool_table_height[h_idx] * CONFIG_T::pool_width; + const unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_filt); + +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS 
PIPELINE + + ap_uint filt_mask = 0; + if ((h_idx < nH) && (wp_idx + p < nW)) { + filt_mask = sh_idx + pool_table_width[wp_idx + p] + 1; + } + + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write( + in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_height * CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_height * CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = + reduce_pool( + pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool( + pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling2d_encoded_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + hls::stream data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + constexpr int win_depth = CONFIG_T::pool_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + constexpr int pack_factor = data_T::size / CONFIG_T::n_filt; + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (pack_factor); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + #pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_2d(i_ih, i_iw, data.read(), data_window, res, res_pack, + outputs_ready); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void compute_pool_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt], + hls::stream &res) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + nnet::shift_line_buffer(in_elem, 
line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = + reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_buffer_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1, 1)] + [CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +template +void pooling2d_cl(hls::stream &data, hls::stream &res) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling2d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling2d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* + +template +void compute_pool_encoded_1d(const unsigned w_idx, const data_T &in_elem, + hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt], + hls::stream &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest W without unused pixels on the right + constexpr unsigned nW = + ((CONFIG_T::n_in - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_width[CONFIG_T::n_in]; +#else + static bool initialized = false; + static unsigned pool_table_width[CONFIG_T::n_in]; +#endif + if (!initialized) { + init_pool_table(pool_table_width); + initialized = true; + } + + #pragma HLS INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + #pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned wp_idx = w_idx * (data_T::size / 
CONFIG_T::n_filt); + +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS PIPELINE + + ap_uint filt_mask = 0; + if (wp_idx + p < nW) { + filt_mask = pool_table_width[wp_idx + p] + 1; + } + + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write(in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = reduce_pool(pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool(pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling1d_encoded_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + hls::stream data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; + constexpr int win_depth = CONFIG_T::n_out; + for (unsigned i_out = 0; i_out < CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + constexpr int pack_factor = data_T::size / CONFIG_T::n_filt; + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (pack_factor); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + #pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_1d(i_iw, data.read(), data_window, res, res_pack, outputs_ready); + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) 1D +// ************************************************* +template +void compute_pool_buffer_1d(const data_T &in_elem, hls::stream &res) { + #pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_buffer_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + #pragma HLS LOOP_FLATTEN + #pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + +template +void pooling1d_cl(hls::stream &data, hls::stream &res) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling1d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling1d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template T reduce_global_pool(T x, T y[N]) { + #pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? 
x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]) { +PoolFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + #pragma HLS UNROLL + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + #pragma HLS UNROLL + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool( + data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + AvgPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } +} + +template +void global_pooling1d_cl(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + #pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + init = hls::numeric_limits::min(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + #pragma HLS UNROLL + data_window[i_init] = init; + } + +ReadInput: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + #pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + #pragma HLS PIPELINE + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } 
else {
+    AvgPoolRes:
+        for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) {
+            #pragma HLS PIPELINE
+
+            res_T res_pack;
+            PRAGMA_DATA_PACK(res_pack)
+        AvgPoolPack:
+            for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) {
+                #pragma HLS UNROLL
+                res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in;
+            }
+            res.write(res_pack);
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recr_activations.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recr_activations.h
new file mode 100644
index 00000000..f68d8066
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recr_activations.h
@@ -0,0 +1,56 @@
+#ifndef NNET_RECR_ACTIVATION_H_
+#define NNET_RECR_ACTIVATION_H_
+
+#include "hls_stream.h"
+#include "nnet_activation.h"
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include <math.h>
+
+namespace nnet {
+
+namespace activation {
+
+template <class data_T, class res_T, typename CONFIG_T> class Activation {
+  public:
+    // *************************************************
+    //       Blank Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class relu : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       Relu Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::relu<data_T, res_T, CONFIG_T>(data, res);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class sigmoid : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       Sigmoid Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::sigmoid<data_T, res_T, CONFIG_T>(data, res);
+    }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class tanh : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    //       TanH Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::tanh<data_T, res_T, CONFIG_T>(data, res);
+    }
+};
+
+} // namespace activation
+
+} // namespace nnet
+
+#endif
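[Editor's aside — not part of the patch] The recurrent configs in the next file select these wrappers through template aliases. A hypothetical config using the standard Keras pairing (sigmoid for the gates, tanh for the candidate/output) instead of the relu defaults would look like the sketch below; the name my_lstm_config is invented, and real configs are emitted by hls4ml alongside the model.

    struct my_lstm_config : nnet::lstm_config {
        // Gate activations (i, f, o for LSTM; z, r for GRU)
        template <class x_T, class y_T, class config_T>
        using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;
        // Candidate-state / output activation
        template <class x_T, class y_T, class config_T>
        using activation = nnet::activation::tanh<x_T, y_T, config_T>;
    };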
diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recurrent.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recurrent.h
new file mode 100644
index 00000000..6e868148
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_recurrent.h
@@ -0,0 +1,571 @@
+#ifndef NNET_RECURSIVE_H_
+#define NNET_RECURSIVE_H_
+
+#include "hls_stream.h"
+#include "nnet_activation.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recr_activations.h"
+
+namespace nnet {
+
+struct lstm_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 2;
+    static const unsigned n_parts = 20;
+    static const unsigned n_out = 2;
+    static const unsigned n_state = 2;
+    static const unsigned n_4state = 8;
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+    static const bool store_weights_in_bram = false;
+    static const bool use_static = true;
+
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+// Long Short-Term Memory NN (LSTM)
+// Resources:
+// https://github.com/nicodjimenez/lstm/blob/master/lstm.py
+// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb
+// https://en.wikipedia.org/wiki/Long_short-term_memory
+// Notes:
+//  - LSTM naming conventions adopted from the above links
+//      - s_newstate = activation(U*input + W*state)
+//      - h_output = activation(U*input + W*state)*activation(s_newstate)
+//  - If softmax is needed on the output, perform it *outside* this operation
+// Originally there was a version that allowed the state in each layer to be saved; that was moved above
+// (it requires a LARGE dense network at the end)
+template <class data_T, class res_T, typename CONFIG_T>
+void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+          res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in],
+          typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state],
+          typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4],
+          typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) {
+    // Initialize the state variable -- will maintain state between function calls
+
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3];   // activated i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state];         // activated c-matrix (keras notation)
+    typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state];       // c-matrix (keras notation)
+    typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state];
+
+    #pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=s_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_c complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_c complete
+    #pragma HLS ARRAY_PARTITION variable=s_actstate complete
+
+    nnet::dense<data_T, typename CONFIG_T::accum_t, typename CONFIG_T::mult_config1>(data, tmpres, param, param_b);
+    nnet::dense<res_T, typename CONFIG_T::accum_t, typename CONFIG_T::mult_config2>(h_newstate, tmpres_state, param_r, param_br);
+
+    for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc;
+        if (iacc > 2 * CONFIG_T::n_state - 1)
+            index = iacc + CONFIG_T::n_state;
+        inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index];
+    }
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_c[iacc] = tmpres[index] + tmpres_state[index];
+    }
+
+    CONFIG_T::template activation_recr<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
+                                       typename CONFIG_T::ACT_CONFIG_LSTM>::activation(inputacc_ifo, tmpres_ifo);
+
+    // Now for the confusion matrix
+    CONFIG_T::template activation<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
+                                  typename CONFIG_T::ACT_CONFIG_T>::activation(inputacc_c, tmpres_c);
+
+    // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues)
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)];
+    }
+    // Operation: h=act(s)*o
+    CONFIG_T::template activation<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
+                                  typename CONFIG_T::ACT_CONFIG_T>::activation(s_newstate, s_actstate);
+
+    for (int iacc = 0; iacc < CONFIG_T::n_state; 
iacc++) { + #pragma HLS UNROLL + h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + } +} + +template +void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + static res_T h_state[CONFIG_T::n_state]; + static res_T s_state[CONFIG_T::n_state]; + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=s_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + #pragma HLS ARRAY_PARTITION variable=tmpres_c complete + #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + #pragma HLS ARRAY_PARTITION variable=inputacc_c complete + #pragma HLS ARRAY_PARTITION variable=s_actstate complete + + if (reset_state) { + for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { + #pragma HLS UNROLL + s_state[i_state] = 0; + h_state[i_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state, param_r, + param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + #pragma HLS UNROLL + h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T 
res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +// Struct for the GRU template + +struct gru_config { + // Internal data type 
+
+// Struct for the GRU template
+
+struct gru_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 2;
+    static const unsigned n_out = 2;
+    static const unsigned n_state = 2;
+    static const unsigned n_sequence = 2;
+    static const unsigned n_4state = 8;
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const bool use_static = true;
+    static const unsigned n_zeros = 0;
+
+    template <class x_T, class y_T, class config_T> using activation_recr = nnet::activation::relu<x_T, y_T, config_T>;
+    template <class x_T, class y_T, class config_T> using activation = nnet::activation::relu<x_T, y_T, config_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+         typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param
+                                                                                    // weights - refer page in copy!!
+         typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+         typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+         typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+    // Initialize the state variable -- will maintain state between function calls
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2];   // activated z,r gate vectors (keras notation)
+    typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state];        // activated candidate (h~) vector (keras notation)
+    typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r gate pre-activations (keras notation)
+    typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state];      // candidate pre-activation (keras notation)
+
+    #pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_h complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_h complete
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_newstate, tmpres_state_zr, param_zr,
+                param_br);
+
+    // Add the individual vectors from the multiplications tmpres = Wx*x(t) and
+    // tmpres_state_zr = Wh*h(t-1); tmpres is initialized with the biases
+    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc;
+        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
+    }
+
+    // Activation function Sub layer -- START
+    CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr);
+
+    // Activation function Sub layer -- END
+
+    // Hadamard product of r(t) = tmpres_zr[n_state:2*n_state] and the h(t-1)
+    // contribution tmpres_state_zr[2*n_state:3*n_state]
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+    }
+
+    // Assuming reset_after is false
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
+    }
+
+    // Apply the activation to the candidate state
+    CONFIG_T::template activation::activation(inputacc_h, tmpres_h);
+
+    // Mix the candidate state with the previous state
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]);
+    }
+}
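The loop that closes gru() above is the standard GRU blend h_new = (1 - z)*h_cand + z*h_prev, with the z gate in tmpres_zr[0:n_state] and the candidate h~ in tmpres_h. A plain-C++ reference of just that element-wise update (float types and names are illustrative, not part of the generated firmware):

#include <cstddef>

// Sketch only: per-element GRU state blend matching the final loop of nnet::gru.
void gru_blend_ref(const float *z, const float *h_cand, const float *h_prev,
                   float *h_new, std::size_t n_state) {
    for (std::size_t i = 0; i < n_state; i++) {
        h_new[i] = h_cand[i] * (1.0f - z[i]) + h_prev[i] * z[i];
    }
}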
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+                typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
+                typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+                typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+                typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+    // Initialize the state variable -- will maintain state between function calls
+
+    static res_T h_state[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2];   // activated z,r gate vectors (keras notation)
+    typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state];        // activated candidate (h~) vector (keras notation)
+    typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r gate pre-activations (keras notation)
+    typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state];      // candidate pre-activation (keras notation)
+
+    #pragma HLS ARRAY_PARTITION variable=h_state complete
+    #pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete
+    #pragma HLS ARRAY_PARTITION variable=tmpres_h complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete
+    #pragma HLS ARRAY_PARTITION variable=inputacc_h complete
+
+    if (reset_state) {
+        for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) {
+            #pragma HLS UNROLL
+            h_state[i_h_state] = 0;
+        }
+    }
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_state, tmpres_state_zr, param_zr,
+                param_br);
+
+    // Add the individual vectors from the multiplications tmpres = Wx*x(t) and
+    // tmpres_state_zr = Wh*h(t-1); tmpres is initialized with the biases
+    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc;
+        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
+    }
+
+    // Activation function Sub layer -- START
+    CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr);
+
+    // Activation function Sub layer -- END
+
+    // Hadamard product of r(t) = tmpres_zr[n_state:2*n_state] and the h(t-1)
+    // contribution tmpres_state_zr[2*n_state:3*n_state]
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+    }
+
+    // Assuming reset_after is false
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
+    }
+
+    // Apply the activation to the candidate state
+    CONFIG_T::template activation::activation(inputacc_h, tmpres_h);
+
+    // Mix the candidate state with the previous state
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        #pragma HLS UNROLL
+        h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]);
+        h_newstate[iacc] = h_state[iacc];
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void gru_stack(data_T data[CONFIG_T::n_sequence * 
CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + res_T h_state[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=data_in complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_state[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_state[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_state[i]; + } +} + +template +void gru_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::gru_static( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_newstate, + param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv1d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 00000000..254fc506 --- /dev/null +++ 
b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,119 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv1d_stream.h" +#include "nnet_sepconv_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } +} + +template +void depthwise_conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void depthwise_conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } +} + +template +void 
separable_conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv2d_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 00000000..d56ed6d9 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,143 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv2d_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_types.h" + +namespace nnet { + +template +void depthwise_conv_2d_encoded_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + hls::stream data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } + } +} + +// Line Buffer Implementation (Phil's) +template +void depthwise_conv_2d_buffer_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 
0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1] + [CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (CONFIG_T::filt_height > 1) { + compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void depthwise_conv_2d_cl( + hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_2d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + +template +void separable_conv_2d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS DATAFLOW + + hls::stream depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv_stream.h new file mode 100644 index 
00000000..9c16de19
--- /dev/null
+++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_sepconv_stream.h
@@ -0,0 +1,306 @@
+#ifndef NNET_SEPARABLE_CONV_STREAM_H_
+#define NNET_SEPARABLE_CONV_STREAM_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_conv_stream.h"
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan],
+                       typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                       typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE
+
+    typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan];
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_chan];
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=weights
+
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    #pragma HLS ARRAY_PARTITION variable=mult complete
+
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit
+
+// Do the matrix-multiply
+Product:
+    for (int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) {
+        #pragma HLS UNROLL
+        mult[ii] = CONFIG_T::mult_config::template product::product(
+            data[ii], weights[ii]);
+    }
+
+// Initialize accumulator with input biases
+ResetAccum:
+    for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+
+// Accumulate multiplication result
+Accum1:
+    for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) {
+    Accum2:
+        for (int jj = 0; jj < CONFIG_T::n_chan; jj++) {
+            int index = ii * CONFIG_T::n_chan + jj;
+            acc[jj] += mult[index];
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_chan; ires++) {
+        #pragma HLS UNROLL
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
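The Product/ResetAccum/Accum loops above compute one independent dot product per channel: output channel c accumulates kernel_size products data[k*n_chan + c] * weights[k*n_chan + c] on top of its bias, which is what makes the convolution depthwise. A plain-C++ reference of the same arithmetic (float types and names are illustrative, not part of the generated firmware):

// Sketch only: channel-wise multiply-accumulate equivalent to depthwise_product.
void depthwise_product_ref(const float *data, const float *weights, const float *biases,
                           float *res, int kernel_size, int n_chan) {
    for (int c = 0; c < n_chan; c++) {
        float acc = biases[c]; // each accumulator starts at the channel bias
        for (int k = 0; k < kernel_size; k++) {
            acc += data[k * n_chan + c] * weights[k * n_chan + c];
        }
        res[c] = acc;
    }
}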
+
+template <class data_T, class res_T, typename CONFIG_T>
+void depthwise_mult_buffer(hls::stream<typename data_T::value_type> data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                           res_T &res_pack, hls::stream<res_T> &res_stream, unsigned &outputs_ready,
+                           typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                           typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE
+
+    typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=data complete
+    typename res_T::value_type res[CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=res complete
+
+InitData:
+    for (int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) {
+        #pragma HLS UNROLL
+        data[id] = data_window[id].read();
+    }
+
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) {
+        depthwise_product<typename data_T::value_type, typename res_T::value_type, CONFIG_T>(data, res, weights, biases);
+    } else {
+        assert("Resource strategy for DepthwiseConv2D is not supported." && false);
+    }
+
+CastLoop:
+    for (unsigned jj = 0; jj < CONFIG_T::n_chan; jj++) {
+        #pragma HLS UNROLL
+        if (res_T::size / CONFIG_T::n_chan == 1) {
+            res_pack[jj] = res[jj];
+        } else {
+            res_pack[outputs_ready * CONFIG_T::n_chan + jj] = res[jj];
+        }
+    }
+
+    if (res_T::size / CONFIG_T::n_chan == 1) {
+        res_stream.write(res_pack);
+    } else {
+        if (outputs_ready == (res_T::size / CONFIG_T::n_chan) - 1) {
+            res_stream.write(res_pack);
+            outputs_ready = 0;
+        } else {
+            outputs_ready++;
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void compute_depthwise_output_encoded(
+    const data_T &in_elem, hls::stream<typename data_T::value_type> data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+    hls::stream<res_T> &res, res_T &res_pack, unsigned &outputs_ready,
+    typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_chan], ap_uint<CONFIG_T::kernel_size> *pixel_idx) {
+    #pragma HLS INLINE
+
+MultLoop:
+    for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    CopyDataFilt:
+        for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) {
+            #pragma HLS UNROLL
+        CopyDataChan:
+            for (unsigned c = 0; c < CONFIG_T::n_chan; c++) {
+                #pragma HLS UNROLL
+                if (pixel_idx[p][f])
+                    data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]);
+            }
+        }
+        if (pixel_idx[p][CONFIG_T::kernel_size - 1]) {
+            depthwise_mult_buffer<data_T, res_T, CONFIG_T>(data_window, res_pack, res, outputs_ready, weights, biases);
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_mult_buffer(const data_T &data_pack, hls::stream<res_T> &res_stream,
+                           typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                           typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    #pragma HLS INLINE
+
+    typename data_T::value_type data[CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=data complete
+
+    typename res_T::value_type res[CONFIG_T::n_filt];
+    #pragma HLS ARRAY_PARTITION variable=res complete
+
+    res_T res_pack;
+    PRAGMA_DATA_PACK(res_pack)
+
+InitData:
+    for (int id = 0; id < CONFIG_T::n_chan; id++) {
+        #pragma HLS UNROLL
+        data[id] = data_pack[id];
+    }
+
+    #pragma HLS INLINE recursive
+    if (CONFIG_T::strategy == nnet::latency) {
+        dense_latency<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(
+            data, res, weights, biases);
+    } else {
+        dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(
+            data, res, weights, biases);
+    }
+
+CastLoop:
+    for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) {
+        #pragma HLS UNROLL
+        res_pack[jj] = res[jj];
+    }
+
+    res_stream.write(res_pack);
+}
+
+// Line Buffer Implementation (Phil's)
+template <class data_T, class res_T, typename CONFIG_T>
+void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream<res_T> &res_stream,
+                                        typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan],
+                                        typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) {
+    #pragma HLS INLINE
+
+    // Thresholds
+    const static int lShiftX = CONFIG_T::filt_width - 1;
+
+    // Counters
+    static int pX = 0;
+    static int sX = 0;
+
+    static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=kernel_data complete
+
+    typename res_T::value_type res_out[CONFIG_T::n_chan];
+    #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0
+
+    res_T res_pack;
+    PRAGMA_DATA_PACK(res_pack)
+
+    // Add pixel to buffer
+    nnet::kernel_shift_1d<data_T, CONFIG_T>(in_elem, kernel_data);
+
+    // Check to see if we have a full kernel
+    if ((sX - lShiftX) == 0 && pX > lShiftX - 1) {
+        // Dense multiply
+        #pragma HLS INLINE recursive
+        if (CONFIG_T::strategy == nnet::latency) {
+            depthwise_product<typename data_T::value_type, typename res_T::value_type, CONFIG_T>(kernel_data, res_out,
+                                                                                                 weights, biases);
+        } else {
+            assert("Resource strategy for 
DepthwiseConv1D is not supported." && false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void compute_depthwise_output_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + hls::stream &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // counters + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + + static int sX = 0; // stride X + static int sY = 0; // stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + // Dense multiply + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_product(kernel_data, res_out, + weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." && false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_stream.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_stream.h new file mode 100644 index 00000000..900db16c --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_stream.h @@ -0,0 +1,207 @@ + +#ifndef NNET_STREAM_H +#define NNET_STREAM_H + +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; +}; + +template +void clone_stream(hls::stream &data, hls::stream &res1, hls::stream &res2) { +CloneLoop: + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + } +} + +template +void clone_stream(hls::stream &data, hls::stream &res1, hls::stream &res2, hls::stream &res3) { +CloneLoop: + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + res_T out_data3; + PRAGMA_DATA_PACK(out_data1) + PRAGMA_DATA_PACK(out_data2) + PRAGMA_DATA_PACK(out_data3) + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + res3.write(out_data3); + } +} + +template void repack_stream(hls::stream &data, hls::stream &res) { + if (data_T::size == res_T::size) { + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + for (int i = 0; i < N / data_T::size; i++) { + if (N / data_T::size > 1) { + #pragma HLS PIPELINE + } + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + for (int j = 0; j < pack_diff; j++) { + #pragma HLS PIPELINE + + res_T out_data; + for (int k = 0; k < res_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + for (int i = 0; i < N / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +template +void broadcast_stream_1x1xC(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == 
CONFIG_T::out_chan); + int n_dupl = (CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::out_chan) / + (CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < n_dupl; j++) { + #pragma HLS PIPELINE + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data[k]; + } + res.write(out_data); + } + } +} + +template +void broadcast_stream_HxWx1(hls::stream &data, hls::stream &res) { + assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int k = 0; k < res_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data[0]; + } + res.write(out_data); + } +} + +template +void broadcast_stream(hls::stream &data, hls::stream &res) { + if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { + broadcast_stream_1x1xC(data, res); + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width) { + broadcast_stream_HxWx1(data, res); + } +} + +template +void transpose_2d(hls::stream &data, hls::stream &res) { + typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + #pragma HLS ARRAY_PARTITION variable=data_array complete + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { + #pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { + #pragma HLS PIPELINE + res_T out_data; + PRAGMA_DATA_PACK(out_data) + for (int j = 0; j < res_T::size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + } + res.write(out_data); + } +} +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_types.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_types.h new file mode 100644 index 00000000..0fcac134 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/nnet_utils/nnet_types.h @@ -0,0 +1,64 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include + +namespace nnet { + +// Fixed-size array +template struct array { + typedef T value_type; + static const unsigned size = N; + + T data[N]; + + T &operator[](size_t pos) { return data[pos]; } + + const T &operator[](size_t pos) const { return data[pos]; } + + array &operator=(const array &other) { + if (&other == this) + return *this; + + assert(N == other.size && "Array sizes must match."); + + for (unsigned i = 0; i < N; i++) { + #pragma HLS UNROLL + data[i] = other[i]; + } + return *this; + } +}; + +// Generic lookup-table implementation, for use in approximations of math functions +template class lookup_table { + public: + lookup_table(T from, T to) : range_start(from), range_end(to), base_div(ap_uint<16>(N) / T(to - 
from)) { + T step = (range_end - range_start) / ap_uint<16>(N); + for (size_t i = 0; i < N; i++) { + T num = range_start + ap_uint<16>(i) * step; + T sample = func(num); + samples[i] = sample; + } + } + + T operator()(T n) const { + int index = (n - range_start) * base_div; + if (index < 0) + index = 0; + else if (index > N - 1) + index = N - 1; + return samples[index]; + } + + private: + T samples[N]; + const T range_start, range_end; + ap_fixed<20, 16> base_div; +}; + +} // namespace nnet + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/parameters.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/parameters.h new file mode 100644 index 00000000..9d4d11a0 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/parameters.h @@ -0,0 +1,247 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include "ap_fixed.h" +#include "ap_int.h" + +#include "nnet_utils/nnet_code_gen.h" +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes +#include "nnet_utils/nnet_activation.h" +#include "nnet_utils/nnet_activation_stream.h" +#include "nnet_utils/nnet_conv1d.h" +#include "nnet_utils/nnet_embed.h" +#include "nnet_utils/nnet_embed_stream.h" +#include "nnet_utils/nnet_merge.h" +#include "nnet_utils/nnet_merge_stream.h" +#include "nnet_utils/nnet_pooling.h" +#include "nnet_utils/nnet_pooling_stream.h" +#include "nnet_utils/nnet_sepconv1d_stream.h" + +// hls-fpga-machine-learning insert weights +#include "weights/e3.h" +#include "weights/e4.h" +#include "weights/w22.h" +#include "weights/b22.h" +#include "weights/w23.h" +#include "weights/b23.h" +#include "weights/w24.h" +#include "weights/b24.h" + +// hls-fpga-machine-learning insert layer-config +// embedding0 +struct config3 : nnet::embed_config { + static const unsigned n_in = 100; + static const unsigned n_out = 2; + static const unsigned vocab_size = 6; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef embedding0_embeddings_t embeddings_t; +}; + +// embedding1 +struct config4 : nnet::embed_config { + static const unsigned n_in = 100; + static const unsigned n_out = 2; + static const unsigned vocab_size = 4; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef embedding1_embeddings_t embeddings_t; +}; + +// concatenate +struct config6 : nnet::concat_config { + static const unsigned n_elem1_0 = 100; + static const unsigned n_elem1_1 = 2; + static const unsigned n_elem1_2 = 0; + static const unsigned n_elem2_0 = 100; + static const unsigned n_elem2_1 = 2; + static const unsigned n_elem2_2 = 0; + + static const int axis = -1; +}; + +// concatenate_1 +struct config7 : nnet::concat_config { + static const unsigned n_elem1_0 = 100; + static const unsigned n_elem1_1 = 4; + static const unsigned n_elem1_2 = 0; + static const unsigned n_elem2_0 = 100; + static const unsigned n_elem2_1 = 4; + static const unsigned n_elem2_2 = 0; + + static const int axis = -1; +}; + +// dense +struct config22_mult : nnet::dense_config { + static const unsigned n_in = 8; + static const unsigned n_out = 12; + static const unsigned reuse_factor = 1; + static const unsigned strategy = nnet::latency; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef model_default_t accum_t; + typedef dense_bias_t bias_t; + typedef 
dense_weight_t weight_t; + template + using product = nnet::product::mult; +}; + +struct config22 : nnet::conv1d_config { + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 100; + static const unsigned n_chan = 8; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 12; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 100; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::latency; + static const nnet::conv_implementation implementation = nnet::conv_implementation::linebuffer; + static const unsigned min_width = 100; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = 100; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::fill_buffer_22; + typedef model_default_t accum_t; + typedef dense_bias_t bias_t; + typedef dense_weight_t weight_t; + typedef config22_mult mult_config; + template + using scale_index = nnet::scale_index_unscaled; +}; +const ap_uint config22::pixels[] = {0}; + +// activation +struct tanh_config11 : nnet::activ_config { + static const unsigned n_in = 1200; + static const unsigned table_size = 1024; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef activation_table_t table_t; +}; + +// dense_1 +struct config23_mult : nnet::dense_config { + static const unsigned n_in = 12; + static const unsigned n_out = 36; + static const unsigned reuse_factor = 1; + static const unsigned strategy = nnet::latency; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef model_default_t accum_t; + typedef dense_1_bias_t bias_t; + typedef dense_1_weight_t weight_t; + template + using product = nnet::product::mult; +}; + +struct config23 : nnet::conv1d_config { + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 100; + static const unsigned n_chan = 12; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 36; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 100; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::latency; + static const nnet::conv_implementation implementation = nnet::conv_implementation::linebuffer; + static const unsigned min_width = 100; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = 100; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::fill_buffer_23; + typedef model_default_t accum_t; + typedef dense_1_bias_t bias_t; + typedef dense_1_weight_t weight_t; + typedef config23_mult mult_config; + template + using scale_index = nnet::scale_index_unscaled; +}; +const ap_uint config23::pixels[] = {0}; + +// activation_1 +struct tanh_config15 : nnet::activ_config { + static const unsigned n_in = 3600; + static const unsigned table_size = 1024; + static const unsigned io_type = nnet::io_parallel; + static const unsigned reuse_factor = 1; + typedef 
activation_1_table_t table_t; +}; + +// met_weight +struct config24_mult : nnet::dense_config { + static const unsigned n_in = 36; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + static const unsigned strategy = nnet::latency; + static const unsigned n_zeros = 0; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef model_default_t accum_t; + typedef met_weight_bias_t bias_t; + typedef met_weight_weight_t weight_t; + template + using product = nnet::product::mult; +}; + +struct config24 : nnet::conv1d_config { + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 100; + static const unsigned n_chan = 36; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 100; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::latency; + static const nnet::conv_implementation implementation = nnet::conv_implementation::linebuffer; + static const unsigned min_width = 100; + static const ap_uint pixels[min_width]; + static const unsigned n_partitions = 100; + static const unsigned n_pixels = out_width / n_partitions; + template + using fill_buffer = nnet::fill_buffer_24; + typedef model_default_t accum_t; + typedef met_weight_bias_t bias_t; + typedef met_weight_weight_t weight_t; + typedef config24_mult mult_config; + template + using scale_index = nnet::scale_index_unscaled; +}; +const ap_uint config24::pixels[] = {0}; + +// multiply +struct config20 : nnet::merge_config { + static const unsigned n_elem = N_OUTPUTS_24*N_FILT_24; +}; + +// output +struct config21 : nnet::pooling1d_config { + static const unsigned n_in = 100; + static const unsigned n_filt = 2; + static const nnet::Pool_Op pool_op = nnet::Average; + static const unsigned reuse_factor = 1; + typedef model_default_t accum_t; +}; + + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.h new file mode 100644 index 00000000..e9c30326 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.h @@ -0,0 +1,15 @@ +//Numpy array shape [12] +//Min -0.455119371414 +//Max 0.398226708174 +//Number of zeros 0 + +#ifndef B22_H_ +#define B22_H_ + +#ifndef __SYNTHESIS__ +dense_bias_t b22[12]; +#else +dense_bias_t b22[12] = {-0.227416396141052, -0.321803480386734, -0.105886071920395, 0.004980653524399, -1.102990508079529, 1.840189456939697, -0.065355993807316, -0.420345693826675, -0.125013768672943, -0.633407652378082, 0.452038317918777, -0.057287767529488}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.txt new file mode 100644 index 00000000..c6c56a2f --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b22.txt @@ -0,0 +1 @@ +-0.227416396141052, -0.321803480386734, -0.105886071920395, 0.004980653524399, -1.102990508079529, 1.840189456939697, -0.065355993807316, -0.420345693826675, 
-0.125013768672943, -0.633407652378082, 0.452038317918777, -0.057287767529488 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.h new file mode 100644 index 00000000..2665bfe2 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.h @@ -0,0 +1,15 @@ +//Numpy array shape [36] +//Min -0.522930324078 +//Max 0.388318747282 +//Number of zeros 0 + +#ifndef B23_H_ +#define B23_H_ + +#ifndef __SYNTHESIS__ +dense_1_bias_t b23[36]; +#else +dense_1_bias_t b23[36] = {-28.527759552001953, -6.611515045166016, -14.351591110229492, -3.294915914535522, 14.957226753234863, -5.450253486633301, -5.768840312957764, 1.048536539077759, -1.573255777359009, -4.288578033447266, -2.320878744125366, 2.320586442947388, -2.193000793457031, 14.887507438659668, 2.135548591613770, -6.345302581787109, 1.965700864791870, -6.714401245117188, -1.507563710212708, -7.482578754425049, -5.760603904724121, -8.901734352111816, 4.178072929382324, -7.702874183654785, -5.517005920410156, 2.493387222290039, -5.700569152832031, 3.564873695373535, 1.121586322784424, 8.881909370422363, 6.257650375366211, -0.310464382171631, 1.509941101074219, 5.575150012969971, -4.270040988922119, 4.464414119720459}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.txt new file mode 100644 index 00000000..e14f7cf8 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b23.txt @@ -0,0 +1 @@ +-28.527759552001953, -6.611515045166016, -14.351591110229492, -3.294915914535522, 14.957226753234863, -5.450253486633301, -5.768840312957764, 1.048536539077759, -1.573255777359009, -4.288578033447266, -2.320878744125366, 2.320586442947388, -2.193000793457031, 14.887507438659668, 2.135548591613770, -6.345302581787109, 1.965700864791870, -6.714401245117188, -1.507563710212708, -7.482578754425049, -5.760603904724121, -8.901734352111816, 4.178072929382324, -7.702874183654785, -5.517005920410156, 2.493387222290039, -5.700569152832031, 3.564873695373535, 1.121586322784424, 8.881909370422363, 6.257650375366211, -0.310464382171631, 1.509941101074219, 5.575150012969971, -4.270040988922119, 4.464414119720459 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.h new file mode 100644 index 00000000..9daede16 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.h @@ -0,0 +1,15 @@ +//Numpy array shape [1] +//Min 3.417605638504 +//Max 3.417605638504 +//Number of zeros 0 + +#ifndef B24_H_ +#define B24_H_ + +#ifndef __SYNTHESIS__ +met_weight_bias_t b24[1]; +#else +met_weight_bias_t b24[1] = {2.417605638504028}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.txt new file mode 100644 index 00000000..42659b3e --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/b24.txt @@ -0,0 +1 @@ +2.417605638504028 \ No newline at end of 
file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.h new file mode 100644 index 00000000..34773dd1 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.h @@ -0,0 +1,15 @@ +//Numpy array shape [6, 2] +//Min -2.672395467758 +//Max 2.548557043076 +//Number of zeros 0 + +#ifndef E3_H_ +#define E3_H_ + +#ifndef __SYNTHESIS__ +embedding0_embeddings_t e3[12]; +#else +embedding0_embeddings_t e3[12] = {1.620906114578247, -0.427226632833481, -2.672395467758179, -0.035970680415630, 2.548557043075562, 0.323681503534317, 1.538867950439453, 1.997532844543457, -0.704283535480499, 0.116950742900372, -0.906534552574158, 0.974053442478180}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.txt new file mode 100644 index 00000000..3c0038cb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e3.txt @@ -0,0 +1 @@ +1.620906114578247, -0.427226632833481, -2.672395467758179, -0.035970680415630, 2.548557043075562, 0.323681503534317, 1.538867950439453, 1.997532844543457, -0.704283535480499, 0.116950742900372, -0.906534552574158, 0.974053442478180 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.h new file mode 100644 index 00000000..5835c2a4 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.h @@ -0,0 +1,15 @@ +//Numpy array shape [4, 2] +//Min -1.666811108589 +//Max 1.295734167099 +//Number of zeros 0 + +#ifndef E4_H_ +#define E4_H_ + +#ifndef __SYNTHESIS__ +embedding1_embeddings_t e4[8]; +#else +embedding1_embeddings_t e4[8] = {1.295734167098999, 0.254000633955002, -1.661195635795593, 0.048672962933779, -0.138032227754593, -0.875923097133636, -1.666811108589172, 0.035932607948780}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.txt new file mode 100644 index 00000000..d6d8ec98 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/e4.txt @@ -0,0 +1 @@ +1.295734167098999, 0.254000633955002, -1.661195635795593, 0.048672962933779, -0.138032227754593, -0.875923097133636, -1.666811108589172, 0.035932607948780 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.h new file mode 100644 index 00000000..2716062b --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.h @@ -0,0 +1,15 @@ +//Numpy array shape [8, 12] +//Min -3.111060142517 +//Max 1.904985547066 +//Number of zeros 0 + +#ifndef W22_H_ +#define W22_H_ + +#ifndef __SYNTHESIS__ +dense_weight_t w22[96]; +#else +dense_weight_t w22[96] = {-0.005496513564140, -0.077705480158329, -0.291069507598877, 0.003703390946612, 0.009928826242685, 0.002178243128583, 0.007691420149058, -0.122642949223518, -0.004901545587927, 
0.184459403157234, 0.077915966510773, -0.002935178112239, -0.630976676940918, -0.129218742251396, 0.172030463814735, -0.613496303558350, -0.006485627032816, -0.007314948830754, -0.013219951651990, 0.035634342581034, 0.011636621318758, -0.013739237561822, -0.052910525351763, 0.007741326466203, -0.003901405725628, 0.006636092904955, 0.014814227819443, 0.002152013825253, -0.000235362793319, -0.003903909819201, 0.000954345799983, 0.004304980859160, -0.000281526794424, 0.014572271145880, -0.007630184758455, 0.001944163814187, -0.545304834842682, 0.224154502153397, 1.193614006042480, 0.672130286693573, 0.079618625342846, -0.729031324386597, 1.117634415626526, 0.088703706860542, 0.263513863086700, 0.946384370326996, 0.078555844724178, 0.085146762430668, -0.199110791087151, -0.093624226748943, 0.001692321849987, 0.204557403922081, 0.073481321334839, 0.260788798332214, -0.122535862028599, -0.085562728345394, 0.025333112105727, -0.131282433867455, -0.406875669956207, -0.066440477967262, -0.042630787938833, 0.427074193954468, 1.956082224845886, 0.046955518424511, 0.030683849006891, 0.232642397284508, -0.598365366458893, -0.853525161743164, -0.292229890823364, -2.031559944152832, 0.012307391501963, 0.127083599567413, 0.060593571513891, -0.268928855657578, -0.487386792898178, -0.127690494060516, -0.012389726005495, 0.656857013702393, 0.665676295757294, -0.315022528171539, -0.161770179867744, 0.515646219253540, -0.374600380659103, -0.031053755432367, 0.023489914834499, -0.527695715427399, -0.117961816489697, -0.055053103715181, -0.132891759276390, -0.345012873411179, -0.197673514485359, 0.346816360950470, 0.160986021161079, -0.146570160984993, -0.089796856045723, -0.088734544813633}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.txt new file mode 100644 index 00000000..e518ed71 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w22.txt @@ -0,0 +1 @@ +-0.005496513564140, -0.077705480158329, -0.291069507598877, 0.003703390946612, 0.009928826242685, 0.002178243128583, 0.007691420149058, -0.122642949223518, -0.004901545587927, 0.184459403157234, 0.077915966510773, -0.002935178112239, -0.630976676940918, -0.129218742251396, 0.172030463814735, -0.613496303558350, -0.006485627032816, -0.007314948830754, -0.013219951651990, 0.035634342581034, 0.011636621318758, -0.013739237561822, -0.052910525351763, 0.007741326466203, -0.003901405725628, 0.006636092904955, 0.014814227819443, 0.002152013825253, -0.000235362793319, -0.003903909819201, 0.000954345799983, 0.004304980859160, -0.000281526794424, 0.014572271145880, -0.007630184758455, 0.001944163814187, -0.545304834842682, 0.224154502153397, 1.193614006042480, 0.672130286693573, 0.079618625342846, -0.729031324386597, 1.117634415626526, 0.088703706860542, 0.263513863086700, 0.946384370326996, 0.078555844724178, 0.085146762430668, -0.199110791087151, -0.093624226748943, 0.001692321849987, 0.204557403922081, 0.073481321334839, 0.260788798332214, -0.122535862028599, -0.085562728345394, 0.025333112105727, -0.131282433867455, -0.406875669956207, -0.066440477967262, -0.042630787938833, 0.427074193954468, 1.956082224845886, 0.046955518424511, 0.030683849006891, 0.232642397284508, -0.598365366458893, -0.853525161743164, -0.292229890823364, -2.031559944152832, 0.012307391501963, 0.127083599567413, 0.060593571513891, -0.268928855657578, -0.487386792898178, 
-0.127690494060516, -0.012389726005495, 0.656857013702393, 0.665676295757294, -0.315022528171539, -0.161770179867744, 0.515646219253540, -0.374600380659103, -0.031053755432367, 0.023489914834499, -0.527695715427399, -0.117961816489697, -0.055053103715181, -0.132891759276390, -0.345012873411179, -0.197673514485359, 0.346816360950470, 0.160986021161079, -0.146570160984993, -0.089796856045723, -0.088734544813633 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.h new file mode 100644 index 00000000..ed36b365 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.h @@ -0,0 +1,15 @@ +//Numpy array shape [12, 36] +//Min -1.362776517868 +//Max 1.903477072716 +//Number of zeros 0 + +#ifndef W23_H_ +#define W23_H_ + +#ifndef __SYNTHESIS__ +dense_1_weight_t w23[432]; +#else +dense_1_weight_t w23[432] = {-34.272384643554688, -15.242276191711426, -12.153550148010254, -0.517302453517914, 16.459009170532227, -0.927374601364136, -16.420297622680664, 2.281730651855469, -10.332298278808594, 1.368979692459106, -14.852247238159180, 2.289415359497070, 4.450272083282471, 15.091560363769531, 13.234894752502441, 5.019698619842529, 0.659490466117859, -10.297682762145996, -6.819073200225830, 0.061284162104130, -17.845451354980469, -12.342288970947266, 9.018982887268066, -7.944165229797363, -9.916581153869629, -9.689590454101562, 1.593392252922058, -0.548580884933472, 2.834589481353760, 13.109604835510254, -13.948617935180664, 4.110248565673828, -2.018397331237793, -6.860967636108398, 2.082887887954712, -1.957600474357605, -0.321475505828857, 0.820872783660889, 2.677054405212402, -1.768133521080017, -1.177917003631592, -1.118692636489868, 3.211776494979858, -3.037288188934326, 5.288896083831787, -0.581319391727448, 0.694460690021515, 0.631229758262634, 3.280242681503296, 6.218200206756592, -2.129676103591919, 8.349854469299316, 1.471119880676270, 7.490054130554199, 0.753753662109375, -0.399599075317383, 2.416818141937256, 1.339800357818604, -0.718787252902985, 0.338143020868301, 1.145107865333557, 5.754922389984131, -4.704513549804688, -1.252747058868408, 0.465840101242065, -3.112505197525024, 4.487011432647705, -0.808801710605621, 7.409661293029785, -8.000079154968262, 0.356041222810745, -1.234661579132080, -13.651395797729492, 4.551627635955811, 3.547161102294922, -1.346346378326416, -6.481750965118408, 0.371593445539474, -0.909239649772644, 0.803896009922028, -0.864329278469086, -0.167551159858704, 1.271770358085632, 0.128098145127296, -0.319244086742401, -8.963575363159180, -4.575497150421143, -4.347470760345459, 0.099872648715973, 1.076389431953430, 1.537157297134399, -0.342850208282471, -3.088666439056396, 1.880550146102905, -2.499561071395874, 0.960815191268921, 1.989226579666138, 5.396582126617432, 4.611053466796875, 1.478802204132080, -0.381258249282837, -1.447740316390991, -0.485423654317856, 1.209582686424255, -6.765387535095215, 0.879579961299896, 3.126605033874512, -1.396452188491821, 35.301498413085938, 16.390518188476562, 10.991186141967773, 0.457286953926086, -16.055135726928711, 0.732447206974030, 12.833724975585938, -0.869582533836365, 5.935638427734375, 2.171858549118042, 15.994698524475098, -1.975315093994141, -0.577428340911865, -16.300628662109375, -12.036094665527344, -12.248717308044434, -0.296559274196625, 9.253703117370605, 7.236478328704834, 0.100461378693581, 
15.662371635437012, 13.149472236633301, -9.011061668395996, 9.156368255615234, 9.083997726440430, 8.143834114074707, 6.395058631896973, 0.768283843994141, -2.189213037490845, -12.856546401977539, 10.946484565734863, -3.122458934783936, 2.356916427612305, 10.203166007995605, 3.314955234527588, 2.006448984146118, -4.138628959655762, 9.784881591796875, -7.701581478118896, -2.161497592926025, 5.081796169281006, 0.722472250461578, -6.947623729705811, 0.428102672100067, -1.017104268074036, -5.616028785705566, 7.207549571990967, -3.425596952438354, -0.324499905109406, -1.508072257041931, -0.423026353120804, -6.807011127471924, -2.165873289108276, -6.257976055145264, -1.110751748085022, -0.680330693721771, -8.726241111755371, 6.876333713531494, 0.122669994831085, -6.020811080932617, -1.936614274978638, 7.679961681365967, 6.832388401031494, 2.089343547821045, -5.815147399902344, 1.034743905067444, 6.128062248229980, 3.326957702636719, -6.113448143005371, 0.656117796897888, -0.316450953483582, 0.792564570903778, 10.708021163940430, 10.246310234069824, -4.989016532897949, -2.966490268707275, 4.010641574859619, 1.000328898429871, -35.920978546142578, -1.863970279693604, 1.379239320755005, -0.364904999732971, 3.232958555221558, -0.646893203258514, 4.449232578277588, -6.601441383361816, 7.810013294219971, 0.764219939708710, -0.887412309646606, 4.851296424865723, -3.773882389068604, 0.953490376472473, -28.108135223388672, 7.164631843566895, 5.078193187713623, -4.744826793670654, -7.120871067047119, -7.749808311462402, 10.820018768310547, 0.171118795871735, -1.084927797317505, 1.892885923385620, -28.651664733886719, 10.953318595886230, -5.435957431793213, -21.623348236083984, 2.465915918350220, -8.539632797241211, -7.903433799743652, -3.474239349365234, 0.243321105837822, -0.380062937736511, 5.332633972167969, 2.151208877563477, 2.022930383682251, -1.463849902153015, -2.719141244888306, -1.985015749931335, 1.754704952239990, -3.623456001281738, 4.132822036743164, -2.998028755187988, 4.864254474639893, 7.019001007080078, -2.887226343154907, -2.157429456710815, -13.182174682617188, -0.038866952061653, -5.827670574188232, 5.151016235351562, 2.542974710464478, -12.306578636169434, -1.044925689697266, 11.257448196411133, -1.981187462806702, -1.172790408134460, -1.593691825866699, 5.988854408264160, 11.212390899658203, 6.184563636779785, 1.851197481155396, -7.376731395721436, 2.947922706604004, -3.116251468658447, 9.032855033874512, 11.189463615417480, -14.047230720520020, -1.882185339927673, 13.061312675476074, -3.894136667251587, -18.382831573486328, 5.108212947845459, 1.680236458778381, 5.855550289154053, 1.753978013992310, 8.817825317382812, 4.784208774566650, -9.456546783447266, 6.749723434448242, -3.826550960540771, 8.439210891723633, 2.381058931350708, -6.254682064056396, 0.979307055473328, -12.932164192199707, 6.614181041717529, 7.724326133728027, -8.186627388000488, -11.564584732055664, -5.705511093139648, 0.615724623203278, 2.294805049896240, 8.561786651611328, 10.862165451049805, -11.637836456298828, 8.550187110900879, -2.799665927886963, -4.847795963287354, 2.903936386108398, -6.381844997406006, 6.450922012329102, 26.763093948364258, 3.013844728469849, -0.924964666366577, -6.920816898345947, -2.560798168182373, -34.196998596191406, 1.623008966445923, 10.048088073730469, 0.985973894596100, 23.329315185546875, 1.722676992416382, -0.090961724519730, -5.953221797943115, 0.280752390623093, -14.521141052246094, 1.948345661163330, 6.979897975921631, 4.035674571990967, 1.044640779495239, 
-19.007211685180664, 21.142364501953125, -1.837882161140442, 2.050447940826416, 1.542031645774841, -7.565482616424561, 26.773376464843750, 0.171006053686142, 3.358534336090088, -8.234274864196777, -27.216566085815430, 7.212102413177490, -5.000186920166016, -20.917554855346680, 2.152885198593140, -3.181938886642456, 10.018072128295898, 6.692709445953369, 7.562778472900391, -0.397445559501648, -11.695134162902832, 1.699540257453918, 1.424039125442505, -1.628181338310242, 4.050493717193604, 0.047106776386499, -0.717159509658813, -3.311089277267456, -2.847960948944092, -9.831811904907227, -7.529915332794189, -2.769558668136597, -2.932808637619019, 6.914423465728760, 13.812906265258789, 0.466079294681549, -0.697627902030945, 0.283607631921768, -7.250504493713379, 13.122053146362305, 4.825413227081299, 2.828585863113403, 6.724539756774902, -0.596229493618011, -4.759947776794434, -9.946646690368652, -2.232836484909058, -3.401717901229858, -1.199927449226379, 3.097918748855591, 0.726092457771301, 1.352552175521851, -1.831664323806763, -6.564773082733154, 1.155098319053650, -2.088497400283813, -0.057716656476259, -0.293432414531708, -5.829917907714844, -2.137289047241211, 2.680857658386230, -3.795029640197754, 0.601609170436859, -2.534255266189575, 0.599966049194336, 9.936664581298828, -1.825383901596069, 2.551906108856201, -2.613932371139526, 5.252158164978027, -0.459596127271652, -1.080929756164551, -5.785776615142822, -4.251605510711670, 1.853045225143433, 1.728189826011658, -1.679710865020752, 3.655097484588623, -4.362958908081055, 1.981420755386353, -4.094293117523193, 0.941113770008087, -7.290192604064941, 2.577519655227661, 0.405787110328674, -5.861212253570557, -2.390504837036133, -4.859991073608398, 21.085351943969727, 2.001378059387207, -24.684366226196289, -4.457293987274170, 22.137004852294922, -1.187330603599548, -37.353851318359375, -1.755694746971130, 1.482097148895264, -1.574132204055786, 12.119773864746094, -4.834329605102539, 0.834708034992218, -21.962982177734375, 39.640460968017578, -9.078592300415039, -3.510553598403931, -16.044708251953125, -4.902245998382568, 0.224997147917747, -34.180931091308594, 7.509442806243896, 3.701504945755005, -11.197209358215332, -22.056798934936523, -27.012636184692383, 15.913613319396973, 4.364429473876953, -1.197503089904785, 8.571378707885742, -19.667821884155273, 22.258554458618164, -7.341328144073486, -17.936431884765625, -0.463554143905640, -5.726800918579102}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.txt new file mode 100644 index 00000000..d99bb2c7 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w23.txt @@ -0,0 +1 @@ +-34.272384643554688, -15.242276191711426, -12.153550148010254, -0.517302453517914, 16.459009170532227, -0.927374601364136, -16.420297622680664, 2.281730651855469, -10.332298278808594, 1.368979692459106, -14.852247238159180, 2.289415359497070, 4.450272083282471, 15.091560363769531, 13.234894752502441, 5.019698619842529, 0.659490466117859, -10.297682762145996, -6.819073200225830, 0.061284162104130, -17.845451354980469, -12.342288970947266, 9.018982887268066, -7.944165229797363, -9.916581153869629, -9.689590454101562, 1.593392252922058, -0.548580884933472, 2.834589481353760, 13.109604835510254, -13.948617935180664, 4.110248565673828, -2.018397331237793, -6.860967636108398, 2.082887887954712, 
-1.957600474357605, -0.321475505828857, 0.820872783660889, 2.677054405212402, -1.768133521080017, -1.177917003631592, -1.118692636489868, 3.211776494979858, -3.037288188934326, 5.288896083831787, -0.581319391727448, 0.694460690021515, 0.631229758262634, 3.280242681503296, 6.218200206756592, -2.129676103591919, 8.349854469299316, 1.471119880676270, 7.490054130554199, 0.753753662109375, -0.399599075317383, 2.416818141937256, 1.339800357818604, -0.718787252902985, 0.338143020868301, 1.145107865333557, 5.754922389984131, -4.704513549804688, -1.252747058868408, 0.465840101242065, -3.112505197525024, 4.487011432647705, -0.808801710605621, 7.409661293029785, -8.000079154968262, 0.356041222810745, -1.234661579132080, -13.651395797729492, 4.551627635955811, 3.547161102294922, -1.346346378326416, -6.481750965118408, 0.371593445539474, -0.909239649772644, 0.803896009922028, -0.864329278469086, -0.167551159858704, 1.271770358085632, 0.128098145127296, -0.319244086742401, -8.963575363159180, -4.575497150421143, -4.347470760345459, 0.099872648715973, 1.076389431953430, 1.537157297134399, -0.342850208282471, -3.088666439056396, 1.880550146102905, -2.499561071395874, 0.960815191268921, 1.989226579666138, 5.396582126617432, 4.611053466796875, 1.478802204132080, -0.381258249282837, -1.447740316390991, -0.485423654317856, 1.209582686424255, -6.765387535095215, 0.879579961299896, 3.126605033874512, -1.396452188491821, 35.301498413085938, 16.390518188476562, 10.991186141967773, 0.457286953926086, -16.055135726928711, 0.732447206974030, 12.833724975585938, -0.869582533836365, 5.935638427734375, 2.171858549118042, 15.994698524475098, -1.975315093994141, -0.577428340911865, -16.300628662109375, -12.036094665527344, -12.248717308044434, -0.296559274196625, 9.253703117370605, 7.236478328704834, 0.100461378693581, 15.662371635437012, 13.149472236633301, -9.011061668395996, 9.156368255615234, 9.083997726440430, 8.143834114074707, 6.395058631896973, 0.768283843994141, -2.189213037490845, -12.856546401977539, 10.946484565734863, -3.122458934783936, 2.356916427612305, 10.203166007995605, 3.314955234527588, 2.006448984146118, -4.138628959655762, 9.784881591796875, -7.701581478118896, -2.161497592926025, 5.081796169281006, 0.722472250461578, -6.947623729705811, 0.428102672100067, -1.017104268074036, -5.616028785705566, 7.207549571990967, -3.425596952438354, -0.324499905109406, -1.508072257041931, -0.423026353120804, -6.807011127471924, -2.165873289108276, -6.257976055145264, -1.110751748085022, -0.680330693721771, -8.726241111755371, 6.876333713531494, 0.122669994831085, -6.020811080932617, -1.936614274978638, 7.679961681365967, 6.832388401031494, 2.089343547821045, -5.815147399902344, 1.034743905067444, 6.128062248229980, 3.326957702636719, -6.113448143005371, 0.656117796897888, -0.316450953483582, 0.792564570903778, 10.708021163940430, 10.246310234069824, -4.989016532897949, -2.966490268707275, 4.010641574859619, 1.000328898429871, -35.920978546142578, -1.863970279693604, 1.379239320755005, -0.364904999732971, 3.232958555221558, -0.646893203258514, 4.449232578277588, -6.601441383361816, 7.810013294219971, 0.764219939708710, -0.887412309646606, 4.851296424865723, -3.773882389068604, 0.953490376472473, -28.108135223388672, 7.164631843566895, 5.078193187713623, -4.744826793670654, -7.120871067047119, -7.749808311462402, 10.820018768310547, 0.171118795871735, -1.084927797317505, 1.892885923385620, -28.651664733886719, 10.953318595886230, -5.435957431793213, -21.623348236083984, 2.465915918350220, -8.539632797241211, 
-7.903433799743652, -3.474239349365234, 0.243321105837822, -0.380062937736511, 5.332633972167969, 2.151208877563477, 2.022930383682251, -1.463849902153015, -2.719141244888306, -1.985015749931335, 1.754704952239990, -3.623456001281738, 4.132822036743164, -2.998028755187988, 4.864254474639893, 7.019001007080078, -2.887226343154907, -2.157429456710815, -13.182174682617188, -0.038866952061653, -5.827670574188232, 5.151016235351562, 2.542974710464478, -12.306578636169434, -1.044925689697266, 11.257448196411133, -1.981187462806702, -1.172790408134460, -1.593691825866699, 5.988854408264160, 11.212390899658203, 6.184563636779785, 1.851197481155396, -7.376731395721436, 2.947922706604004, -3.116251468658447, 9.032855033874512, 11.189463615417480, -14.047230720520020, -1.882185339927673, 13.061312675476074, -3.894136667251587, -18.382831573486328, 5.108212947845459, 1.680236458778381, 5.855550289154053, 1.753978013992310, 8.817825317382812, 4.784208774566650, -9.456546783447266, 6.749723434448242, -3.826550960540771, 8.439210891723633, 2.381058931350708, -6.254682064056396, 0.979307055473328, -12.932164192199707, 6.614181041717529, 7.724326133728027, -8.186627388000488, -11.564584732055664, -5.705511093139648, 0.615724623203278, 2.294805049896240, 8.561786651611328, 10.862165451049805, -11.637836456298828, 8.550187110900879, -2.799665927886963, -4.847795963287354, 2.903936386108398, -6.381844997406006, 6.450922012329102, 26.763093948364258, 3.013844728469849, -0.924964666366577, -6.920816898345947, -2.560798168182373, -34.196998596191406, 1.623008966445923, 10.048088073730469, 0.985973894596100, 23.329315185546875, 1.722676992416382, -0.090961724519730, -5.953221797943115, 0.280752390623093, -14.521141052246094, 1.948345661163330, 6.979897975921631, 4.035674571990967, 1.044640779495239, -19.007211685180664, 21.142364501953125, -1.837882161140442, 2.050447940826416, 1.542031645774841, -7.565482616424561, 26.773376464843750, 0.171006053686142, 3.358534336090088, -8.234274864196777, -27.216566085815430, 7.212102413177490, -5.000186920166016, -20.917554855346680, 2.152885198593140, -3.181938886642456, 10.018072128295898, 6.692709445953369, 7.562778472900391, -0.397445559501648, -11.695134162902832, 1.699540257453918, 1.424039125442505, -1.628181338310242, 4.050493717193604, 0.047106776386499, -0.717159509658813, -3.311089277267456, -2.847960948944092, -9.831811904907227, -7.529915332794189, -2.769558668136597, -2.932808637619019, 6.914423465728760, 13.812906265258789, 0.466079294681549, -0.697627902030945, 0.283607631921768, -7.250504493713379, 13.122053146362305, 4.825413227081299, 2.828585863113403, 6.724539756774902, -0.596229493618011, -4.759947776794434, -9.946646690368652, -2.232836484909058, -3.401717901229858, -1.199927449226379, 3.097918748855591, 0.726092457771301, 1.352552175521851, -1.831664323806763, -6.564773082733154, 1.155098319053650, -2.088497400283813, -0.057716656476259, -0.293432414531708, -5.829917907714844, -2.137289047241211, 2.680857658386230, -3.795029640197754, 0.601609170436859, -2.534255266189575, 0.599966049194336, 9.936664581298828, -1.825383901596069, 2.551906108856201, -2.613932371139526, 5.252158164978027, -0.459596127271652, -1.080929756164551, -5.785776615142822, -4.251605510711670, 1.853045225143433, 1.728189826011658, -1.679710865020752, 3.655097484588623, -4.362958908081055, 1.981420755386353, -4.094293117523193, 0.941113770008087, -7.290192604064941, 2.577519655227661, 0.405787110328674, -5.861212253570557, -2.390504837036133, -4.859991073608398, 
21.085351943969727, 2.001378059387207, -24.684366226196289, -4.457293987274170, 22.137004852294922, -1.187330603599548, -37.353851318359375, -1.755694746971130, 1.482097148895264, -1.574132204055786, 12.119773864746094, -4.834329605102539, 0.834708034992218, -21.962982177734375, 39.640460968017578, -9.078592300415039, -3.510553598403931, -16.044708251953125, -4.902245998382568, 0.224997147917747, -34.180931091308594, 7.509442806243896, 3.701504945755005, -11.197209358215332, -22.056798934936523, -27.012636184692383, 15.913613319396973, 4.364429473876953, -1.197503089904785, 8.571378707885742, -19.667821884155273, 22.258554458618164, -7.341328144073486, -17.936431884765625, -0.463554143905640, -5.726800918579102 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.h b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.h new file mode 100644 index 00000000..421ca42b --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.h @@ -0,0 +1,15 @@ +//Numpy array shape [36, 1] +//Min -16.967756271362 +//Max 12.259524345398 +//Number of zeros 0 + +#ifndef W24_H_ +#define W24_H_ + +#ifndef __SYNTHESIS__ +met_weight_weight_t w24[36]; +#else +met_weight_weight_t w24[36] = {-16.967756271362305, -3.760226726531982, 3.262881755828857, -9.485597610473633, -3.357334852218628, -15.149440765380859, 3.543870449066162, -2.800054788589478, 4.344166755676270, -2.786701679229736, 6.405607700347900, -3.039294004440308, -2.860914230346680, 2.979121685028076, -3.144270658493042, -3.578038454055786, -2.965110778808594, 3.106849431991577, 3.355989456176758, -2.746005535125732, 3.465666294097900, -3.180762529373169, -2.911018371582031, 3.576281547546387, 3.597542285919189, 3.606025695800781, -3.061075925827026, 12.259524345397949, -3.002163410186768, -3.301740884780884, 3.924034357070923, -3.431127548217773, 2.811718702316284, 2.879392385482788, -2.894979476928711, 3.125239610671997}; +#endif + +#endif diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.txt b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.txt new file mode 100644 index 00000000..5fab1cdb --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/firmware/weights/w24.txt @@ -0,0 +1 @@ +-16.967756271362305, -3.760226726531982, 3.262881755828857, -9.485597610473633, -3.357334852218628, -15.149440765380859, 3.543870449066162, -2.800054788589478, 4.344166755676270, -2.786701679229736, 6.405607700347900, -3.039294004440308, -2.860914230346680, 2.979121685028076, -3.144270658493042, -3.578038454055786, -2.965110778808594, 3.106849431991577, 3.355989456176758, -2.746005535125732, 3.465666294097900, -3.180762529373169, -2.911018371582031, 3.576281547546387, 3.597542285919189, 3.606025695800781, -3.061075925827026, 12.259524345397949, -3.002163410186768, -3.301740884780884, 3.924034357070923, -3.431127548217773, 2.811718702316284, 2.879392385482788, -2.894979476928711, 3.125239610671997 \ No newline at end of file diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/hls4ml_config.yml b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/hls4ml_config.yml new file mode 100644 index 00000000..466ce955 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/hls4ml_config.yml @@ -0,0 +1,119 @@ +Backend: Vivado 
+ClockPeriod: 5 +HLSConfig: + LayerName: + activation: + Precision: + result: ap_fixed<32,16> + Trace: true + activation_1: + Precision: + result: ap_fixed<32,16> + Trace: true + batch_normalization: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + scale: ap_fixed<32,16> + Trace: true + batch_normalization_1: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + scale: ap_fixed<32,16> + Trace: true + concatenate: + Precision: + result: ap_fixed<32,16> + Trace: true + concatenate_1: + Precision: + result: ap_fixed<32,16> + Trace: true + dense: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + weight: ap_fixed<32,16> + Trace: true + dense_1: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + weight: ap_fixed<32,16> + Trace: true + dense_1_linear: + Precision: + result: ap_fixed<32,16> + Trace: true + dense_linear: + Precision: + result: ap_fixed<32,16> + Trace: true + embedding0: + Precision: + embeddings: ap_fixed<32,16> + result: ap_fixed<32,16> + Trace: true + embedding1: + Precision: + embeddings: ap_fixed<32,16> + result: ap_fixed<32,16> + Trace: true + input_cat0: + Precision: + result: ap_uint<4> + Trace: true + input_cat1: + Precision: + result: ap_uint<4> + Trace: true + input_cont: + Precision: + result: ap_fixed<32,16> + Trace: true + input_pxpy: + Precision: + result: ap_fixed<32,16> + Trace: true + met_weight: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + weight: ap_fixed<32,16> + Trace: true + met_weight_linear: + Precision: + result: ap_fixed<32,16> + Trace: true + met_weight_minus_one: + Precision: + bias: ap_fixed<32,16> + result: ap_fixed<32,16> + scale: ap_fixed<32,16> + Trace: true + multiply: + Precision: + result: ap_fixed<32,16> + Trace: true + n_elem: 100 + output: + Precision: + result: ap_fixed<32,16> + Trace: true + n_filt: 2 + Model: + BramFactor: 1000000000 + Precision: ap_fixed<32,16> + ReuseFactor: 1 + Strategy: Latency + TraceOutput: false +IOType: io_parallel +InputData: null +KerasModel: !keras_model 'hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5' +OutputDir: hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16> +OutputPredictions: null +Part: xcvu13p-flga2577-2-e +ProjectName: L1METML_v1 +Stamp: 95715E3e +Version: 1.0.0 diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5 b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5 new file mode 100644 index 00000000..13a4d599 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/keras_model.h5 differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model.h5 b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model.h5 new file mode 100644 index 00000000..13a4d599 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model.h5 differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model_hls4ml.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model_hls4ml.png new file mode 100644 index 00000000..f04b84a5 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/model_hls4ml.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET.png new file mode 100644 index 
00000000..2545be2b Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_x.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_x.png new file mode 100644 index 00000000..633ee2cc Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_x.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_y.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_y.png new file mode 100644 index 00000000..beacc4dd Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_MET_y.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation.png new file mode 100644 index 00000000..7886541a Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation_1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation_1.png new file mode 100644 index 00000000..5d5a9d33 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_activation_1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate.png new file mode 100644 index 00000000..0cf674a0 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate_1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate_1.png new file mode 100644 index 00000000..c22563c7 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_concatenate_1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense.png new file mode 100644 index 00000000..cda2f599 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense_1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense_1.png new file mode 100644 index 00000000..c10e1cad Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_dense_1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding0.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding0.png new file mode 100644 index 00000000..f0a1ab32 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding0.png differ diff --git 
a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding1.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding1.png new file mode 100644 index 00000000..e54c2a54 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_embedding1.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_met_weight.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_met_weight.png new file mode 100644 index 00000000..cf1781d7 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_met_weight.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_multiply.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_multiply.png new file mode 100644 index 00000000..dc0aed93 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_multiply.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_output.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_output.png new file mode 100644 index 00000000..776dda69 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/profiling_output.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/project.tcl b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/project.tcl new file mode 100644 index 00000000..d5cf7610 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/project.tcl @@ -0,0 +1,12 @@ +variable project_name +set project_name "L1METML_v1" +variable backend +set backend "vivado" +variable part +set part "xcvu13p-flga2577-2-e" +variable clock_period +set clock_period 5 +variable clock_uncertainty +set clock_uncertainty 12.5% +variable version +set version "1.0.0" diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/response_MET.png b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/response_MET.png new file mode 100644 index 00000000..c6699875 Binary files /dev/null and b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/response_MET.png differ diff --git a/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/vivado_synth.tcl b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/vivado_synth.tcl new file mode 100644 index 00000000..4634b166 --- /dev/null +++ b/hls_output_trained_DeepMET_io_parallel_Latency_rf1_ap_fixed<32,16>/vivado_synth.tcl @@ -0,0 +1,6 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +add_files ${project_name}_prj/solution1/syn/vhdl +synth_design -top ${project_name} -part $part +report_utilization -file vivado_synth.rpt diff --git a/l1metml-job2.yml b/l1metml-job2.yml new file mode 100644 index 00000000..99616905 --- /dev/null +++ b/l1metml-job2.yml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: l1metml +spec: + template: + spec: + containers: + - name: gpu-container + image: gitlab-registry.nrp-nautilus.io/jmduarte/l1metml:latest + command: + - "/bin/bash" + - "-c" + - " git clone https://github.com/ucsd-hep-ex/L1METML.git -b gnn && + cd L1METML && + python train.py --workflowType dataGenerator --input 
/home/users/dprimosc/data/l1_trigger_ntuples/TTbar --mode 1 --epochs 500 --maxNPF 100 --batch-size 256 --units 12 36 --output models/quantized-dense-embedding/ --quantized 8 2 --model dense_embedding --compute-edge-feat 0 --model-output models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test --normFac 1" + volumeMounts: + - mountPath: /l1metmlvol + name: l1metmlvol + resources: + limits: + memory: 32Gi + cpu: "2" + nvidia.com/gpu: "1" + requests: + memory: 16Gi + cpu: "1" + nvidia.com/gpu: "1" + volumes: + - name: l1metmlvol + persistentVolumeClaim: + claimName: l1metmlvol + + restartPolicy: Never + backoffLimit: 0 diff --git a/loss.py b/loss.py index 581a96ae..b32670fe 100644 --- a/loss.py +++ b/loss.py @@ -4,7 +4,6 @@ def custom_loss_wrapper(normFac=1): by balancing the response above one and below one ''' - def custom_loss(y_true, y_pred): import tensorflow.keras.backend as K import tensorflow as tf @@ -16,8 +15,8 @@ def custom_loss(y_true, y_pred): pt_truth = K.sqrt(px_truth*px_truth + py_truth*py_truth) - #px_truth1 = px_truth / pt_truth - #py_truth1 = py_truth / pt_truth + # px_truth1 = px_truth / pt_truth + # py_truth1 = py_truth / pt_truth # using absolute response # upar_pred = (px_truth1 * px_pred + py_truth1 * py_pred)/pt_truth @@ -26,7 +25,7 @@ def custom_loss(y_true, y_pred): upar_pred = tf.boolean_mask(upar_pred, pt_cut) pt_truth_filtered = tf.boolean_mask(pt_truth, pt_cut) - #filter_bin0 = pt_truth_filtered < 50./normFac + # filter_bin0 = pt_truth_filtered < 50./normFac filter_bin0 = tf.logical_and(pt_truth_filtered > 50./normFac, pt_truth_filtered < 100./normFac) filter_bin1 = tf.logical_and(pt_truth_filtered > 100./normFac, pt_truth_filtered < 200./normFac) filter_bin2 = tf.logical_and(pt_truth_filtered > 200./normFac, pt_truth_filtered < 300./normFac) @@ -43,21 +42,21 @@ def custom_loss(y_true, y_pred): upar_pred_neg_bin3 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin3, upar_pred < 0.)) upar_pred_pos_bin4 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin4, upar_pred > 0.)) upar_pred_neg_bin4 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin4, upar_pred < 0.)) - #upar_pred_pos_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred > 0.)) - #upar_pred_neg_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred < 0.)) + # upar_pred_pos_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred > 0.)) + # upar_pred_neg_bin5 = tf.boolean_mask(upar_pred, tf.logical_and(filter_bin5, upar_pred < 0.)) norm = tf.reduce_sum(pt_truth_filtered) dev = tf.abs(tf.reduce_sum(upar_pred_pos_bin0) + tf.reduce_sum(upar_pred_neg_bin0)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin1) + tf.reduce_sum(upar_pred_neg_bin1)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin2) + tf.reduce_sum(upar_pred_neg_bin2)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin3) + tf.reduce_sum(upar_pred_neg_bin3)) dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin4) + tf.reduce_sum(upar_pred_neg_bin4)) - #dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin5) + tf.reduce_sum(upar_pred_neg_bin5)) + # dev += tf.abs(tf.reduce_sum(upar_pred_pos_bin5) + tf.reduce_sum(upar_pred_neg_bin5)) dev /= norm loss = 0.5*normFac**2*K.mean((px_pred - px_truth)**2 + (py_pred - py_truth)**2) - #loss += 200.*dev + # loss += 200.*dev loss += 5000.*dev return loss - + return custom_loss diff --git a/micromamba_setup.sh b/micromamba_setup.sh new file mode 100644 index 00000000..d9ef8f2e --- /dev/null +++ b/micromamba_setup.sh @@ -0,0 +1,2 @@ +micromamba create --file 
environment.yml --name l1metml +micromamba activate l1metml diff --git a/models/quantized-dense-embedding/MET_pt.png b/models/quantized-dense-embedding/MET_pt.png new file mode 100644 index 00000000..735e5ff1 Binary files /dev/null and b/models/quantized-dense-embedding/MET_pt.png differ diff --git a/models/quantized-dense-embedding/MET_response.png b/models/quantized-dense-embedding/MET_response.png new file mode 100644 index 00000000..ec94cb55 Binary files /dev/null and b/models/quantized-dense-embedding/MET_response.png differ diff --git a/models/quantized-dense-embedding/MET_x.png b/models/quantized-dense-embedding/MET_x.png new file mode 100644 index 00000000..5f321c1c Binary files /dev/null and b/models/quantized-dense-embedding/MET_x.png differ diff --git a/models/quantized-dense-embedding/MET_y.png b/models/quantized-dense-embedding/MET_y.png new file mode 100644 index 00000000..c27b3845 Binary files /dev/null and b/models/quantized-dense-embedding/MET_y.png differ diff --git a/models/quantized-dense-embedding/Phi_abs_err.png b/models/quantized-dense-embedding/Phi_abs_err.png new file mode 100644 index 00000000..d0adbe09 Binary files /dev/null and b/models/quantized-dense-embedding/Phi_abs_err.png differ diff --git a/models/quantized-dense-embedding/PrVSGen.png b/models/quantized-dense-embedding/PrVSGen.png new file mode 100644 index 00000000..53599066 Binary files /dev/null and b/models/quantized-dense-embedding/PrVSGen.png differ diff --git a/models/quantized-dense-embedding/Pt_abs_error.png b/models/quantized-dense-embedding/Pt_abs_error.png new file mode 100644 index 00000000..3646eecd Binary files /dev/null and b/models/quantized-dense-embedding/Pt_abs_error.png differ diff --git a/models/quantized-dense-embedding/TTbar_feature_array_MLMET.npy b/models/quantized-dense-embedding/TTbar_feature_array_MLMET.npy new file mode 100644 index 00000000..ad976743 Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_feature_array_MLMET.npy differ diff --git a/models/quantized-dense-embedding/TTbar_feature_array_PUMET.npy b/models/quantized-dense-embedding/TTbar_feature_array_PUMET.npy new file mode 100644 index 00000000..bb232dd5 Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_feature_array_PUMET.npy differ diff --git a/models/quantized-dense-embedding/TTbar_target_array_MLMET.npy b/models/quantized-dense-embedding/TTbar_target_array_MLMET.npy new file mode 100644 index 00000000..fd349a9b Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_target_array_MLMET.npy differ diff --git a/models/quantized-dense-embedding/TTbar_target_array_PUMET.npy b/models/quantized-dense-embedding/TTbar_target_array_PUMET.npy new file mode 100644 index 00000000..fd349a9b Binary files /dev/null and b/models/quantized-dense-embedding/TTbar_target_array_PUMET.npy differ diff --git a/models/quantized-dense-embedding/XY_resolution_plots.png b/models/quantized-dense-embedding/XY_resolution_plots.png new file mode 100644 index 00000000..ccd6a61e Binary files /dev/null and b/models/quantized-dense-embedding/XY_resolution_plots.png differ diff --git a/models/quantized-dense-embedding/loss_history.log b/models/quantized-dense-embedding/loss_history.log new file mode 100644 index 00000000..b1d96145 --- /dev/null +++ b/models/quantized-dense-embedding/loss_history.log @@ -0,0 +1,97 @@ +epoch,loss,lr,mean_absolute_error,mean_squared_error,val_loss,val_mean_absolute_error,val_mean_squared_error 
+0,5365.49755859375,0.0003,36.625160217285156,2374.283447265625,2642.266845703125,34.32746124267578,2102.33837890625 +1,2478.82275390625,0.0003,32.99213790893555,1934.72412109375,2403.994873046875,31.916332244873047,1810.71142578125 +2,2352.487060546875,0.0003,32.17837142944336,1842.6265869140625,2313.86962890625,32.05146408081055,1838.9298095703125 +3,2296.711181640625,0.0003,31.804044723510742,1801.236083984375,2271.6572265625,31.789581298828125,1806.3812255859375 +4,2270.8251953125,0.0003,31.612276077270508,1779.804931640625,2257.845947265625,31.16737174987793,1735.5069580078125 +5,2258.5390625,0.0003,31.538549423217773,1771.061279296875,2253.50732421875,31.11001205444336,1727.9681396484375 +6,2249.881103515625,0.0003,31.480377197265625,1765.043701171875,2246.50927734375,31.05948829650879,1725.2110595703125 +7,2245.15283203125,0.0003,31.43949317932129,1761.727294921875,2232.66845703125,31.450876235961914,1769.3106689453125 +8,2243.132080078125,0.0003,31.429136276245117,1762.26416015625,2232.498046875,31.162405014038086,1736.4827880859375 +9,2245.653076171875,0.0003,31.421859741210938,1761.437744140625,2226.985595703125,31.18681526184082,1740.423583984375 +10,2245.4873046875,0.0003,31.400266647338867,1760.3131103515625,2225.4541015625,31.112810134887695,1733.8387451171875 +11,2241.443115234375,0.0003,31.413358688354492,1761.822021484375,2239.147705078125,30.872859954833984,1705.24267578125 +12,2238.0751953125,0.0003,31.35942840576172,1756.65625,2220.971435546875,31.09974479675293,1733.426513671875 +13,2233.3056640625,0.0003,31.350557327270508,1755.82958984375,2251.1357421875,30.87445640563965,1705.130126953125 +14,2231.28759765625,0.0003,31.316877365112305,1752.25634765625,2260.2607421875,30.62598419189453,1676.5386962890625 +15,2228.76171875,0.0003,31.30657196044922,1751.412841796875,2219.97412109375,31.45556640625,1772.76025390625 +16,2227.719482421875,0.0003,31.29245376586914,1750.1439208984375,2221.25048828125,31.31220054626465,1755.291259765625 +17,2225.812255859375,0.0003,31.29678726196289,1751.6265869140625,2214.671630859375,31.47364616394043,1776.827392578125 +18,2223.05078125,0.0003,31.272422790527344,1749.5374755859375,2209.434814453125,30.99241828918457,1719.739990234375 +19,2218.818115234375,0.0003,31.24805450439453,1746.823974609375,2209.388671875,31.06031036376953,1731.7838134765625 +20,2218.491943359375,0.0003,31.245458602905273,1747.4522705078125,2269.787109375,30.592639923095703,1676.3988037109375 +21,2216.6357421875,0.0003,31.25051498413086,1748.0118408203125,2218.935546875,30.852811813354492,1710.2021484375 +22,2215.057861328125,0.0003,31.226913452148438,1746.06103515625,2243.838623046875,31.873579025268555,1820.3507080078125 +23,2216.621337890625,0.0003,31.237436294555664,1747.4488525390625,2210.60400390625,31.3365478515625,1761.2696533203125 +24,2214.016845703125,0.0003,31.231857299804688,1746.2939453125,2228.604736328125,30.702482223510742,1688.9754638671875 +25,2211.27490234375,0.0003,31.219966888427734,1744.6165771484375,2205.664306640625,31.038711547851562,1723.0206298828125 +26,2207.486328125,0.0003,31.195711135864258,1742.8887939453125,2194.477294921875,30.95041275024414,1719.947265625 +27,2210.222412109375,0.0003,31.204673767089844,1743.8411865234375,2200.50927734375,31.29070281982422,1757.7618408203125 +28,2211.854248046875,0.0003,31.204816818237305,1743.28076171875,2227.3544921875,30.58995819091797,1679.155029296875 +29,2213.34765625,0.0003,31.25057601928711,1747.6446533203125,2206.81494140625,30.798568725585938,1700.878173828125 
+30,2212.620849609375,0.0003,31.226085662841797,1743.9351806640625,2213.621826171875,30.753026962280273,1697.3277587890625 +31,2210.94921875,0.0003,31.220905303955078,1744.9833984375,2286.822998046875,30.77813148498535,1682.1134033203125 +32,2209.916748046875,0.0003,31.20001792907715,1742.58544921875,2209.856201171875,30.90184783935547,1712.124267578125 +33,2212.094970703125,0.0003,31.221921920776367,1744.3253173828125,2207.29931640625,30.911880493164062,1712.8521728515625 +34,2210.8544921875,0.0003,31.212121963500977,1742.1922607421875,2209.007568359375,30.852182388305664,1704.3905029296875 +35,2208.66650390625,0.0003,31.210289001464844,1743.2523193359375,2215.027587890625,30.700241088867188,1692.73486328125 +36,2209.481201171875,0.0003,31.19826316833496,1743.426513671875,2215.21923828125,31.48198890686035,1778.38330078125 +37,2210.159423828125,0.0003,31.214448928833008,1744.8856201171875,2220.15966796875,30.610177993774414,1681.4605712890625 +38,2210.488525390625,0.0003,31.215967178344727,1744.15771484375,2313.10498046875,32.419185638427734,1888.31103515625 +39,2210.69482421875,0.0003,31.20148468017578,1741.19287109375,2201.347412109375,31.019954681396484,1725.19677734375 +40,2207.256103515625,0.0003,31.194265365600586,1741.663818359375,2195.50732421875,31.203859329223633,1751.430419921875 +41,2204.386474609375,0.0003,31.183460235595703,1743.04345703125,2198.1875,30.85906410217285,1708.7384033203125 +42,2202.40478515625,0.0003,31.156558990478516,1739.5374755859375,2208.386474609375,30.73008918762207,1691.7432861328125 +43,2199.3857421875,0.0003,31.13861083984375,1737.30615234375,2193.83056640625,30.826841354370117,1705.76806640625 +44,2199.031494140625,0.0003,31.13396644592285,1736.3363037109375,2196.921142578125,30.907873153686523,1710.51025390625 +45,2201.519287109375,0.0003,31.154233932495117,1739.555419921875,2194.886474609375,31.024913787841797,1729.93701171875 +46,2201.720458984375,0.0003,31.153945922851562,1738.5020751953125,2200.976806640625,30.806833267211914,1699.372314453125 +47,2201.043212890625,0.0003,31.148744583129883,1737.4951171875,2198.75927734375,30.850374221801758,1708.83447265625 +48,2204.1826171875,0.0003,31.15572738647461,1737.10693359375,2208.6689453125,30.689353942871094,1687.5474853515625 +49,2203.097900390625,0.0003,31.155651092529297,1736.9141845703125,2193.733154296875,30.915498733520508,1712.6759033203125 +50,2200.683837890625,0.0003,31.16202735900879,1737.0792236328125,2192.33544921875,31.115854263305664,1736.875732421875 +51,2201.66357421875,0.0003,31.151227951049805,1735.9931640625,2207.9580078125,30.661632537841797,1682.8206787109375 +52,2203.215576171875,0.0003,31.143098831176758,1735.8565673828125,2196.600830078125,30.820405960083008,1703.871826171875 +53,2201.114013671875,0.0003,31.127216339111328,1736.264404296875,2195.122802734375,31.004093170166016,1726.9163818359375 +54,2199.779296875,0.0003,31.147878646850586,1738.2113037109375,2194.158203125,31.19964599609375,1750.343017578125 +55,2201.775146484375,0.0003,31.15053367614746,1738.500244140625,2190.744140625,30.997722625732422,1726.93310546875 +56,2204.49267578125,0.0003,31.181379318237305,1741.8153076171875,2204.71142578125,30.76068878173828,1698.86474609375 +57,2202.5205078125,0.0003,31.1776065826416,1740.7110595703125,2207.032470703125,30.753690719604492,1696.7855224609375 +58,2203.484130859375,0.0003,31.152755737304688,1737.710205078125,2196.21435546875,30.987199783325195,1721.74169921875 
+59,2204.532958984375,0.0003,31.17318344116211,1739.4593505859375,2194.90087890625,30.944604873657227,1718.6607666015625 +60,2205.913818359375,0.0003,31.170658111572266,1738.680419921875,2200.27880859375,30.946495056152344,1720.03759765625 +61,2205.884033203125,0.0003,31.18505859375,1740.677001953125,2195.715576171875,31.04903793334961,1728.323486328125 +62,2207.506591796875,0.0003,31.178726196289062,1739.660400390625,2202.005126953125,30.92181396484375,1710.3450927734375 +63,2205.39697265625,0.0003,31.189022064208984,1739.7410888671875,2346.319091796875,30.911865234375,1687.64453125 +64,2201.806884765625,0.0003,31.17337417602539,1738.7607421875,2193.619384765625,31.078243255615234,1731.8031005859375 +65,2204.08642578125,0.0003,31.168397903442383,1739.8743896484375,2200.516845703125,30.913759231567383,1714.22412109375 +66,2203.070068359375,0.0003,31.17813491821289,1739.9871826171875,2212.877685546875,30.637502670288086,1680.600830078125 +67,2203.95458984375,0.0003,31.193883895874023,1742.715576171875,2200.10888671875,30.81962776184082,1707.2403564453125 +68,2207.74951171875,0.0003,31.195785522460938,1743.4793701171875,2211.089111328125,30.718929290771484,1690.9881591796875 +69,2206.971435546875,0.0003,31.193485260009766,1743.16796875,2215.083251953125,31.532989501953125,1789.220703125 +70,2206.98388671875,0.0003,31.192052841186523,1742.7562255859375,2197.8544921875,31.161483764648438,1738.7413330078125 +71,2209.69873046875,0.0003,31.207612991333008,1744.2576904296875,2203.982421875,30.88957405090332,1713.370361328125 +72,2210.47314453125,0.0003,31.24172019958496,1748.81640625,2205.13037109375,31.064043045043945,1731.40966796875 +73,2206.58642578125,0.0003,31.204240798950195,1743.779541015625,2203.295654296875,31.299882888793945,1757.2547607421875 +74,2210.520263671875,0.0003,31.234872817993164,1748.2420654296875,2194.092041015625,31.166475296020508,1744.5223388671875 +75,2208.08203125,0.0003,31.211593627929688,1744.8179931640625,2201.28857421875,31.07516860961914,1735.3389892578125 +76,2209.787841796875,0.0003,31.226259231567383,1746.712646484375,2265.71875,32.07892990112305,1854.2498779296875 +77,2208.861328125,0.0003,31.226646423339844,1746.5357666015625,2217.926513671875,30.675195693969727,1688.148193359375 +78,2207.03125,0.0003,31.2060604095459,1744.0958251953125,2195.196044921875,31.022859573364258,1725.2369384765625 +79,2207.662841796875,0.0003,31.2130069732666,1745.017333984375,2199.871826171875,30.941226959228516,1715.8800048828125 +80,2208.997802734375,0.0003,31.231077194213867,1747.3485107421875,2204.09130859375,30.840715408325195,1705.426513671875 +81,2209.9228515625,0.0003,31.214202880859375,1745.3836669921875,2196.96923828125,31.148229598999023,1743.1463623046875 +82,2209.47314453125,0.0003,31.219953536987305,1746.4140625,2200.6884765625,31.20162582397461,1746.3389892578125 +83,2210.9306640625,0.0003,31.211957931518555,1745.69970703125,2207.14599609375,30.89093589782715,1716.6256103515625 +84,2211.049560546875,0.0003,31.225465774536133,1747.16357421875,2202.51220703125,30.883039474487305,1713.4200439453125 +85,2212.10107421875,0.0003,31.219194412231445,1747.7564697265625,2203.7255859375,31.25891876220703,1749.3304443359375 +86,2212.598876953125,0.0003,31.23923110961914,1749.4521484375,2238.5771484375,31.83201026916504,1819.1221923828125 +87,2210.39501953125,0.0003,31.237064361572266,1748.4912109375,2207.60107421875,30.849815368652344,1704.5650634765625 +88,2211.44775390625,0.0003,31.229516983032227,1748.939208984375,2210.279541015625,31.19259262084961,1746.7022705078125 
+89,2210.99072265625,0.0003,31.244524002075195,1750.119384765625,2227.881591796875,30.589513778686523,1679.3865966796875 +90,2211.454833984375,0.0003,31.226518630981445,1749.206787109375,2197.9755859375,31.094615936279297,1737.0526123046875 +91,2209.25732421875,0.0003,31.223045349121094,1748.1630859375,2199.78173828125,30.957643508911133,1720.9564208984375 +92,2209.30908203125,0.0003,31.23832130432129,1749.5999755859375,2210.670654296875,30.77962303161621,1699.195068359375 +93,2209.58154296875,0.0003,31.230850219726562,1749.1483154296875,2232.6435546875,31.708250045776367,1807.9969482421875 +94,2211.325927734375,0.0003,31.207908630371094,1746.332763671875,2208.9716796875,30.927019119262695,1717.478759765625 +95,2211.26318359375,0.0003,31.248823165893555,1751.1241455078125,2198.548828125,31.21225929260254,1749.599365234375 diff --git a/models/quantized-dense-embedding/model.h5 b/models/quantized-dense-embedding/model.h5 new file mode 100644 index 00000000..f3337314 Binary files /dev/null and b/models/quantized-dense-embedding/model.h5 differ diff --git a/models/quantized-dense-embedding/pt_resolution_plots.png b/models/quantized-dense-embedding/pt_resolution_plots.png new file mode 100644 index 00000000..03c1655c Binary files /dev/null and b/models/quantized-dense-embedding/pt_resolution_plots.png differ diff --git a/models/quantized-dense-embedding/rel_error_opaque.png b/models/quantized-dense-embedding/rel_error_opaque.png new file mode 100644 index 00000000..cedd7ca8 Binary files /dev/null and b/models/quantized-dense-embedding/rel_error_opaque.png differ diff --git a/models/quantized-dense-embedding/time.txt b/models/quantized-dense-embedding/time.txt new file mode 100644 index 00000000..ec9f99a9 --- /dev/null +++ b/models/quantized-dense-embedding/time.txt @@ -0,0 +1 @@ +Working Time (s) : 19636.019178152084Working Time (m) : 327.26698630253475 \ No newline at end of file diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_tes.h5 b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_tes.h5 new file mode 100644 index 00000000..fbdc91c8 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_tes.h5 differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/fingerprint.pb b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/fingerprint.pb new file mode 100644 index 00000000..57fbb672 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/fingerprint.pb differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/keras_metadata.pb b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/keras_metadata.pb new file mode 100644 index 00000000..970106c8 --- /dev/null +++ b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/keras_metadata.pb @@ -0,0 +1,24 @@ + +èËroot"_tf_keras_network*ÅË{"name": "model", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Functional", "config": {"name": "model", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat0"}, "name": "input_cat0", "inbound_nodes": []}, {"class_name": "InputLayer", "config": {"batch_input_shape": 
{"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat1"}, "name": "input_cat1", "inbound_nodes": []}, {"class_name": "Embedding", "config": {"name": "embedding0", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 6, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding0", "inbound_nodes": [[["input_cat0", 0, 0, {}]]]}, {"class_name": "Embedding", "config": {"name": "embedding1", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 4, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding1", "inbound_nodes": [[["input_cat1", 0, 0, {}]]]}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cont"}, "name": "input_cont", "inbound_nodes": []}, {"class_name": "Concatenate", "config": {"name": "concatenate", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate", "inbound_nodes": [[["embedding0", 0, 0, {}], ["embedding1", 0, 0, {}]]]}, {"class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate_1", "inbound_nodes": [[["input_cont", 0, 0, {}], ["concatenate", 0, 0, {}]]]}, {"class_name": "QDense", "config": {"name": "q_dense", "trainable": true, "dtype": "float32", "units": 12, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 10}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 11}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 12}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 13}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 14}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense", "inbound_nodes": [[["concatenate_1", 0, 0, {}]]]}, 
{"class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "batch_normalization", "inbound_nodes": [[["q_dense", 0, 0, {}]]]}, {"class_name": "QActivation", "config": {"name": "q_activation", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation", "inbound_nodes": [[["batch_normalization", 0, 0, {}]]]}, {"class_name": "QDense", "config": {"name": "q_dense_1", "trainable": true, "dtype": "float32", "units": 36, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 23}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 24}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 25}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 26}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 27}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense_1", "inbound_nodes": [[["q_activation", 0, 0, {}]]]}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "batch_normalization_1", "inbound_nodes": [[["q_dense_1", 0, 0, {}]]]}, {"class_name": "QActivation", "config": {"name": "q_activation_1", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": 
{"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation_1", "inbound_nodes": [[["batch_normalization_1", 0, 0, {}]]]}, {"class_name": "QDense", "config": {"name": "met_weight", "trainable": true, "dtype": "float32", "units": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "VarianceScaling", "config": {"scale": 0.02, "mode": "fan_in", "distribution": "truncated_normal", "seed": null}, "__passive_serialization__": true, "shared_object_id": 35}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 36}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 37}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 38}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 39}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "met_weight", "inbound_nodes": [[["q_activation_1", 0, 0, {}]]]}, {"class_name": "BatchNormalization", "config": {"name": "met_weight_minus_one", "trainable": false, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": false, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "met_weight_minus_one", "inbound_nodes": [[["met_weight", 0, 0, {}]]]}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_pxpy"}, "name": "input_pxpy", "inbound_nodes": []}, {"class_name": "Multiply", "config": {"name": "multiply", "trainable": true, "dtype": "float32"}, "name": "multiply", "inbound_nodes": [[["met_weight_minus_one", 0, 0, {}], ["input_pxpy", 0, 0, {}]]]}, {"class_name": "GlobalAveragePooling1D", "config": {"name": "output", "trainable": true, "dtype": "float32", "data_format": "channels_last", "keepdims": false}, "name": "output", "inbound_nodes": [[["multiply", 0, 0, {}]]]}], "input_layers": [["input_cont", 0, 0], ["input_pxpy", 0, 0], ["input_cat0", 0, 0], ["input_cat1", 0, 0]], "output_layers": [["output", 0, 0]]}, "shared_object_id": 49, "input_spec": [{"class_name": "InputSpec", 
"config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}}, {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}}, {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100]}, "ndim": 2, "max_ndim": null, "min_ndim": null, "axes": {}}}, {"class_name": "InputSpec", "config": {"dtype": null, "shape": {"class_name": "__tuple__", "items": [null, 100]}, "ndim": 2, "max_ndim": null, "min_ndim": null, "axes": {}}}], "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 4]}, {"class_name": "TensorShape", "items": [null, 100, 2]}, {"class_name": "TensorShape", "items": [null, 100]}, {"class_name": "TensorShape", "items": [null, 100]}], "is_graph_network": true, "full_save_spec": {"class_name": "__tuple__", "items": [[[{"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 4]}, "float32", "input_cont"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 2]}, "float32", "input_pxpy"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat0"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat1"]}]], {}]}, "save_spec": [{"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 4]}, "float32", "input_cont"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100, 2]}, "float32", "input_pxpy"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat0"]}, {"class_name": "TypeSpec", "type_spec": "tf.TensorSpec", "serialized": [{"class_name": "TensorShape", "items": [null, 100]}, "float32", "input_cat1"]}], "keras_version": "2.11.0", "backend": "tensorflow", "model_config": {"class_name": "Functional", "config": {"name": "model", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat0"}, "name": "input_cat0", "inbound_nodes": [], "shared_object_id": 0}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat1"}, "name": "input_cat1", "inbound_nodes": [], "shared_object_id": 1}, {"class_name": "Embedding", "config": {"name": "embedding0", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 6, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 2}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding0", "inbound_nodes": [[["input_cat0", 0, 0, {}]]], "shared_object_id": 3}, {"class_name": "Embedding", "config": {"name": "embedding1", "trainable": true, 
"dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 4, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 4}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "name": "embedding1", "inbound_nodes": [[["input_cat1", 0, 0, {}]]], "shared_object_id": 5}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cont"}, "name": "input_cont", "inbound_nodes": [], "shared_object_id": 6}, {"class_name": "Concatenate", "config": {"name": "concatenate", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate", "inbound_nodes": [[["embedding0", 0, 0, {}], ["embedding1", 0, 0, {}]]], "shared_object_id": 7}, {"class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "dtype": "float32", "axis": -1}, "name": "concatenate_1", "inbound_nodes": [[["input_cont", 0, 0, {}], ["concatenate", 0, 0, {}]]], "shared_object_id": 8}, {"class_name": "QDense", "config": {"name": "q_dense", "trainable": true, "dtype": "float32", "units": 12, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 10}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 11}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 12}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 13}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 14}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense", "inbound_nodes": [[["concatenate_1", 0, 0, {}]]], "shared_object_id": 15}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 16}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 17}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 18}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 19}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, 
"gamma_constraint": null}, "name": "batch_normalization", "inbound_nodes": [[["q_dense", 0, 0, {}]]], "shared_object_id": 20}, {"class_name": "QActivation", "config": {"name": "q_activation", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation", "inbound_nodes": [[["batch_normalization", 0, 0, {}]]], "shared_object_id": 22}, {"class_name": "QDense", "config": {"name": "q_dense_1", "trainable": true, "dtype": "float32", "units": 36, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 23}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 24}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 25}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 26}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 27}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "q_dense_1", "inbound_nodes": [[["q_activation", 0, 0, {}]]], "shared_object_id": 28}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 29}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 30}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 31}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 32}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "batch_normalization_1", "inbound_nodes": [[["q_dense_1", 0, 0, {}]]], "shared_object_id": 33}, {"class_name": "QActivation", "config": {"name": "q_activation_1", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "name": "q_activation_1", "inbound_nodes": [[["batch_normalization_1", 0, 0, {}]]], "shared_object_id": 34}, {"class_name": 
"QDense", "config": {"name": "met_weight", "trainable": true, "dtype": "float32", "units": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "VarianceScaling", "config": {"scale": 0.02, "mode": "fan_in", "distribution": "truncated_normal", "seed": null}, "__passive_serialization__": true, "shared_object_id": 35}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 36}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 37}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 38}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 39}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "name": "met_weight", "inbound_nodes": [[["q_activation_1", 0, 0, {}]]], "shared_object_id": 40}, {"class_name": "BatchNormalization", "config": {"name": "met_weight_minus_one", "trainable": false, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": false, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 41}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 42}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 43}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 44}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "name": "met_weight_minus_one", "inbound_nodes": [[["met_weight", 0, 0, {}]]], "shared_object_id": 45}, {"class_name": "InputLayer", "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_pxpy"}, "name": "input_pxpy", "inbound_nodes": [], "shared_object_id": 46}, {"class_name": "Multiply", "config": {"name": "multiply", "trainable": true, "dtype": "float32"}, "name": "multiply", "inbound_nodes": [[["met_weight_minus_one", 0, 0, {}], ["input_pxpy", 0, 0, {}]]], "shared_object_id": 47}, {"class_name": "GlobalAveragePooling1D", "config": {"name": "output", "trainable": true, "dtype": "float32", "data_format": "channels_last", "keepdims": false}, "name": "output", "inbound_nodes": [[["multiply", 0, 0, {}]]], "shared_object_id": 48}], "input_layers": [["input_cont", 0, 0], ["input_pxpy", 0, 0], ["input_cat0", 0, 0], ["input_cat1", 0, 0]], "output_layers": [["output", 0, 0]]}}, "training_config": {"loss": "custom_loss", "metrics": [[{"class_name": "MeanMetricWrapper", "config": {"name": "mean_absolute_error", "dtype": "float32", "fn": 
"mean_absolute_error"}, "shared_object_id": 54}, {"class_name": "MeanMetricWrapper", "config": {"name": "mean_squared_error", "dtype": "float32", "fn": "mean_squared_error"}, "shared_object_id": 55}]], "weighted_metrics": null, "loss_weights": null, "optimizer_config": {"class_name": "Custom>Adam", "config": {"name": "Adam", "weight_decay": null, "clipnorm": 1.0, "global_clipnorm": null, "clipvalue": null, "use_ema": false, "ema_momentum": 0.99, "ema_overwrite_frequency": null, "jit_compile": false, "is_legacy_optimizer": false, "learning_rate": 0.0003000000142492354, "beta_1": 0.9, "beta_2": 0.999, "epsilon": 1e-07, "amsgrad": false}}}}2 +€ root.layer-0"_tf_keras_input_layer*Ð{"class_name": "InputLayer", "name": "input_cat0", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat0"}}2 +€ root.layer-1"_tf_keras_input_layer*Ð{"class_name": "InputLayer", "name": "input_cat1", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cat1"}}2 +¼root.layer_with_weights-0"_tf_keras_layer*…{"name": "embedding0", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Embedding", "config": {"name": "embedding0", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 6, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 2}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "inbound_nodes": [[["input_cat0", 0, 0, {}]]], "shared_object_id": 3, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100]}}2 +¼root.layer_with_weights-1"_tf_keras_layer*…{"name": "embedding1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": false, "class_name": "Embedding", "config": {"name": "embedding1", "trainable": true, "dtype": "float32", "batch_input_shape": {"class_name": "__tuple__", "items": [null, null]}, "input_dim": 4, "output_dim": 2, "embeddings_initializer": {"class_name": "RandomNormal", "config": {"mean": 0, "stddev": 0.2, "seed": null}, "shared_object_id": 4}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}, "inbound_nodes": [[["input_cat1", 0, 0, {}]]], "shared_object_id": 5, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100]}}2 +† root.layer-4"_tf_keras_input_layer*Ö{"class_name": "InputLayer", "name": "input_cont", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 4]}, "config": {"batch_input_shape": 
{"class_name": "__tuple__", "items": [null, 100, 4]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_cont"}}2 +ñ root.layer-5"_tf_keras_layer*Ç{"name": "concatenate", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Concatenate", "config": {"name": "concatenate", "trainable": true, "dtype": "float32", "axis": -1}, "inbound_nodes": [[["embedding0", 0, 0, {}], ["embedding1", 0, 0, {}]]], "shared_object_id": 7, "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 2]}, {"class_name": "TensorShape", "items": [null, 100, 2]}]}2 +ö root.layer-6"_tf_keras_layer*Ì{"name": "concatenate_1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "dtype": "float32", "axis": -1}, "inbound_nodes": [[["input_cont", 0, 0, {}], ["concatenate", 0, 0, {}]]], "shared_object_id": 8, "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 4]}, {"class_name": "TensorShape", "items": [null, 100, 4]}]}2 +žroot.layer_with_weights-2"_tf_keras_layer*ç{"name": "q_dense", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QDense", "config": {"name": "q_dense", "trainable": true, "dtype": "float32", "units": 12, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 10}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 11}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 12}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 13}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 14}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "inbound_nodes": [[["concatenate_1", 0, 0, {}]]], "shared_object_id": 15, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 8}}, "shared_object_id": 56}, "build_input_shape": {"class_name": "TensorShape", "items": 
[null, 100, 8]}}2 +÷  root.layer_with_weights-3"_tf_keras_layer*À {"name": "batch_normalization", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 16}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 17}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 18}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 19}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "inbound_nodes": [[["q_dense", 0, 0, {}]]], "shared_object_id": 20, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {"2": 12}}, "shared_object_id": 57}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 12]}}2 +© + root.layer-9"_tf_keras_layer*ÿ{"name": "q_activation", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QActivation", "config": {"name": "q_activation", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "inbound_nodes": [[["batch_normalization", 0, 0, {}]]], "shared_object_id": 22, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 12]}}2 +£ root.layer_with_weights-4"_tf_keras_layer*ì{"name": "q_dense_1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QDense", "config": {"name": "q_dense_1", "trainable": true, "dtype": "float32", "units": 36, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "LecunUniform", "config": {"seed": null}, "__passive_serialization__": true, "shared_object_id": 23}, "use_scale": true, "quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 24}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 25}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 26}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 27}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, 
"use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "inbound_nodes": [[["q_activation", 0, 0, {}]]], "shared_object_id": 28, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 12}}, "shared_object_id": 58}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 12]}}2 +ý  root.layer_with_weights-5"_tf_keras_layer*Æ {"name": "batch_normalization_1", "trainable": true, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.95, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 29}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 30}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 31}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 32}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "inbound_nodes": [[["q_dense_1", 0, 0, {}]]], "shared_object_id": 33, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {"2": 36}}, "shared_object_id": 59}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 36]}}2 +°  root.layer-12"_tf_keras_layer*…{"name": "q_activation_1", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QActivation", "config": {"name": "q_activation_1", "trainable": true, "dtype": "float32", "activation": {"class_name": "quantized_relu", "config": {"bits": 8, "integer": 2, "use_sigmoid": 0, "negative_slope": 0.0, "use_stochastic_rounding": false, "relu_upper_bound": null, "qnoise_factor": 1.0}, "__passive_serialization__": true, "shared_object_id": 21}}, "inbound_nodes": [[["batch_normalization_1", 0, 0, {}]]], "shared_object_id": 34, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 36]}}2 +îroot.layer_with_weights-6"_tf_keras_layer*·{"name": "met_weight", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "QDense", "config": {"name": "met_weight", "trainable": true, "dtype": "float32", "units": 1, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "QInitializer", "config": {"initializer": {"class_name": "VarianceScaling", "config": {"scale": 0.02, "mode": "fan_in", "distribution": "truncated_normal", "seed": null}, "__passive_serialization__": true, "shared_object_id": 35}, "use_scale": true, "quantizer": 
{"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}}, "shared_object_id": 36}, "bias_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 37}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 38}, "bias_constraint": {"class_name": "Clip", "config": {"min_value": -4.0, "max_value": 4.0}, "shared_object_id": 39}, "kernel_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "bias_quantizer": {"class_name": "quantized_bits", "config": {"bits": 8, "integer": 2, "symmetric": 0, "alpha": 1, "keep_negative": true, "use_stochastic_rounding": false, "qnoise_factor": 1.0}, "shared_object_id": 9, "__passive_serialization__": true}, "kernel_range": null, "bias_range": null}, "inbound_nodes": [[["q_activation_1", 0, 0, {}]]], "shared_object_id": 40, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": null, "max_ndim": null, "min_ndim": 2, "axes": {"-1": 36}}, "shared_object_id": 60}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 36]}}2 +ü root.layer_with_weights-7"_tf_keras_layer*Å {"name": "met_weight_minus_one", "trainable": false, "expects_training_arg": true, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "BatchNormalization", "config": {"name": "met_weight_minus_one", "trainable": false, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": false, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 41}, "gamma_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 42}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}, "shared_object_id": 43}, "moving_variance_initializer": {"class_name": "Ones", "config": {}, "shared_object_id": 44}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}, "inbound_nodes": [[["met_weight", 0, 0, {}]]], "shared_object_id": 45, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {"2": 1}}, "shared_object_id": 61}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 1]}}2 +‡ root.layer-15"_tf_keras_input_layer*Ö{"class_name": "InputLayer", "name": "input_pxpy", "dtype": "float32", "sparse": false, "ragged": false, "batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "config": {"batch_input_shape": {"class_name": "__tuple__", "items": [null, 100, 2]}, "dtype": "float32", "sparse": false, "ragged": false, "name": "input_pxpy"}}2 +è root.layer-16"_tf_keras_layer*½{"name": "multiply", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "Multiply", "config": {"name": "multiply", "trainable": true, "dtype": 
"float32"}, "inbound_nodes": [[["met_weight_minus_one", 0, 0, {}], ["input_pxpy", 0, 0, {}]]], "shared_object_id": 47, "build_input_shape": [{"class_name": "TensorShape", "items": [null, 100, 1]}, {"class_name": "TensorShape", "items": [null, 100, 2]}]}2 +í root.layer-17"_tf_keras_layer*Â{"name": "output", "trainable": true, "expects_training_arg": false, "dtype": "float32", "batch_input_shape": null, "stateful": false, "must_restore_from_config": false, "preserve_input_structure_in_config": false, "autocast": true, "class_name": "GlobalAveragePooling1D", "config": {"name": "output", "trainable": true, "dtype": "float32", "data_format": "channels_last", "keepdims": false}, "inbound_nodes": [[["multiply", 0, 0, {}]]], "shared_object_id": 48, "input_spec": {"class_name": "InputSpec", "config": {"dtype": null, "shape": null, "ndim": 3, "max_ndim": null, "min_ndim": null, "axes": {}}, "shared_object_id": 62}, "build_input_shape": {"class_name": "TensorShape", "items": [null, 100, 2]}}2 +º‰root.keras_api.metrics.0"_tf_keras_metric*‚{"class_name": "Mean", "name": "loss", "dtype": "float32", "config": {"name": "loss", "dtype": "float32"}, "shared_object_id": 63}2 +‚Šroot.keras_api.metrics.1"_tf_keras_metric*Ê{"class_name": "MeanMetricWrapper", "name": "mean_absolute_error", "dtype": "float32", "config": {"name": "mean_absolute_error", "dtype": "float32", "fn": "mean_absolute_error"}, "shared_object_id": 54}2 +ÿ‹root.keras_api.metrics.2"_tf_keras_metric*Ç{"class_name": "MeanMetricWrapper", "name": "mean_squared_error", "dtype": "float32", "config": {"name": "mean_squared_error", "dtype": "float32", "fn": "mean_squared_error"}, "shared_object_id": 55}2 \ No newline at end of file diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/saved_model.pb b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/saved_model.pb new file mode 100644 index 00000000..d3b371bd Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/saved_model.pb differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.data-00000-of-00001 b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000..6b749f73 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.data-00000-of-00001 differ diff --git a/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.index b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.index new file mode 100644 index 00000000..5077d741 Binary files /dev/null and b/models/saved_keras_models/quantized_dense_model_100pf_500epochs_1_test/variables/variables.index differ diff --git a/train.py b/train.py index 9b1d2ecd..01dd340c 100644 --- a/train.py +++ b/train.py @@ -11,7 +11,7 @@ import matplotlib.pyplot as plt import argparse import math -#import setGPU +# import setGPU import time import os import pathlib diff --git a/utils.py b/utils.py index 6fc43146..a0575396 100644 --- a/utils.py +++ b/utils.py @@ -101,7 +101,7 @@ def MakePlots(trueXY, mlXY, puppiXY, path_out): # width of a distribution at 1 standard deviation def resolqt(y): - return(np.percentile(y, 84)-np.percentile(y, 16))/2.0 + return (np.percentile(y, 84)-np.percentile(y, 16))/2.0 # response correction factors # the events are split into 20 bins based on true_pt and get 
assigned the corresponding `truth_means/ml_means` of all events in that bin
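For context on this hunk, a self-contained sketch of resolqt and of the per-bin response correction the comment describes. The binning here (20 quantile bins of true_pt) is a hypothetical reading of the comment; the repository's own implementation may use equal-width bins:

import numpy as np

def resolqt(y):
    # width of a distribution at 1 standard deviation (half the 16th-84th percentile spread)
    return (np.percentile(y, 84) - np.percentile(y, 16)) / 2.0

def response_correction(true_pt, ml_pt, nbins=20):
    # split events into `nbins` bins of true_pt and rescale each prediction by the
    # truth_means/ml_means ratio of its bin (hypothetical helper, not from utils.py)
    edges = np.quantile(true_pt, np.linspace(0.0, 1.0, nbins + 1))
    idx = np.clip(np.digitize(true_pt, edges) - 1, 0, nbins - 1)
    truth_means = np.array([true_pt[idx == i].mean() for i in range(nbins)])
    ml_means = np.array([ml_pt[idx == i].mean() for i in range(nbins)])
    return ml_pt * (truth_means / ml_means)[idx]

# toy usage: a biased, smeared "prediction" gets its per-bin response corrected
rng = np.random.default_rng(0)
true_pt = rng.uniform(20, 500, size=10_000)
ml_pt = 0.9 * true_pt + rng.normal(0, 15, size=10_000)
corrected = response_correction(true_pt, ml_pt)
print(resolqt(ml_pt - true_pt), resolqt(corrected - true_pt))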